Diffstat (limited to 'include/system')
67 files changed, 10350 insertions, 0 deletions
diff --git a/include/system/accel-blocker.h b/include/system/accel-blocker.h new file mode 100644 index 0000000..e10099d --- /dev/null +++ b/include/system/accel-blocker.h @@ -0,0 +1,55 @@ +/* + * Accelerator blocking API, to prevent new ioctls from starting and to wait + * for the running ones to finish. + * This mechanism differs from pause/resume_all_vcpus() in that it does not + * release the BQL. + * + * Copyright (c) 2022 Red Hat Inc. + * + * Author: Emanuele Giuseppe Esposito <eesposit@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef ACCEL_BLOCKER_H +#define ACCEL_BLOCKER_H + +#include "system/cpus.h" + +void accel_blocker_init(void); + +/* + * accel_{cpu_}ioctl_begin/end: + * Mark when an ioctl is about to run or has just finished. + * + * accel_{cpu_}ioctl_begin will block after accel_ioctl_inhibit_begin() is + * called, preventing new ioctls from running. They will continue only after + * accel_ioctl_inhibit_end(). + */ +void accel_ioctl_begin(void); +void accel_ioctl_end(void); +void accel_cpu_ioctl_begin(CPUState *cpu); +void accel_cpu_ioctl_end(CPUState *cpu); + +/* + * accel_ioctl_inhibit_begin: start critical section + * + * This function makes sure that: + * 1) incoming accel_{cpu_}ioctl_begin() calls block + * 2) all ioctls that were already running have reached + * accel_{cpu_}ioctl_end(), kicking vcpus if necessary. + * + * This allows the caller to access shared data or perform operations without + * worrying about concurrent vcpu accesses. + */ +void accel_ioctl_inhibit_begin(void); + +/* + * accel_ioctl_inhibit_end: end critical section started by + * accel_ioctl_inhibit_begin() + * + * This function allows blocked accel_{cpu_}ioctl_begin() to continue. + */ +void accel_ioctl_inhibit_end(void); + +#endif /* ACCEL_BLOCKER_H */ diff --git a/include/system/accel-ops.h b/include/system/accel-ops.h new file mode 100644 index 0000000..4c99d25 --- /dev/null +++ b/include/system/accel-ops.h @@ -0,0 +1,73 @@ +/* + * Accelerator OPS, used for cpus.c module + * + * Copyright 2021 SUSE LLC + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef ACCEL_OPS_H +#define ACCEL_OPS_H + +#include "exec/vaddr.h" +#include "qom/object.h" + +#define ACCEL_OPS_SUFFIX "-ops" +#define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX +#define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS) + +DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS) + +/** + * struct AccelOpsClass - accelerator interfaces + * + * This structure is used to abstract accelerator differences from the + * core CPU code. Not all have to be implemented.
+ */ +struct AccelOpsClass { + /*< private >*/ + ObjectClass parent_class; + /*< public >*/ + + /* initialization function called when accel is chosen */ + void (*ops_init)(AccelOpsClass *ops); + + bool (*cpus_are_resettable)(void); + void (*cpu_reset_hold)(CPUState *cpu); + + void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */ + void (*kick_vcpu_thread)(CPUState *cpu); + bool (*cpu_thread_is_idle)(CPUState *cpu); + + void (*synchronize_post_reset)(CPUState *cpu); + void (*synchronize_post_init)(CPUState *cpu); + void (*synchronize_state)(CPUState *cpu); + void (*synchronize_pre_loadvm)(CPUState *cpu); + void (*synchronize_pre_resume)(bool step_pending); + + void (*handle_interrupt)(CPUState *cpu, int mask); + + /** + * @get_virtual_clock: fetch virtual clock + * @set_virtual_clock: set virtual clock + * + * These allow the timer subsystem to defer to the accelerator to + * fetch time. The set function is needed if the accelerator wants + * to track the changes to time as the timer is warped through + * various timer events. + */ + int64_t (*get_virtual_clock)(void); + void (*set_virtual_clock)(int64_t time); + + int64_t (*get_elapsed_ticks)(void); + + /* gdbstub hooks */ + bool (*supports_guest_debug)(void); + int (*update_guest_debug)(CPUState *cpu); + int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); + int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); + void (*remove_all_breakpoints)(CPUState *cpu); +}; + +#endif /* ACCEL_OPS_H */ diff --git a/include/system/address-spaces.h b/include/system/address-spaces.h new file mode 100644 index 0000000..72d17af --- /dev/null +++ b/include/system/address-spaces.h @@ -0,0 +1,35 @@ +/* + * Internal memory management interfaces + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Avi Kivity <avi@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef SYSTEM_ADDRESS_SPACES_H +#define SYSTEM_ADDRESS_SPACES_H + +/* + * Internal interfaces between memory.c/exec.c/vl.c. Do not #include unless + * you're one of them. + */ + +/* Get the root memory region. This interface should only be used temporarily + * until a proper bus interface is available. + */ +MemoryRegion *get_system_memory(void); + +/* Get the root I/O port region. This interface should only be used + * temporarily until a proper bus interface is available. 
+ */ +MemoryRegion *get_system_io(void); + +extern AddressSpace address_space_memory; +extern AddressSpace address_space_io; + +#endif diff --git a/include/system/arch_init.h b/include/system/arch_init.h new file mode 100644 index 0000000..51e24c3 --- /dev/null +++ b/include/system/arch_init.h @@ -0,0 +1,30 @@ +#ifndef QEMU_ARCH_INIT_H +#define QEMU_ARCH_INIT_H + + +enum { + QEMU_ARCH_ALL = -1, + QEMU_ARCH_ALPHA = (1 << 0), + QEMU_ARCH_ARM = (1 << 1), + QEMU_ARCH_I386 = (1 << 3), + QEMU_ARCH_M68K = (1 << 4), + QEMU_ARCH_MICROBLAZE = (1 << 6), + QEMU_ARCH_MIPS = (1 << 7), + QEMU_ARCH_PPC = (1 << 8), + QEMU_ARCH_S390X = (1 << 9), + QEMU_ARCH_SH4 = (1 << 10), + QEMU_ARCH_SPARC = (1 << 11), + QEMU_ARCH_XTENSA = (1 << 12), + QEMU_ARCH_OPENRISC = (1 << 13), + QEMU_ARCH_TRICORE = (1 << 16), + QEMU_ARCH_HPPA = (1 << 18), + QEMU_ARCH_RISCV = (1 << 19), + QEMU_ARCH_RX = (1 << 20), + QEMU_ARCH_AVR = (1 << 21), + QEMU_ARCH_HEXAGON = (1 << 22), + QEMU_ARCH_LOONGARCH = (1 << 23), +}; + +bool qemu_arch_available(unsigned qemu_arch_mask); + +#endif diff --git a/include/system/balloon.h b/include/system/balloon.h new file mode 100644 index 0000000..867687b --- /dev/null +++ b/include/system/balloon.h @@ -0,0 +1,27 @@ +/* + * Balloon + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_BALLOON_H +#define QEMU_BALLOON_H + +#include "exec/cpu-common.h" +#include "qapi/qapi-types-machine.h" + +typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target); +typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info); + +int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, + QEMUBalloonStatus *stat_func, void *opaque); +void qemu_remove_balloon_handler(void *opaque); + +#endif diff --git a/include/system/block-backend-common.h b/include/system/block-backend-common.h new file mode 100644 index 0000000..780cea7 --- /dev/null +++ b/include/system/block-backend-common.h @@ -0,0 +1,103 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_COMMON_H +#define BLOCK_BACKEND_COMMON_H + +#include "qemu/iov.h" +#include "block/throttle-groups.h" + +/* + * TODO Have to include block/block.h for a bunch of block layer + * types. Unfortunately, this pulls in the whole BlockDriverState + * API, which we don't want used by many BlockBackend users. Some of + * the types belong here, and the rest should be split into a common + * header and one for the BlockDriverState API. + */ +#include "block/block.h" + +/* Callbacks for block device models */ +typedef struct BlockDevOps { + + /* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + + /* + * Runs when virtual media changed (monitor commands eject, change) + * Argument load is true on load and false on eject. + * Beware: doesn't run when a host device's physical media + * changes. Sure would be useful if it did. + * Device models with removable media must implement this callback. 
+ */ + void (*change_media_cb)(void *opaque, bool load, Error **errp); + /* + * Runs when an eject request is issued from the monitor, the tray + * is closed, and the medium is locked. + * Device models that do not implement is_medium_locked will not need + * this callback. Device models that can lock the medium or tray might + * want to implement the callback and unlock the tray when "force" is + * true, even if they do not support eject requests. + */ + void (*eject_request_cb)(void *opaque, bool force); + + /* + * Is the virtual medium locked into the device? + * Device models implement this only when device has such a lock. + */ + bool (*is_medium_locked)(void *opaque); + + /* + * Runs when the backend receives a drain request. + */ + void (*drained_begin)(void *opaque); + /* + * Runs when the backend's last drain request ends. + */ + void (*drained_end)(void *opaque); + /* + * Is the device still busy? + */ + bool (*drained_poll)(void *opaque); + + /* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + + /* + * Is the virtual tray open? + * Device models implement this only when the device has a tray. + */ + bool (*is_tray_open)(void *opaque); + + /* + * Runs when the size changed (e.g. monitor command block_resize) + */ + void (*resize_cb)(void *opaque); +} BlockDevOps; + +/* + * This struct is embedded in (the private) BlockBackend struct and contains + * fields that must be public. This is in particular for QLIST_ENTRY() and + * friends so that BlockBackends can be kept in lists outside block-backend.c + */ +typedef struct BlockBackendPublic { + ThrottleGroupMember throttle_group_member; +} BlockBackendPublic; + +#endif /* BLOCK_BACKEND_COMMON_H */ diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h new file mode 100644 index 0000000..35b5e83 --- /dev/null +++ b/include/system/block-backend-global-state.h @@ -0,0 +1,124 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_GLOBAL_STATE_H +#define BLOCK_BACKEND_GLOBAL_STATE_H + +#include "block-backend-common.h" + +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. 
+ */ + +BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm); + +BlockBackend * no_coroutine_fn +blk_new_with_bs(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm, + Error **errp); + +BlockBackend * coroutine_fn no_co_wrapper +blk_co_new_with_bs(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm, + Error **errp); + +BlockBackend * no_coroutine_fn +blk_new_open(const char *filename, const char *reference, QDict *options, + int flags, Error **errp); + +BlockBackend * coroutine_fn no_co_wrapper +blk_co_new_open(const char *filename, const char *reference, QDict *options, + int flags, Error **errp); + +int blk_get_refcnt(BlockBackend *blk); +void blk_ref(BlockBackend *blk); + +void no_coroutine_fn blk_unref(BlockBackend *blk); +void coroutine_fn no_co_wrapper blk_co_unref(BlockBackend *blk); + +void blk_remove_all_bs(void); +BlockBackend *blk_by_name(const char *name); +BlockBackend *blk_next(BlockBackend *blk); +BlockBackend *blk_all_next(BlockBackend *blk); +bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp); +void monitor_remove_blk(BlockBackend *blk); + +BlockBackendPublic *blk_get_public(BlockBackend *blk); + +void blk_remove_bs(BlockBackend *blk); +int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp); +int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp); +bool GRAPH_RDLOCK bdrv_has_blk(BlockDriverState *bs); +bool GRAPH_RDLOCK bdrv_is_root_node(BlockDriverState *bs); +int GRAPH_UNLOCKED blk_set_perm(BlockBackend *blk, uint64_t perm, + uint64_t shared_perm, Error **errp); +void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm); + +void blk_iostatus_enable(BlockBackend *blk); +BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk); +void blk_iostatus_reset(BlockBackend *blk); +int blk_attach_dev(BlockBackend *blk, DeviceState *dev); +void blk_detach_dev(BlockBackend *blk, DeviceState *dev); +DeviceState *blk_get_attached_dev(BlockBackend *blk); +BlockBackend *blk_by_dev(void *dev); +BlockBackend *blk_by_qdev_id(const char *id, Error **errp); +void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque); + +int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); +void blk_aio_cancel(BlockAIOCB *acb); +int blk_commit_all(void); +bool blk_in_drain(BlockBackend *blk); +void blk_drain(BlockBackend *blk); +void blk_drain_all(void); +void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, + BlockdevOnError on_write_error); +bool blk_supports_write_perm(BlockBackend *blk); +bool blk_is_sg(BlockBackend *blk); +void blk_set_enable_write_cache(BlockBackend *blk, bool wce); +int blk_get_flags(BlockBackend *blk); +int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, + Error **errp); +void blk_add_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *new_context, void *opaque), + void (*detach_aio_context)(void *opaque), void *opaque); +void blk_remove_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *, + void *), + void (*detach_aio_context)(void *), + void *opaque); +void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify); +BlockBackendRootState *blk_get_root_state(BlockBackend *blk); +void blk_update_root_state(BlockBackend *blk); +bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk); +int blk_get_open_flags_from_root_state(BlockBackend *blk); + +int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size); +int 
blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size); +int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz); +int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo); + +void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg); +void blk_io_limits_disable(BlockBackend *blk); +void blk_io_limits_enable(BlockBackend *blk, const char *group); +void blk_io_limits_update_group(BlockBackend *blk, const char *group); +void blk_set_force_allow_inactivate(BlockBackend *blk); + +bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp); +void blk_unregister_buf(BlockBackend *blk, void *host, size_t size); + +const BdrvChild *blk_root(BlockBackend *blk); + +int blk_make_empty(BlockBackend *blk, Error **errp); + +#endif /* BLOCK_BACKEND_GLOBAL_STATE_H */ diff --git a/include/system/block-backend-io.h b/include/system/block-backend-io.h new file mode 100644 index 0000000..ba8dfcc --- /dev/null +++ b/include/system/block-backend-io.h @@ -0,0 +1,237 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_IO_H +#define BLOCK_BACKEND_IO_H + +#include "block-backend-common.h" +#include "block/accounting.h" + +/* + * I/O API functions. These functions are thread-safe. + * + * See include/block/block-io.h for more information about + * the I/O API. + */ + +const char *blk_name(const BlockBackend *blk); + +BlockDriverState *blk_bs(BlockBackend *blk); + +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); +void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow); +void blk_set_disable_request_queuing(BlockBackend *blk, bool disable); +bool blk_iostatus_is_enabled(const BlockBackend *blk); + +/* + * Return the qdev ID, or if no ID is assigned the QOM path, + * of the block device attached to the BlockBackend. + * + * The caller is responsible for releasing the value returned + * with g_free() after use. 
+ */ +char *blk_get_attached_dev_id(BlockBackend *blk); + +BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); + +BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_flush(BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, + BlockCompletionFunc *cb, void *opaque); +void blk_aio_cancel_async(BlockAIOCB *acb); +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque); + +void blk_inc_in_flight(BlockBackend *blk); +void blk_dec_in_flight(BlockBackend *blk); + +bool coroutine_fn GRAPH_RDLOCK blk_co_is_inserted(BlockBackend *blk); +bool co_wrapper_mixed_bdrv_rdlock blk_is_inserted(BlockBackend *blk); + +bool coroutine_fn GRAPH_RDLOCK blk_co_is_available(BlockBackend *blk); +bool co_wrapper_mixed_bdrv_rdlock blk_is_available(BlockBackend *blk); + +void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked); +void co_wrapper blk_lock_medium(BlockBackend *blk, bool locked); + +void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag); +void co_wrapper blk_eject(BlockBackend *blk, bool eject_flag); + +int64_t coroutine_fn blk_co_getlength(BlockBackend *blk); +int64_t co_wrapper_mixed blk_getlength(BlockBackend *blk); + +void coroutine_fn blk_co_get_geometry(BlockBackend *blk, + uint64_t *nb_sectors_ptr); +void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr); + +int64_t coroutine_fn blk_co_nb_sectors(BlockBackend *blk); +int64_t blk_nb_sectors(BlockBackend *blk); + +void *blk_try_blockalign(BlockBackend *blk, size_t size); +void *blk_blockalign(BlockBackend *blk, size_t size); +bool blk_is_writable(BlockBackend *blk); +bool blk_enable_write_cache(BlockBackend *blk); +BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read); +BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, + int error); +void blk_error_action(BlockBackend *blk, BlockErrorAction action, + bool is_read, int error); +void blk_iostatus_set_err(BlockBackend *blk, int error); +int blk_get_max_iov(BlockBackend *blk); +int blk_get_max_hw_iov(BlockBackend *blk); + +AioContext *blk_get_aio_context(BlockBackend *blk); +BlockAcctStats *blk_get_stats(BlockBackend *blk); +void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, + BlockCompletionFunc *cb, + void *opaque, int ret); + +uint32_t blk_get_request_alignment(BlockBackend *blk); +uint32_t blk_get_max_transfer(BlockBackend *blk); +uint64_t blk_get_max_hw_transfer(BlockBackend *blk); + +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, 
+ BlockBackend *blk_out, int64_t off_out, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + +int coroutine_fn blk_co_block_status_above(BlockBackend *blk, + BlockDriverState *base, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file); +int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk, + BlockDriverState *base, + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum); + +/* + * "I/O or GS" API functions. These functions can run without + * the BQL, but only in one specific iothread/main loop. + * + * See include/block/block-io.h for more information about + * the "I/O or GS" API. + */ + +int co_wrapper_mixed blk_pread(BlockBackend *blk, int64_t offset, + int64_t bytes, void *buf, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes, + void *buf, BdrvRequestFlags flags); + +int co_wrapper_mixed blk_preadv(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); + +int co_wrapper_mixed blk_preadv_part(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset, + BdrvRequestFlags flags); +int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset, BdrvRequestFlags flags); + +int co_wrapper_mixed blk_pwrite(BlockBackend *blk, int64_t offset, + int64_t bytes, const void *buf, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes, + const void *buf, BdrvRequestFlags flags); + +int co_wrapper_mixed blk_pwritev(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); + +int co_wrapper_mixed blk_pwritev_part(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, + int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags); + +int co_wrapper_mixed blk_pwrite_compressed(BlockBackend *blk, + int64_t offset, int64_t bytes, + const void *buf); +int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset, + int64_t bytes, const void *buf); + +int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); + +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); +int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); + +int 
co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, + int64_t bytes); +int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, + int64_t bytes); + +int co_wrapper_mixed blk_flush(BlockBackend *blk); +int coroutine_fn blk_co_flush(BlockBackend *blk); + +int co_wrapper_mixed blk_ioctl(BlockBackend *blk, unsigned long int req, + void *buf); +int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req, + void *buf); + +int co_wrapper_mixed blk_truncate(BlockBackend *blk, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp); +int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, + Error **errp); + +#endif /* BLOCK_BACKEND_IO_H */ diff --git a/include/system/block-backend.h b/include/system/block-backend.h new file mode 100644 index 0000000..038be9f --- /dev/null +++ b/include/system/block-backend.h @@ -0,0 +1,21 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014-2016 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#ifndef BLOCK_BACKEND_H +#define BLOCK_BACKEND_H + +#include "block-backend-global-state.h" +#include "block-backend-io.h" + +/* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */ + +#endif diff --git a/include/system/block-ram-registrar.h b/include/system/block-ram-registrar.h new file mode 100644 index 0000000..d8b2f79 --- /dev/null +++ b/include/system/block-ram-registrar.h @@ -0,0 +1,37 @@ +/* + * BlockBackend RAM Registrar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef BLOCK_RAM_REGISTRAR_H +#define BLOCK_RAM_REGISTRAR_H + +#include "exec/ramlist.h" + +/** + * struct BlockRAMRegistrar: + * + * Keeps RAMBlock memory registered with a BlockBackend using + * blk_register_buf() including hotplugged memory. + * + * Emulated devices or other BlockBackend users initialize a BlockRAMRegistrar + * with blk_ram_registrar_init() before submitting I/O requests with the + * BDRV_REQ_REGISTERED_BUF flag set. + */ +typedef struct { + BlockBackend *blk; + RAMBlockNotifier notifier; + bool ok; +} BlockRAMRegistrar; + +void blk_ram_registrar_init(BlockRAMRegistrar *r, BlockBackend *blk); +void blk_ram_registrar_destroy(BlockRAMRegistrar *r); + +/* Have all RAMBlocks been registered successfully? */ +static inline bool blk_ram_registrar_ok(BlockRAMRegistrar *r) +{ + return r->ok; +} + +#endif /* BLOCK_RAM_REGISTRAR_H */ diff --git a/include/system/blockdev.h b/include/system/blockdev.h new file mode 100644 index 0000000..3211b16 --- /dev/null +++ b/include/system/blockdev.h @@ -0,0 +1,64 @@ +/* + * QEMU host block devices + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#ifndef BLOCKDEV_H +#define BLOCKDEV_H + +#include "block/block.h" +#include "qemu/queue.h" + +typedef enum { + IF_DEFAULT = -1, /* for use with drive_add() only */ + /* + * IF_NONE must be zero, because we want MachineClass member + * block_default_type to default-initialize to IF_NONE + */ + IF_NONE = 0, + IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO, IF_XEN, + IF_COUNT +} BlockInterfaceType; + +struct DriveInfo { + BlockInterfaceType type; + int bus; + int unit; + int auto_del; /* see blockdev_mark_auto_del() */ + bool is_default; /* Added by default_drive() ? */ + int media_cd; + QemuOpts *opts; + QTAILQ_ENTRY(DriveInfo) next; +}; + +/* + * Global state (GS) API. These functions run under the BQL. + * + * See include/block/block-global-state.h for more information about + * the GS API. + */ + +void blockdev_mark_auto_del(BlockBackend *blk); +void blockdev_auto_del(BlockBackend *blk); + +DriveInfo *blk_legacy_dinfo(BlockBackend *blk); +DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo); +BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo); + +void override_max_devs(BlockInterfaceType type, int max_devs); + +DriveInfo *drive_get(BlockInterfaceType type, int bus, int unit); +void drive_check_orphaned(void); +DriveInfo *drive_get_by_index(BlockInterfaceType type, int index); +int drive_get_max_bus(BlockInterfaceType type); + +QemuOpts *drive_add(BlockInterfaceType type, int index, const char *file, + const char *optstr); +DriveInfo *drive_new(QemuOpts *arg, BlockInterfaceType block_default_type, + Error **errp); + +#endif diff --git a/include/system/confidential-guest-support.h b/include/system/confidential-guest-support.h new file mode 100644 index 0000000..ea46b50 --- /dev/null +++ b/include/system/confidential-guest-support.h @@ -0,0 +1,95 @@ +/* + * QEMU Confidential Guest support + * This interface describes the common pieces between various + * schemes for protecting guest memory or other state against a + * compromised hypervisor. This includes memory encryption (AMD's + * SEV and Intel's MKTME) or special protection modes (PEF on POWER, + * or PV on s390x). + * + * Copyright Red Hat. + * + * Authors: + * David Gibson <david@gibson.dropbear.id.au> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ +#ifndef QEMU_CONFIDENTIAL_GUEST_SUPPORT_H +#define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H + +#include "qom/object.h" + +#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support" +OBJECT_DECLARE_TYPE(ConfidentialGuestSupport, + ConfidentialGuestSupportClass, + CONFIDENTIAL_GUEST_SUPPORT) + + +struct ConfidentialGuestSupport { + Object parent; + + /* + * True if the machine should use guest_memfd for RAM. + */ + bool require_guest_memfd; + + /* + * ready: flag set by CGS initialization code once it's ready to + * start executing instructions in a potentially-secure + * guest + * + * The definition here is a bit fuzzy, because this is essentially + * part of a self-sanity-check, rather than a strict mechanism. + * + * It's not feasible to have a single point in the common machine + * init path to configure confidential guest support, because + * different mechanisms have different interdependencies requiring + * initialization in different places, often in arch or machine + * type specific code. It's also usually not possible to check + * for invalid configurations until that initialization code. 
+ * That means it would be very easy to have a bug allowing CGS + * init to be bypassed entirely in certain configurations. + * + * Silently ignoring a requested security feature would be bad, so + * to avoid that we check late in init that this 'ready' flag is + * set if CGS was requested. If the CGS init hasn't happened, and + * so 'ready' is not set, we'll abort. + */ + bool ready; +}; + +typedef struct ConfidentialGuestSupportClass { + ObjectClass parent; + + int (*kvm_init)(ConfidentialGuestSupport *cgs, Error **errp); + int (*kvm_reset)(ConfidentialGuestSupport *cgs, Error **errp); +} ConfidentialGuestSupportClass; + +static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs, + Error **errp) +{ + ConfidentialGuestSupportClass *klass; + + klass = CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs); + if (klass->kvm_init) { + return klass->kvm_init(cgs, errp); + } + + return 0; +} + +static inline int confidential_guest_kvm_reset(ConfidentialGuestSupport *cgs, + Error **errp) +{ + ConfidentialGuestSupportClass *klass; + + klass = CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs); + if (klass->kvm_reset) { + return klass->kvm_reset(cgs, errp); + } + + return 0; +} + +#endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */ diff --git a/include/system/cpu-throttle.h b/include/system/cpu-throttle.h new file mode 100644 index 0000000..44bf6a5 --- /dev/null +++ b/include/system/cpu-throttle.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2012 SUSE LINUX Products GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see + * <http://www.gnu.org/licenses/gpl-2.0.html> + */ + +#ifndef SYSTEM_CPU_THROTTLE_H +#define SYSTEM_CPU_THROTTLE_H + +#include "qemu/timer.h" + +/** + * cpu_throttle_init: + * + * Initialize the CPU throttling API. + */ +void cpu_throttle_init(void); + +/** + * cpu_throttle_set: + * @new_throttle_pct: Percent of sleep time. Valid range is 1 to 99. + * + * Throttles all vcpus by forcing them to sleep for the given percentage of + * time. A throttle_percentage of 25 corresponds to a 75% duty cycle roughly. + * (example: 10ms sleep for every 30ms awake). + * + * cpu_throttle_set can be called as needed to adjust new_throttle_pct. + * Once the throttling starts, it will remain in effect until cpu_throttle_stop + * is called. + */ +void cpu_throttle_set(int new_throttle_pct); + +/** + * cpu_throttle_stop: + * + * Stops the vcpu throttling started by cpu_throttle_set. + */ +void cpu_throttle_stop(void); + +/** + * cpu_throttle_active: + * + * Returns: %true if the vcpus are currently being throttled, %false otherwise. + */ +bool cpu_throttle_active(void); + +/** + * cpu_throttle_get_percentage: + * + * Returns the vcpu throttle percentage. See cpu_throttle_set for details. + * + * Returns: The throttle percentage in range 1 to 99. + */ +int cpu_throttle_get_percentage(void); + +/** + * cpu_throttle_dirty_sync_timer_tick: + * + * Dirty sync timer hook. 
+ */ +void cpu_throttle_dirty_sync_timer_tick(void *opaque); + +/** + * cpu_throttle_dirty_sync_timer: + * + * Start or stop the dirty sync timer. + */ +void cpu_throttle_dirty_sync_timer(bool enable); + +#endif /* SYSTEM_CPU_THROTTLE_H */ diff --git a/include/system/cpu-timers-internal.h b/include/system/cpu-timers-internal.h new file mode 100644 index 0000000..94bb739 --- /dev/null +++ b/include/system/cpu-timers-internal.h @@ -0,0 +1,71 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef TIMERS_STATE_H +#define TIMERS_STATE_H + +/* timers state, for sharing between icount and cpu-timers */ + +typedef struct TimersState { + /* Protected by BQL. */ + int64_t cpu_ticks_prev; + int64_t cpu_ticks_offset; + + /* + * Protect fields that can be respectively read outside the + * BQL, and written from multiple threads. + */ + QemuSeqLock vm_clock_seqlock; + QemuSpin vm_clock_lock; + + int16_t cpu_ticks_enabled; + + /* Conversion factor from emulated instructions to virtual clock ticks. */ + int16_t icount_time_shift; + /* Icount delta used for shift auto adjust. */ + int64_t last_delta; + + /* Compensate for varying guest execution speed. */ + aligned_int64_t qemu_icount_bias; + + int64_t vm_clock_warp_start; + int64_t cpu_clock_offset; + + /* Only written by TCG thread */ + int64_t qemu_icount; + + /* for adjusting icount */ + QEMUTimer *icount_rt_timer; + QEMUTimer *icount_vm_timer; + QEMUTimer *icount_warp_timer; +} TimersState; + +extern TimersState timers_state; + +/* + * icount needs this internal from cpu-timers when adjusting the icount shift. + */ +int64_t cpu_get_clock_locked(void); + +#endif /* TIMERS_STATE_H */ diff --git a/include/system/cpu-timers.h b/include/system/cpu-timers.h new file mode 100644 index 0000000..a1abed0 --- /dev/null +++ b/include/system/cpu-timers.h @@ -0,0 +1,46 @@ +/* + * CPU timers state API + * + * Copyright 2020 SUSE LLC + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ +#ifndef SYSTEM_CPU_TIMERS_H +#define SYSTEM_CPU_TIMERS_H + +#include "qemu/timer.h" + +/* init the whole cpu timers API, including icount, ticks, and cpu_throttle */ +void cpu_timers_init(void); + +/* + * CPU Ticks and Clock + */ + +/* Caller must hold BQL */ +void cpu_enable_ticks(void); +/* Caller must hold BQL */ +void cpu_disable_ticks(void); + +/* + * return the time elapsed in VM between vm_start and vm_stop. + * cpu_get_ticks() uses units of the host CPU cycle counter. + */ +int64_t cpu_get_ticks(void); + +/* + * Returns the monotonic time elapsed in VM, i.e., + * the time between vm_start and vm_stop + */ +int64_t cpu_get_clock(void); + +void qemu_timer_notify_cb(void *opaque, QEMUClockType type); + +/* get/set VIRTUAL clock and VM elapsed ticks via the cpus accel interface */ +int64_t cpus_get_virtual_clock(void); +void cpus_set_virtual_clock(int64_t new_time); +int64_t cpus_get_elapsed_ticks(void); + +#endif /* SYSTEM_CPU_TIMERS_H */ diff --git a/include/system/cpus.h b/include/system/cpus.h new file mode 100644 index 0000000..3226c76 --- /dev/null +++ b/include/system/cpus.h @@ -0,0 +1,49 @@ +#ifndef QEMU_CPUS_H +#define QEMU_CPUS_H + +/* register accel-specific operations */ +void cpus_register_accel(const AccelOpsClass *i); + +/* return registers ops */ +const AccelOpsClass *cpus_get_accel(void); + +/* accel/dummy-cpus.c */ + +/* Create a dummy vcpu for AccelOpsClass->create_vcpu_thread */ +void dummy_start_vcpu_thread(CPUState *); + +/* interface available for cpus accelerator threads */ + +/* For temporary buffers for forming a name */ +#define VCPU_THREAD_NAME_SIZE 16 + +void cpus_kick_thread(CPUState *cpu); +bool cpu_work_list_empty(CPUState *cpu); +bool cpu_thread_is_idle(CPUState *cpu); +bool all_cpu_threads_idle(void); +bool cpu_can_run(CPUState *cpu); +void qemu_wait_io_event_common(CPUState *cpu); +void qemu_wait_io_event(CPUState *cpu); +void cpu_thread_signal_created(CPUState *cpu); +void cpu_thread_signal_destroyed(CPUState *cpu); +void cpu_handle_guest_debug(CPUState *cpu); + +/* end interface for cpus accelerator threads */ + +bool qemu_in_vcpu_thread(void); +void qemu_init_cpu_loop(void); +void resume_all_vcpus(void); +void pause_all_vcpus(void); +void cpu_stop_current(void); + +/* Unblock cpu */ +void qemu_cpu_kick_self(void); + +bool cpus_are_resettable(void); + +void cpu_synchronize_all_states(void); +void cpu_synchronize_all_post_reset(void); +void cpu_synchronize_all_post_init(void); +void cpu_synchronize_all_pre_loadvm(void); + +#endif diff --git a/include/system/cryptodev-vhost-user.h b/include/system/cryptodev-vhost-user.h new file mode 100644 index 0000000..5138c14 --- /dev/null +++ b/include/system/cryptodev-vhost-user.h @@ -0,0 +1,50 @@ +/* + * QEMU Crypto Device Common Vhost User Implement + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Gonglei <arei.gonglei@huawei.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#ifndef CRYPTODEV_VHOST_USER_H +#define CRYPTODEV_VHOST_USER_H + +#include "system/cryptodev-vhost.h" + +#define VHOST_USER_MAX_AUTH_KEY_LEN 512 +#define VHOST_USER_MAX_CIPHER_KEY_LEN 64 + + +/** + * cryptodev_vhost_user_get_vhost: + * @cc: the client object for each queue + * @b: the cryptodev backend common vhost object + * @queue: the queue index + * + * Gets a new cryptodev backend common vhost object based on + * @b and @queue + * + * Returns: the cryptodev backend common vhost object + */ +CryptoDevBackendVhost * +cryptodev_vhost_user_get_vhost( + CryptoDevBackendClient *cc, + CryptoDevBackend *b, + uint16_t queue); + +#endif /* CRYPTODEV_VHOST_USER_H */ diff --git a/include/system/cryptodev-vhost.h b/include/system/cryptodev-vhost.h new file mode 100644 index 0000000..b0bb09e --- /dev/null +++ b/include/system/cryptodev-vhost.h @@ -0,0 +1,153 @@ +/* + * QEMU Crypto Device Common Vhost Implement + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Gonglei <arei.gonglei@huawei.com> + * Jay Zhou <jianjay.zhou@huawei.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ +#ifndef CRYPTODEV_VHOST_H +#define CRYPTODEV_VHOST_H + +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "chardev/char.h" + +#include "system/cryptodev.h" + + +typedef struct CryptoDevBackendVhostOptions { + VhostBackendType backend_type; + void *opaque; + int total_queues; + CryptoDevBackendClient *cc; +} CryptoDevBackendVhostOptions; + +typedef struct CryptoDevBackendVhost { + struct vhost_dev dev; + struct vhost_virtqueue vqs[1]; + int backend; + CryptoDevBackendClient *cc; +} CryptoDevBackendVhost; + +/** + * cryptodev_vhost_get_max_queues: + * @crypto: the cryptodev backend common vhost object + * + * Get the maximum queue number of @crypto. 
+ * + * + * Returns: the maximum queue number + */ +uint64_t +cryptodev_vhost_get_max_queues( + CryptoDevBackendVhost *crypto); + + +/** + * cryptodev_vhost_init: + * @options: the common vhost object's option + * + * Creates a new cryptodev backend common vhost object + * + ** The returned object must be released with + * cryptodev_vhost_cleanup() when no + * longer required + * + * Returns: the cryptodev backend common vhost object + */ +struct CryptoDevBackendVhost * +cryptodev_vhost_init( + CryptoDevBackendVhostOptions *options); + +/** + * cryptodev_vhost_cleanup: + * @crypto: the cryptodev backend common vhost object + * + * Clean the resource associated with @crypto that realizaed + * by cryptodev_vhost_init() + * + */ +void cryptodev_vhost_cleanup( + CryptoDevBackendVhost *crypto); + +/** + * cryptodev_get_vhost: + * @cc: the client object for each queue + * @b: the cryptodev backend common vhost object + * @queue: the cryptodev backend queue index + * + * Gets a new cryptodev backend common vhost object based on + * @b and @queue + * + * Returns: the cryptodev backend common vhost object + */ +CryptoDevBackendVhost * +cryptodev_get_vhost(CryptoDevBackendClient *cc, + CryptoDevBackend *b, + uint16_t queue); +/** + * cryptodev_vhost_start: + * @dev: the virtio crypto object + * @total_queues: the total count of queue + * + * Starts the vhost crypto logic + * + * Returns: 0 for success, negative for errors + */ +int cryptodev_vhost_start(VirtIODevice *dev, int total_queues); + +/** + * cryptodev_vhost_stop: + * @dev: the virtio crypto object + * @total_queues: the total count of queue + * + * Stops the vhost crypto logic + * + */ +void cryptodev_vhost_stop(VirtIODevice *dev, int total_queues); + +/** + * cryptodev_vhost_virtqueue_mask: + * @dev: the virtio crypto object + * @queue: the cryptodev backend queue index + * @idx: the virtqueue index + * @mask: mask or not (true or false) + * + * Mask/unmask events for @idx virtqueue on @dev device + * + */ +void cryptodev_vhost_virtqueue_mask(VirtIODevice *dev, + int queue, + int idx, bool mask); + +/** + * cryptodev_vhost_virtqueue_pending: + * @dev: the virtio crypto object + * @queue: the cryptodev backend queue index + * @idx: the virtqueue index + * + * Test and clear event pending status for @idx virtqueue on @dev device. + * Should be called after unmask to avoid losing events. + * + * Returns: true for success, false for errors + */ +bool cryptodev_vhost_virtqueue_pending(VirtIODevice *dev, + int queue, int idx); + +#endif /* CRYPTODEV_VHOST_H */ diff --git a/include/system/cryptodev.h b/include/system/cryptodev.h new file mode 100644 index 0000000..b20822d --- /dev/null +++ b/include/system/cryptodev.h @@ -0,0 +1,447 @@ +/* + * QEMU Crypto Device Implementation + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Gonglei <arei.gonglei@huawei.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ +#ifndef CRYPTODEV_H +#define CRYPTODEV_H + +#include "qemu/queue.h" +#include "qemu/throttle.h" +#include "qom/object.h" +#include "qapi/qapi-types-cryptodev.h" + +/** + * CryptoDevBackend: + * + * The CryptoDevBackend object is an interface + * for different cryptodev backends, which provides crypto + * operation wrapper. + * + */ + +#define TYPE_CRYPTODEV_BACKEND "cryptodev-backend" + +OBJECT_DECLARE_TYPE(CryptoDevBackend, CryptoDevBackendClass, + CRYPTODEV_BACKEND) + + +#define MAX_CRYPTO_QUEUE_NUM 64 + +typedef struct CryptoDevBackendConf CryptoDevBackendConf; +typedef struct CryptoDevBackendPeers CryptoDevBackendPeers; +typedef struct CryptoDevBackendClient + CryptoDevBackendClient; + +/** + * CryptoDevBackendSymSessionInfo: + * + * @cipher_alg: algorithm type of CIPHER + * @key_len: byte length of cipher key + * @hash_alg: algorithm type of HASH/MAC + * @hash_result_len: byte length of HASH operation result + * @auth_key_len: byte length of authenticated key + * @add_len: byte length of additional authenticated data + * @op_type: operation type (refer to virtio_crypto.h) + * @direction: encryption or direction for CIPHER + * @hash_mode: HASH mode for HASH operation (refer to virtio_crypto.h) + * @alg_chain_order: order of algorithm chaining (CIPHER then HASH, + * or HASH then CIPHER) + * @cipher_key: point to a key of CIPHER + * @auth_key: point to an authenticated key of MAC + * + */ +typedef struct CryptoDevBackendSymSessionInfo { + /* corresponding with virtio crypto spec */ + uint32_t cipher_alg; + uint32_t key_len; + uint32_t hash_alg; + uint32_t hash_result_len; + uint32_t auth_key_len; + uint32_t add_len; + uint8_t op_type; + uint8_t direction; + uint8_t hash_mode; + uint8_t alg_chain_order; + uint8_t *cipher_key; + uint8_t *auth_key; +} CryptoDevBackendSymSessionInfo; + +/** + * CryptoDevBackendAsymSessionInfo: + */ +typedef struct CryptoDevBackendRsaPara { + uint32_t padding_algo; + uint32_t hash_algo; +} CryptoDevBackendRsaPara; + +typedef struct CryptoDevBackendAsymSessionInfo { + /* corresponding with virtio crypto spec */ + uint32_t algo; + uint32_t keytype; + uint32_t keylen; + uint8_t *key; + union { + CryptoDevBackendRsaPara rsa; + } u; +} CryptoDevBackendAsymSessionInfo; + +typedef struct CryptoDevBackendSessionInfo { + uint32_t op_code; + union { + CryptoDevBackendSymSessionInfo sym_sess_info; + CryptoDevBackendAsymSessionInfo asym_sess_info; + } u; + uint64_t session_id; +} CryptoDevBackendSessionInfo; + +/** + * CryptoDevBackendSymOpInfo: + * + * @aad_len: byte length of additional authenticated data + * @iv_len: byte length of initialization vector or counter + * @src_len: byte length of source data + * @dst_len: byte length of destination data + * @digest_result_len: byte length of hash digest result + * @hash_start_src_offset: Starting point for hash processing, specified + * as number of bytes from start of packet in source data, only used for + * algorithm chain + * @cipher_start_src_offset: Starting point for cipher processing, specified + * as number of bytes from start of packet in source data, only used for + * algorithm chain + * @len_to_hash: byte length of source data on which the hash + * operation will be computed, only used for algorithm chain + * @len_to_cipher: byte length of source data on which the cipher + * operation will be computed, only used for algorithm chain + * @op_type: 
operation type (refer to virtio_crypto.h) + * @iv: point to the initialization vector or counter + * @src: point to the source data + * @dst: point to the destination data + * @aad_data: point to the additional authenticated data + * @digest_result: point to the digest result data + * @data[0]: point to the extensional memory by one memory allocation + * + */ +typedef struct CryptoDevBackendSymOpInfo { + uint32_t aad_len; + uint32_t iv_len; + uint32_t src_len; + uint32_t dst_len; + uint32_t digest_result_len; + uint32_t hash_start_src_offset; + uint32_t cipher_start_src_offset; + uint32_t len_to_hash; + uint32_t len_to_cipher; + uint8_t op_type; + uint8_t *iv; + uint8_t *src; + uint8_t *dst; + uint8_t *aad_data; + uint8_t *digest_result; + uint8_t data[]; +} CryptoDevBackendSymOpInfo; + + +/** + * CryptoDevBackendAsymOpInfo: + * + * @src_len: byte length of source data + * @dst_len: byte length of destination data + * @src: point to the source data + * @dst: point to the destination data + * + */ +typedef struct CryptoDevBackendAsymOpInfo { + uint32_t src_len; + uint32_t dst_len; + uint8_t *src; + uint8_t *dst; +} CryptoDevBackendAsymOpInfo; + +typedef void (*CryptoDevCompletionFunc) (void *opaque, int ret); + +typedef struct CryptoDevBackendOpInfo { + QCryptodevBackendAlgoType algtype; + uint32_t op_code; + uint32_t queue_index; + CryptoDevCompletionFunc cb; + void *opaque; /* argument for cb */ + uint64_t session_id; + union { + CryptoDevBackendSymOpInfo *sym_op_info; + CryptoDevBackendAsymOpInfo *asym_op_info; + } u; + QTAILQ_ENTRY(CryptoDevBackendOpInfo) next; +} CryptoDevBackendOpInfo; + +struct CryptoDevBackendClass { + ObjectClass parent_class; + + void (*init)(CryptoDevBackend *backend, Error **errp); + void (*cleanup)(CryptoDevBackend *backend, Error **errp); + + int (*create_session)(CryptoDevBackend *backend, + CryptoDevBackendSessionInfo *sess_info, + uint32_t queue_index, + CryptoDevCompletionFunc cb, + void *opaque); + + int (*close_session)(CryptoDevBackend *backend, + uint64_t session_id, + uint32_t queue_index, + CryptoDevCompletionFunc cb, + void *opaque); + + int (*do_op)(CryptoDevBackend *backend, + CryptoDevBackendOpInfo *op_info); +}; + +struct CryptoDevBackendClient { + QCryptodevBackendType type; + char *info_str; + unsigned int queue_index; + int vring_enable; + QTAILQ_ENTRY(CryptoDevBackendClient) next; +}; + +struct CryptoDevBackendPeers { + CryptoDevBackendClient *ccs[MAX_CRYPTO_QUEUE_NUM]; + uint32_t queues; +}; + +struct CryptoDevBackendConf { + CryptoDevBackendPeers peers; + + /* Supported service mask */ + uint32_t crypto_services; + + /* Detailed algorithms mask */ + uint32_t cipher_algo_l; + uint32_t cipher_algo_h; + uint32_t hash_algo; + uint32_t mac_algo_l; + uint32_t mac_algo_h; + uint32_t aead_algo; + uint32_t akcipher_algo; + /* Maximum length of cipher key */ + uint32_t max_cipher_key_len; + /* Maximum length of authenticated key */ + uint32_t max_auth_key_len; + /* Maximum size of each crypto request's content */ + uint64_t max_size; +}; + +typedef struct CryptodevBackendSymStat { + int64_t encrypt_ops; + int64_t decrypt_ops; + int64_t encrypt_bytes; + int64_t decrypt_bytes; +} CryptodevBackendSymStat; + +typedef struct CryptodevBackendAsymStat { + int64_t encrypt_ops; + int64_t decrypt_ops; + int64_t sign_ops; + int64_t verify_ops; + int64_t encrypt_bytes; + int64_t decrypt_bytes; + int64_t sign_bytes; + int64_t verify_bytes; +} CryptodevBackendAsymStat; + +struct CryptoDevBackend { + Object parent_obj; + + bool ready; + /* Tag the cryptodev 
backend is used by virtio-crypto or not */ + bool is_used; + CryptoDevBackendConf conf; + CryptodevBackendSymStat *sym_stat; + CryptodevBackendAsymStat *asym_stat; + + ThrottleState ts; + ThrottleTimers tt; + ThrottleConfig tc; + QTAILQ_HEAD(, CryptoDevBackendOpInfo) opinfos; +}; + +#define CryptodevSymStatInc(be, op, bytes) do { \ + be->sym_stat->op##_bytes += (bytes); \ + be->sym_stat->op##_ops += 1; \ +} while (/*CONSTCOND*/0) + +#define CryptodevSymStatIncEncrypt(be, bytes) \ + CryptodevSymStatInc(be, encrypt, bytes) + +#define CryptodevSymStatIncDecrypt(be, bytes) \ + CryptodevSymStatInc(be, decrypt, bytes) + +#define CryptodevAsymStatInc(be, op, bytes) do { \ + be->asym_stat->op##_bytes += (bytes); \ + be->asym_stat->op##_ops += 1; \ +} while (/*CONSTCOND*/0) + +#define CryptodevAsymStatIncEncrypt(be, bytes) \ + CryptodevAsymStatInc(be, encrypt, bytes) + +#define CryptodevAsymStatIncDecrypt(be, bytes) \ + CryptodevAsymStatInc(be, decrypt, bytes) + +#define CryptodevAsymStatIncSign(be, bytes) \ + CryptodevAsymStatInc(be, sign, bytes) + +#define CryptodevAsymStatIncVerify(be, bytes) \ + CryptodevAsymStatInc(be, verify, bytes) + + +/** + * cryptodev_backend_new_client: + * + * Creates a new cryptodev backend client object. + * + * The returned object must be released with + * cryptodev_backend_free_client() when no + * longer required + * + * Returns: a new cryptodev backend client object + */ +CryptoDevBackendClient *cryptodev_backend_new_client(void); + +/** + * cryptodev_backend_free_client: + * @cc: the cryptodev backend client object + * + * Release the memory associated with @cc that + * was previously allocated by cryptodev_backend_new_client() + */ +void cryptodev_backend_free_client( + CryptoDevBackendClient *cc); + +/** + * cryptodev_backend_cleanup: + * @backend: the cryptodev backend object + * @errp: pointer to a NULL-initialized error object + * + * Clean the resource associated with @backend that realizaed + * by the specific backend's init() callback + */ +void cryptodev_backend_cleanup( + CryptoDevBackend *backend, + Error **errp); + +/** + * cryptodev_backend_create_session: + * @backend: the cryptodev backend object + * @sess_info: parameters needed by session creating + * @queue_index: queue index of cryptodev backend client + * @errp: pointer to a NULL-initialized error object + * @cb: callback when session create is compeleted + * @opaque: parameter passed to callback + * + * Create a session for symmetric/asymmetric algorithms + * + * Returns: 0 for success and cb will be called when creation is completed, + * negative value for error, and cb will not be called. + */ +int cryptodev_backend_create_session( + CryptoDevBackend *backend, + CryptoDevBackendSessionInfo *sess_info, + uint32_t queue_index, + CryptoDevCompletionFunc cb, + void *opaque); + +/** + * cryptodev_backend_close_session: + * @backend: the cryptodev backend object + * @session_id: the session id + * @queue_index: queue index of cryptodev backend client + * @errp: pointer to a NULL-initialized error object + * @cb: callback when session create is compeleted + * @opaque: parameter passed to callback + * + * Close a session for which was previously + * created by cryptodev_backend_create_session() + * + * Returns: 0 for success and cb will be called when creation is completed, + * negative value for error, and cb will not be called. 
+ */ +int cryptodev_backend_close_session( + CryptoDevBackend *backend, + uint64_t session_id, + uint32_t queue_index, + CryptoDevCompletionFunc cb, + void *opaque); + +/** + * cryptodev_backend_crypto_operation: + * @backend: the cryptodev backend object + * @op_info: pointer to a CryptoDevBackendOpInfo object + * + * Perform a crypto operation, such as encryption, decryption, signing and + * verification + * + * Returns: 0 for success and cb will be called when the operation is completed, + * negative value for error, and cb will not be called. + */ +int cryptodev_backend_crypto_operation( + CryptoDevBackend *backend, + CryptoDevBackendOpInfo *op_info); + +/** + * cryptodev_backend_set_used: + * @backend: the cryptodev backend object + * @used: true or false + * + * Set whether the cryptodev backend is used by virtio-crypto or not + */ +void cryptodev_backend_set_used(CryptoDevBackend *backend, bool used); + +/** + * cryptodev_backend_is_used: + * @backend: the cryptodev backend object + * + * Return whether the cryptodev backend is used + * by virtio-crypto or not + * + * Returns: true if used, false otherwise + */ +bool cryptodev_backend_is_used(CryptoDevBackend *backend); + +/** + * cryptodev_backend_set_ready: + * @backend: the cryptodev backend object + * @ready: true or false + * + * Set whether the cryptodev backend is ready or not; this is called + * by the children of the cryptodev backend interface. + */ +void cryptodev_backend_set_ready(CryptoDevBackend *backend, bool ready); + +/** + * cryptodev_backend_is_ready: + * @backend: the cryptodev backend object + * + * Return whether the cryptodev backend is ready or not + * + * Returns: true if ready, false otherwise + */ +bool cryptodev_backend_is_ready(CryptoDevBackend *backend); + +#endif /* CRYPTODEV_H */ diff --git a/include/system/device_tree.h b/include/system/device_tree.h new file mode 100644 index 0000000..49d8482 --- /dev/null +++ b/include/system/device_tree.h @@ -0,0 +1,213 @@ +/* + * Header with function prototypes to help device tree manipulation using + * libfdt. It also provides functions to read entries from the device tree + * proc interface. + * + * Copyright 2008 IBM Corporation. + * Authors: Jerone Young <jyoung5@us.ibm.com> + * Hollis Blanchard <hollisb@us.ibm.com> + * + * This work is licensed under the GNU GPL license version 2 or later. + * + */ + +#ifndef DEVICE_TREE_H +#define DEVICE_TREE_H + +void *create_device_tree(int *sizep); +void *load_device_tree(const char *filename_path, int *sizep); +#ifdef CONFIG_LINUX +/** + * load_device_tree_from_sysfs: reads the device tree information in the + * /proc/device-tree directory and returns the corresponding binary blob + * buffer pointer. Asserts in case of error. + */ +void *load_device_tree_from_sysfs(void); +#endif + +/** + * qemu_fdt_node_path: return the paths of nodes matching a given + * name and compat string + * @fdt: pointer to the dt blob + * @name: node name + * @compat: compatibility string + * @errp: handle to an error object + * + * returns a newly allocated NULL-terminated array of node paths. + * Use g_strfreev() to free it. If one or more nodes were found, the + * array contains the path of each node and the last element equals + * NULL. If there is no error but no matching node was found, the + * returned array contains a single element equal to NULL. If an error + * was encountered when parsing the blob, the function returns NULL. + * + * @name may be NULL to wildcard names and only match compatibility + * strings. 
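As a hedged example (not part of device_tree.h), a caller could consume the NULL-terminated array returned by qemu_fdt_node_path() like this; the compatible string "arm,pl011" and the helper name are illustrative, and error_report_err() is assumed to be available from qapi/error.h:

static void example_list_pl011_nodes(void *fdt)
{
    Error *err = NULL;
    char **paths = qemu_fdt_node_path(fdt, NULL, "arm,pl011", &err);
    int i;

    if (!paths) {
        error_report_err(err);  /* the blob could not be parsed */
        return;
    }
    for (i = 0; paths[i]; i++) {
        printf("match: %s\n", paths[i]);
    }
    g_strfreev(paths);          /* frees both the array and its strings */
}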
+ */ +char **qemu_fdt_node_path(void *fdt, const char *name, const char *compat, + Error **errp); + +/** + * qemu_fdt_node_unit_path: return the paths of nodes matching a given + * node-name, ie. node-name and node-name@unit-address + * @fdt: pointer to the dt blob + * @name: node name + * @errp: handle to an error object + * + * returns a newly allocated NULL-terminated array of node paths. + * Use g_strfreev() to free it. If one or more nodes were found, the + * array contains the path of each node and the last element equals to + * NULL. If there is no error but no matching node was found, the + * returned array contains a single element equal to NULL. If an error + * was encountered when parsing the blob, the function returns NULL + */ +char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp); + +int qemu_fdt_setprop(void *fdt, const char *node_path, + const char *property, const void *val, int size); +int qemu_fdt_setprop_cell(void *fdt, const char *node_path, + const char *property, uint32_t val); +int qemu_fdt_setprop_u64(void *fdt, const char *node_path, + const char *property, uint64_t val); +int qemu_fdt_setprop_string(void *fdt, const char *node_path, + const char *property, const char *string); + +/** + * qemu_fdt_setprop_string_array: set a string array property + * + * @fdt: pointer to the dt blob + * @name: node name + * @prop: property array + * @array: pointer to an array of string pointers + * @len: length of array + * + * assigns a string array to a property. This function converts and + * array of strings to a sequential string with \0 separators before + * setting the property. + */ +int qemu_fdt_setprop_string_array(void *fdt, const char *node_path, + const char *prop, char **array, int len); + +int qemu_fdt_setprop_phandle(void *fdt, const char *node_path, + const char *property, + const char *target_node_path); +/** + * qemu_fdt_getprop: retrieve the value of a given property + * @fdt: pointer to the device tree blob + * @node_path: node path + * @property: name of the property to find + * @lenp: fdt error if any or length of the property on success + * @errp: handle to an error object + * + * returns a pointer to the property on success and NULL on failure + */ +const void *qemu_fdt_getprop(void *fdt, const char *node_path, + const char *property, int *lenp, + Error **errp); +/** + * qemu_fdt_getprop_cell: retrieve the value of a given 4 byte property + * @fdt: pointer to the device tree blob + * @node_path: node path + * @property: name of the property to find + * @lenp: fdt error if any or -EINVAL if the property size is different from + * 4 bytes, or 4 (expected length of the property) upon success. + * @errp: handle to an error object + * + * returns the property value on success + */ +uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path, + const char *property, int *lenp, + Error **errp); +uint32_t qemu_fdt_get_phandle(void *fdt, const char *path); +uint32_t qemu_fdt_alloc_phandle(void *fdt); +int qemu_fdt_nop_node(void *fdt, const char *node_path); +int qemu_fdt_add_subnode(void *fdt, const char *name); +int qemu_fdt_add_path(void *fdt, const char *path); + +#define qemu_fdt_setprop_cells(fdt, node_path, property, ...) 
\ + do { \ + uint32_t qdt_tmp[] = { __VA_ARGS__ }; \ + for (unsigned i_ = 0; i_ < ARRAY_SIZE(qdt_tmp); i_++) { \ + qdt_tmp[i_] = cpu_to_be32(qdt_tmp[i_]); \ + } \ + qemu_fdt_setprop(fdt, node_path, property, qdt_tmp, \ + sizeof(qdt_tmp)); \ + } while (0) + +/** + * qemu_fdt_setprop_sized_cells_from_array: + * @fdt: device tree blob + * @node_path: node to set property on + * @property: property to set + * @numvalues: number of values + * @values: array of number-of-cells, value pairs + * + * Set the specified property on the specified node in the device tree + * to be an array of cells. The values of the cells are specified via + * the values list, which alternates between "number of cells used by + * this value" and "value". + * number-of-cells must be either 1 or 2 (other values will result in + * an error being returned). If a value is too large to fit in the + * number of cells specified for it, an error is returned. + * + * This function is useful because device tree nodes often have cell arrays + * which are either lists of addresses or lists of address,size tuples, but + * the number of cells used for each element vary depending on the + * #address-cells and #size-cells properties of their parent node. + * If you know all your cell elements are one cell wide you can use the + * simpler qemu_fdt_setprop_cells(). If you're not setting up the + * array programmatically, qemu_fdt_setprop_sized_cells may be more + * convenient. + * + * Return value: 0 on success, <0 on error. + */ +int qemu_fdt_setprop_sized_cells_from_array(void *fdt, + const char *node_path, + const char *property, + int numvalues, + uint64_t *values); + +/** + * qemu_fdt_setprop_sized_cells: + * @fdt: device tree blob + * @node_path: node to set property on + * @property: property to set + * @...: list of number-of-cells, value pairs + * + * Set the specified property on the specified node in the device tree + * to be an array of cells. The values of the cells are specified via + * the variable arguments, which alternates between "number of cells + * used by this value" and "value". + * + * This is a convenience wrapper for the function + * qemu_fdt_setprop_sized_cells_from_array(). + * + * Return value: 0 on success, <0 on error. + */ +#define qemu_fdt_setprop_sized_cells(fdt, node_path, property, ...) \ + ({ \ + uint64_t qdt_tmp[] = { __VA_ARGS__ }; \ + qemu_fdt_setprop_sized_cells_from_array(fdt, node_path, \ + property, \ + ARRAY_SIZE(qdt_tmp) / 2, \ + qdt_tmp); \ + }) + + +/** + * qemu_fdt_randomize_seeds: + * @fdt: device tree blob + * + * Re-randomize all "rng-seed" properties with new seeds. + */ +void qemu_fdt_randomize_seeds(void *fdt); + +#define FDT_PCI_RANGE_RELOCATABLE 0x80000000 +#define FDT_PCI_RANGE_PREFETCHABLE 0x40000000 +#define FDT_PCI_RANGE_ALIASED 0x20000000 +#define FDT_PCI_RANGE_TYPE_MASK 0x03000000 +#define FDT_PCI_RANGE_MMIO_64BIT 0x03000000 +#define FDT_PCI_RANGE_MMIO 0x02000000 +#define FDT_PCI_RANGE_IOPORT 0x01000000 +#define FDT_PCI_RANGE_CONFIG 0x00000000 + +#endif /* DEVICE_TREE_H */ diff --git a/include/system/dirtylimit.h b/include/system/dirtylimit.h new file mode 100644 index 0000000..d11ebbb --- /dev/null +++ b/include/system/dirtylimit.h @@ -0,0 +1,39 @@ +/* + * Dirty page rate limit common functions + * + * Copyright (c) 2022 CHINA TELECOM CO.,LTD. + * + * Authors: + * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#ifndef QEMU_DIRTYRLIMIT_H +#define QEMU_DIRTYRLIMIT_H + +#define DIRTYLIMIT_CALC_TIME_MS 1000 /* 1000ms */ + +int64_t vcpu_dirty_rate_get(int cpu_index); +void vcpu_dirty_rate_stat_start(void); +void vcpu_dirty_rate_stat_stop(void); +void vcpu_dirty_rate_stat_initialize(void); +void vcpu_dirty_rate_stat_finalize(void); + +void dirtylimit_state_lock(void); +void dirtylimit_state_unlock(void); +void dirtylimit_state_initialize(void); +void dirtylimit_state_finalize(void); +bool dirtylimit_in_service(void); +bool dirtylimit_vcpu_index_valid(int cpu_index); +void dirtylimit_process(void); +void dirtylimit_change(bool start); +void dirtylimit_set_vcpu(int cpu_index, + uint64_t quota, + bool enable); +void dirtylimit_set_all(uint64_t quota, + bool enable); +void dirtylimit_vcpu_execute(CPUState *cpu); +uint64_t dirtylimit_throttle_time_per_round(void); +uint64_t dirtylimit_ring_full_time(void); +#endif diff --git a/include/system/dirtyrate.h b/include/system/dirtyrate.h new file mode 100644 index 0000000..20813f3 --- /dev/null +++ b/include/system/dirtyrate.h @@ -0,0 +1,30 @@ +/* + * dirty page rate helper functions + * + * Copyright (c) 2022 CHINA TELECOM CO.,LTD. + * + * Authors: + * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_DIRTYRATE_H +#define QEMU_DIRTYRATE_H + +#include "qapi/qapi-types-migration.h" + +typedef struct VcpuStat { + int nvcpu; /* number of vcpu */ + DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */ +} VcpuStat; + +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms, + VcpuStat *stat, + unsigned int flag, + bool one_shot); + +void global_dirty_log_change(unsigned int flag, + bool start); +#endif diff --git a/include/system/dma.h b/include/system/dma.h new file mode 100644 index 0000000..82e7ad5 --- /dev/null +++ b/include/system/dma.h @@ -0,0 +1,322 @@ +/* + * DMA helper functions + * + * Copyright (c) 2009, 2020 Red Hat + * + * This work is licensed under the terms of the GNU General Public License + * (GNU GPL), version 2 or later. + */ + +#ifndef DMA_H +#define DMA_H + +#include "system/memory.h" +#include "system/address-spaces.h" +#include "block/block.h" +#include "block/accounting.h" + +typedef enum { + DMA_DIRECTION_TO_DEVICE = 0, + DMA_DIRECTION_FROM_DEVICE = 1, +} DMADirection; + +/* + * When an IOMMU is present, bus addresses become distinct from + * CPU/memory physical addresses and may be a different size. Because + * the IOVA size depends more on the bus than on the platform, we more + * or less have to treat these as 64-bit always to cover all (or at + * least most) cases. + */ +typedef uint64_t dma_addr_t; + +#define DMA_ADDR_BITS 64 +#define DMA_ADDR_FMT "%" PRIx64 + +typedef struct ScatterGatherEntry ScatterGatherEntry; + +struct QEMUSGList { + ScatterGatherEntry *sg; + int nsg; + int nalloc; + dma_addr_t size; + DeviceState *dev; + AddressSpace *as; +}; + +static inline void dma_barrier(AddressSpace *as, DMADirection dir) +{ + /* + * This is called before DMA read and write operations + * unless the _relaxed form is used and is responsible + * for providing some sane ordering of accesses vs + * concurrently running VCPUs. + * + * Users of map(), unmap() or lower level st/ld_* + * operations are responsible for providing their own + * ordering via barriers. 
+ * + * This primitive implementation does a simple smp_mb() + * before each operation which provides pretty much full + * ordering. + * + * A smarter implementation can be devised if needed to + * use lighter barriers based on the direction of the + * transfer, the DMA context, etc... + */ + smp_mb(); +} + +/* Checks that the given range of addresses is valid for DMA. This is + * useful for certain cases, but usually you should just use + * dma_memory_{read,write}() and check for errors */ +static inline bool dma_memory_valid(AddressSpace *as, + dma_addr_t addr, dma_addr_t len, + DMADirection dir, MemTxAttrs attrs) +{ + return address_space_access_valid(as, addr, len, + dir == DMA_DIRECTION_FROM_DEVICE, + attrs); +} + +static inline MemTxResult dma_memory_rw_relaxed(AddressSpace *as, + dma_addr_t addr, + void *buf, dma_addr_t len, + DMADirection dir, + MemTxAttrs attrs) +{ + return address_space_rw(as, addr, attrs, + buf, len, dir == DMA_DIRECTION_FROM_DEVICE); +} + +static inline MemTxResult dma_memory_read_relaxed(AddressSpace *as, + dma_addr_t addr, + void *buf, dma_addr_t len) +{ + return dma_memory_rw_relaxed(as, addr, buf, len, + DMA_DIRECTION_TO_DEVICE, + MEMTXATTRS_UNSPECIFIED); +} + +static inline MemTxResult dma_memory_write_relaxed(AddressSpace *as, + dma_addr_t addr, + const void *buf, + dma_addr_t len) +{ + return dma_memory_rw_relaxed(as, addr, (void *)buf, len, + DMA_DIRECTION_FROM_DEVICE, + MEMTXATTRS_UNSPECIFIED); +} + +/** + * dma_memory_rw: Read from or write to an address space from DMA controller. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @buf: buffer with the data transferred + * @len: the number of bytes to read or write + * @dir: indicates the transfer direction + * @attrs: memory transaction attributes + */ +static inline MemTxResult dma_memory_rw(AddressSpace *as, dma_addr_t addr, + void *buf, dma_addr_t len, + DMADirection dir, MemTxAttrs attrs) +{ + dma_barrier(as, dir); + + return dma_memory_rw_relaxed(as, addr, buf, len, dir, attrs); +} + +/** + * dma_memory_read: Read from an address space from DMA controller. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). Called within RCU critical section. + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @buf: buffer with the data transferred + * @len: length of the data transferred + * @attrs: memory transaction attributes + */ +static inline MemTxResult dma_memory_read(AddressSpace *as, dma_addr_t addr, + void *buf, dma_addr_t len, + MemTxAttrs attrs) +{ + return dma_memory_rw(as, addr, buf, len, + DMA_DIRECTION_TO_DEVICE, attrs); +} + +/** + * dma_memory_write: Write to address space from DMA controller. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). 
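A minimal sketch (not part of dma.h) of a device model fetching a small descriptor with dma_memory_read() and checking the MemTxResult; the descriptor layout and function name are invented for illustration:

typedef struct ExampleDesc {
    uint64_t addr;
    uint32_t len;
    uint32_t flags;
} ExampleDesc;

static bool example_fetch_desc(AddressSpace *as, dma_addr_t desc_addr,
                               ExampleDesc *desc)
{
    MemTxResult res = dma_memory_read(as, desc_addr, desc, sizeof(*desc),
                                      MEMTXATTRS_UNSPECIFIED);
    /* treat anything other than MEMTX_OK as a failed DMA transaction */
    return res == MEMTX_OK;
}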
+ * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @buf: buffer with the data transferred + * @len: the number of bytes to write + * @attrs: memory transaction attributes + */ +static inline MemTxResult dma_memory_write(AddressSpace *as, dma_addr_t addr, + const void *buf, dma_addr_t len, + MemTxAttrs attrs) +{ + return dma_memory_rw(as, addr, (void *)buf, len, + DMA_DIRECTION_FROM_DEVICE, attrs); +} + +/** + * dma_memory_set: Fill memory with a constant byte from DMA controller. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @c: constant byte to fill the memory + * @len: the number of bytes to fill with the constant byte + * @attrs: memory transaction attributes + */ +MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr, + uint8_t c, dma_addr_t len, MemTxAttrs attrs); + +/** + * dma_memory_map: Map a physical memory region into a host virtual address. + * + * May map a subset of the requested range, given by and returned in @plen. + * May return %NULL and set *@plen to zero(0), if resources needed to perform + * the mapping are exhausted. + * Use only for reads OR writes - not for read-modify-write operations. + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @len: pointer to length of buffer; updated on return + * @dir: indicates the transfer direction + * @attrs: memory attributes + */ +static inline void *dma_memory_map(AddressSpace *as, + dma_addr_t addr, dma_addr_t *len, + DMADirection dir, MemTxAttrs attrs) +{ + hwaddr xlen = *len; + void *p; + + p = address_space_map(as, addr, &xlen, dir == DMA_DIRECTION_FROM_DEVICE, + attrs); + *len = xlen; + return p; +} + +/** + * dma_memory_unmap: Unmaps a memory region previously mapped by dma_memory_map() + * + * Will also mark the memory as dirty if @dir == %DMA_DIRECTION_FROM_DEVICE. + * @access_len gives the amount of memory that was actually read or written + * by the caller. 
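A hedged sketch (not part of dma.h) of the dma_memory_map()/dma_memory_unmap() pattern described above, including the partial-mapping fallback; the names are placeholders:

static void example_fill_guest_buffer(AddressSpace *as, dma_addr_t addr,
                                      dma_addr_t len)
{
    dma_addr_t mapped_len = len;
    void *host = dma_memory_map(as, addr, &mapped_len,
                                DMA_DIRECTION_FROM_DEVICE,
                                MEMTXATTRS_UNSPECIFIED);

    if (!host || mapped_len < len) {
        /* mapping failed or was truncated: release it and bounce instead */
        if (host) {
            dma_memory_unmap(as, host, mapped_len,
                             DMA_DIRECTION_FROM_DEVICE, 0);
        }
        return;
    }
    memset(host, 0, len);   /* the "device" produces its output here */
    dma_memory_unmap(as, host, mapped_len,
                     DMA_DIRECTION_FROM_DEVICE, len);
}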
+ * + * @as: #AddressSpace used + * @buffer: host pointer as returned by dma_memory_map() + * @len: buffer length as returned by dma_memory_map() + * @dir: indicates the transfer direction + * @access_len: amount of data actually transferred + */ +static inline void dma_memory_unmap(AddressSpace *as, + void *buffer, dma_addr_t len, + DMADirection dir, dma_addr_t access_len) +{ + address_space_unmap(as, buffer, (hwaddr)len, + dir == DMA_DIRECTION_FROM_DEVICE, access_len); +} + +#define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \ + static inline MemTxResult ld##_lname##_##_end##_dma(AddressSpace *as, \ + dma_addr_t addr, \ + uint##_bits##_t *pval, \ + MemTxAttrs attrs) \ + { \ + MemTxResult res = dma_memory_read(as, addr, pval, (_bits) / 8, attrs); \ + _end##_bits##_to_cpus(pval); \ + return res; \ + } \ + static inline MemTxResult st##_sname##_##_end##_dma(AddressSpace *as, \ + dma_addr_t addr, \ + uint##_bits##_t val, \ + MemTxAttrs attrs) \ + { \ + val = cpu_to_##_end##_bits(val); \ + return dma_memory_write(as, addr, &val, (_bits) / 8, attrs); \ + } + +static inline MemTxResult ldub_dma(AddressSpace *as, dma_addr_t addr, + uint8_t *val, MemTxAttrs attrs) +{ + return dma_memory_read(as, addr, val, 1, attrs); +} + +static inline MemTxResult stb_dma(AddressSpace *as, dma_addr_t addr, + uint8_t val, MemTxAttrs attrs) +{ + return dma_memory_write(as, addr, &val, 1, attrs); +} + +DEFINE_LDST_DMA(uw, w, 16, le); +DEFINE_LDST_DMA(l, l, 32, le); +DEFINE_LDST_DMA(q, q, 64, le); +DEFINE_LDST_DMA(uw, w, 16, be); +DEFINE_LDST_DMA(l, l, 32, be); +DEFINE_LDST_DMA(q, q, 64, be); + +#undef DEFINE_LDST_DMA + +struct ScatterGatherEntry { + dma_addr_t base; + dma_addr_t len; +}; + +void qemu_sglist_init(QEMUSGList *qsg, DeviceState *dev, int alloc_hint, + AddressSpace *as); +void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len); +void qemu_sglist_destroy(QEMUSGList *qsg); + +typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque); + +BlockAIOCB *dma_blk_io(QEMUSGList *sg, uint64_t offset, uint32_t align, + DMAIOFunc *io_func, void *io_func_opaque, + BlockCompletionFunc *cb, void *opaque, DMADirection dir); +BlockAIOCB *dma_blk_read(BlockBackend *blk, + QEMUSGList *sg, uint64_t offset, uint32_t align, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *dma_blk_write(BlockBackend *blk, + QEMUSGList *sg, uint64_t offset, uint32_t align, + BlockCompletionFunc *cb, void *opaque); +MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual, + QEMUSGList *sg, MemTxAttrs attrs); +MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual, + QEMUSGList *sg, MemTxAttrs attrs); + +void dma_acct_start(BlockBackend *blk, BlockAcctCookie *cookie, + QEMUSGList *sg, enum BlockAcctType type); + +/** + * dma_aligned_pow2_mask: Return the address bit mask of the largest + * power of 2 size less or equal than @end - @start + 1, aligned with @start, + * and bounded by 1 << @max_addr_bits bits. + * + * @start: range start address + * @end: range end address (greater than @start) + * @max_addr_bits: max address bits (<= 64) + */ +uint64_t dma_aligned_pow2_mask(uint64_t start, uint64_t end, + int max_addr_bits); + +#endif diff --git a/include/system/dump-arch.h b/include/system/dump-arch.h new file mode 100644 index 0000000..743916e --- /dev/null +++ b/include/system/dump-arch.h @@ -0,0 +1,35 @@ +/* + * QEMU dump + * + * Copyright Fujitsu, Corp. 
2011, 2012 + * + * Authors: + * Wen Congyang <wency@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef DUMP_ARCH_H +#define DUMP_ARCH_H + +typedef struct ArchDumpInfo { + int d_machine; /* Architecture */ + int d_endian; /* ELFDATA2LSB or ELFDATA2MSB */ + int d_class; /* ELFCLASS32 or ELFCLASS64 */ + uint32_t page_size; /* The target's page size. If it's variable and + * unknown, then this should be the maximum. */ + uint64_t phys_base; /* The target's physmem base. */ + void (*arch_sections_add_fn)(DumpState *s); + uint64_t (*arch_sections_write_hdr_fn)(DumpState *s, uint8_t *buff); + int (*arch_sections_write_fn)(DumpState *s, uint8_t *buff); + void (*arch_cleanup_fn)(DumpState *s); +} ArchDumpInfo; + +struct GuestPhysBlockList; /* memory_mapping.h */ +int cpu_get_dump_info(ArchDumpInfo *info, + const struct GuestPhysBlockList *guest_phys_blocks); +ssize_t cpu_get_note_size(int class, int machine, int nr_cpus); + +#endif diff --git a/include/system/dump.h b/include/system/dump.h new file mode 100644 index 0000000..607bd7b --- /dev/null +++ b/include/system/dump.h @@ -0,0 +1,225 @@ +/* + * QEMU dump + * + * Copyright Fujitsu, Corp. 2011, 2012 + * + * Authors: + * Wen Congyang <wency@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef DUMP_H +#define DUMP_H + +#include "qapi/qapi-types-dump.h" +#include "qemu/thread.h" + +#define MAKEDUMPFILE_SIGNATURE "makedumpfile" +#define MAX_SIZE_MDF_HEADER (4096) /* max size of makedumpfile_header */ +#define TYPE_FLAT_HEADER (1) /* type of flattened format */ +#define VERSION_FLAT_HEADER (1) /* version of flattened format */ +#define END_FLAG_FLAT_HEADER (-1) + +#ifndef ARCH_PFN_OFFSET +#define ARCH_PFN_OFFSET (0) +#endif + +/* + * flag for compressed format + */ +#define DUMP_DH_COMPRESSED_ZLIB (0x1) +#define DUMP_DH_COMPRESSED_LZO (0x2) +#define DUMP_DH_COMPRESSED_SNAPPY (0x4) + +#define KDUMP_SIGNATURE "KDUMP " +#define SIG_LEN (sizeof(KDUMP_SIGNATURE) - 1) +#define DUMP_LEVEL (1) +#define DISKDUMP_HEADER_BLOCKS (1) + +#include "system/dump-arch.h" +#include "system/memory_mapping.h" + +typedef struct QEMU_PACKED MakedumpfileHeader { + char signature[16]; /* = "makedumpfile" */ + int64_t type; + int64_t version; +} MakedumpfileHeader; + +typedef struct QEMU_PACKED MakedumpfileDataHeader { + int64_t offset; + int64_t buf_size; +} MakedumpfileDataHeader; + +typedef struct QEMU_PACKED NewUtsname { + char sysname[65]; + char nodename[65]; + char release[65]; + char version[65]; + char machine[65]; + char domainname[65]; +} NewUtsname; + +typedef struct QEMU_PACKED DiskDumpHeader32 { + char signature[SIG_LEN]; /* = "KDUMP " */ + uint32_t header_version; /* Dump header version */ + NewUtsname utsname; /* copy of system_utsname */ + char timestamp[10]; /* Time stamp */ + uint32_t status; /* Above flags */ + uint32_t block_size; /* Size of a block in byte */ + uint32_t sub_hdr_size; /* Size of arch dependent header in block */ + uint32_t bitmap_blocks; /* Size of Memory bitmap in block */ + uint32_t max_mapnr; /* = max_mapnr , + obsoleted in header_version 6 */ + uint32_t total_ram_blocks; /* Number of blocks should be written */ + uint32_t device_blocks; /* Number of total blocks in dump device */ + uint32_t written_blocks; /* Number of written blocks */ + uint32_t current_cpu; /* CPU# which handles dump */ + uint32_t 
nr_cpus; /* Number of CPUs */ +} DiskDumpHeader32; + +typedef struct QEMU_PACKED DiskDumpHeader64 { + char signature[SIG_LEN]; /* = "KDUMP " */ + uint32_t header_version; /* Dump header version */ + NewUtsname utsname; /* copy of system_utsname */ + char timestamp[22]; /* Time stamp */ + uint32_t status; /* Above flags */ + uint32_t block_size; /* Size of a block in byte */ + uint32_t sub_hdr_size; /* Size of arch dependent header in block */ + uint32_t bitmap_blocks; /* Size of Memory bitmap in block */ + uint32_t max_mapnr; /* = max_mapnr, + obsoleted in header_version 6 */ + uint32_t total_ram_blocks; /* Number of blocks should be written */ + uint32_t device_blocks; /* Number of total blocks in dump device */ + uint32_t written_blocks; /* Number of written blocks */ + uint32_t current_cpu; /* CPU# which handles dump */ + uint32_t nr_cpus; /* Number of CPUs */ +} DiskDumpHeader64; + +typedef struct QEMU_PACKED KdumpSubHeader32 { + uint32_t phys_base; + uint32_t dump_level; /* header_version 1 and later */ + uint32_t split; /* header_version 2 and later */ + uint32_t start_pfn; /* header_version 2 and later, + obsoleted in header_version 6 */ + uint32_t end_pfn; /* header_version 2 and later, + obsoleted in header_version 6 */ + uint64_t offset_vmcoreinfo; /* header_version 3 and later */ + uint32_t size_vmcoreinfo; /* header_version 3 and later */ + uint64_t offset_note; /* header_version 4 and later */ + uint32_t note_size; /* header_version 4 and later */ + uint64_t offset_eraseinfo; /* header_version 5 and later */ + uint32_t size_eraseinfo; /* header_version 5 and later */ + uint64_t start_pfn_64; /* header_version 6 and later */ + uint64_t end_pfn_64; /* header_version 6 and later */ + uint64_t max_mapnr_64; /* header_version 6 and later */ +} KdumpSubHeader32; + +typedef struct QEMU_PACKED KdumpSubHeader64 { + uint64_t phys_base; + uint32_t dump_level; /* header_version 1 and later */ + uint32_t split; /* header_version 2 and later */ + uint64_t start_pfn; /* header_version 2 and later, + obsoleted in header_version 6 */ + uint64_t end_pfn; /* header_version 2 and later, + obsoleted in header_version 6 */ + uint64_t offset_vmcoreinfo; /* header_version 3 and later */ + uint64_t size_vmcoreinfo; /* header_version 3 and later */ + uint64_t offset_note; /* header_version 4 and later */ + uint64_t note_size; /* header_version 4 and later */ + uint64_t offset_eraseinfo; /* header_version 5 and later */ + uint64_t size_eraseinfo; /* header_version 5 and later */ + uint64_t start_pfn_64; /* header_version 6 and later */ + uint64_t end_pfn_64; /* header_version 6 and later */ + uint64_t max_mapnr_64; /* header_version 6 and later */ +} KdumpSubHeader64; + +typedef struct DataCache { + DumpState *state; /* dump state related to this data */ + uint8_t *buf; /* buffer for cached data */ + size_t buf_size; /* size of the buf */ + size_t data_size; /* size of cached data in buf */ + off_t offset; /* offset of the file */ +} DataCache; + +typedef struct QEMU_PACKED PageDescriptor { + uint64_t offset; /* the offset of the page data*/ + uint32_t size; /* the size of this dump page */ + uint32_t flags; /* flags */ + uint64_t page_flags; /* page flags */ +} PageDescriptor; + +typedef struct DumpState { + GuestPhysBlockList guest_phys_blocks; + ArchDumpInfo dump_info; + MemoryMappingList list; + bool resume; + bool detached; + bool kdump_raw; + hwaddr memory_offset; + int fd; + + /* + * Dump filter area variables + * + * A filtered dump only contains the guest memory designated by + * the start 
address and length variables defined below. + * + * If length is 0, no filtering is applied. + */ + int64_t filter_area_begin; /* Start address of partial guest memory area */ + int64_t filter_area_length; /* Length of partial guest memory area */ + + /* Elf dump related data */ + uint32_t phdr_num; + uint32_t shdr_num; + ssize_t note_size; + hwaddr shdr_offset; + hwaddr phdr_offset; + hwaddr section_offset; + hwaddr note_offset; + + void *elf_section_hdrs; /* Pointer to section header buffer */ + void *elf_section_data; /* Pointer to section data buffer */ + uint64_t elf_section_data_size; /* Size of section data */ + GArray *string_table_buf; /* String table data buffer */ + + uint8_t *note_buf; /* buffer for notes */ + size_t note_buf_offset; /* the writing place in note_buf */ + uint32_t nr_cpus; /* number of guest's cpu */ + uint64_t max_mapnr; /* the biggest guest's phys-mem's number */ + size_t len_dump_bitmap; /* the size of the place used to store + dump_bitmap in vmcore */ + off_t offset_dump_bitmap; /* offset of dump_bitmap part in vmcore */ + off_t offset_page; /* offset of page part in vmcore */ + size_t num_dumpable; /* number of page that can be dumped */ + uint32_t flag_compress; /* indicate the compression format */ + DumpStatus status; /* current dump status */ + + bool has_format; /* whether format is provided */ + DumpGuestMemoryFormat format; /* valid only if has_format == true */ + QemuThread dump_thread; /* thread for detached dump */ + + int64_t total_size; /* total memory size (in bytes) to + * be dumped. When filter is + * enabled, this will only count + * those to be written. */ + int64_t written_size; /* written memory size (in bytes), + * this could be used to calculate + * how much work we have + * finished. */ + uint8_t *guest_note; /* ELF note content */ + size_t guest_note_size; +} DumpState; + +uint16_t cpu_to_dump16(DumpState *s, uint16_t val); +uint32_t cpu_to_dump32(DumpState *s, uint32_t val); +uint64_t cpu_to_dump64(DumpState *s, uint64_t val); + +int64_t dump_filtered_memblock_size(GuestPhysBlock *block, int64_t filter_area_start, + int64_t filter_area_length); +int64_t dump_filtered_memblock_start(GuestPhysBlock *block, int64_t filter_area_start, + int64_t filter_area_length); +#endif diff --git a/include/system/event-loop-base.h b/include/system/event-loop-base.h new file mode 100644 index 0000000..a6c24f1 --- /dev/null +++ b/include/system/event-loop-base.h @@ -0,0 +1,40 @@ +/* + * QEMU event-loop backend + * + * Copyright (C) 2022 Red Hat Inc + * + * Authors: + * Nicolas Saenz Julienne <nsaenzju@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#ifndef QEMU_EVENT_LOOP_BASE_H +#define QEMU_EVENT_LOOP_BASE_H + +#include "qom/object.h" +#include "block/aio.h" + +#define TYPE_EVENT_LOOP_BASE "event-loop-base" +OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass, + EVENT_LOOP_BASE) + +struct EventLoopBaseClass { + ObjectClass parent_class; + + void (*init)(EventLoopBase *base, Error **errp); + void (*update_params)(EventLoopBase *base, Error **errp); + bool (*can_be_deleted)(EventLoopBase *base); +}; + +struct EventLoopBase { + Object parent; + + /* AioContext AIO engine parameters */ + int64_t aio_max_batch; + + /* AioContext thread pool parameters */ + int64_t thread_pool_min; + int64_t thread_pool_max; +}; +#endif diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h new file mode 100644 index 0000000..ab849a4 --- /dev/null +++ b/include/system/host_iommu_device.h @@ -0,0 +1,125 @@ +/* + * Host IOMMU device abstract declaration + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan <zhenzhong.duan@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HOST_IOMMU_DEVICE_H +#define HOST_IOMMU_DEVICE_H + +#include "qom/object.h" +#include "qapi/error.h" +#ifdef CONFIG_LINUX +#include "linux/iommufd.h" + +typedef union VendorCaps { + struct iommu_hw_info_vtd vtd; + struct iommu_hw_info_arm_smmuv3 smmuv3; +} VendorCaps; + +/** + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + * + * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on + * IOMMUFD this represents a user-space buffer filled by kernel + * with host IOMMU @type specific hardware information data) + */ +typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint64_t hw_caps; + VendorCaps vendor_caps; +} HostIOMMUDeviceCaps; +#endif + +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +struct HostIOMMUDevice { + Object parent_obj; + + char *name; + void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ + PCIBus *aliased_bus; + int aliased_devfn; +#ifdef CONFIG_LINUX + HostIOMMUDeviceCaps caps; +#endif +}; + +/** + * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. + * + * Different types of host devices (e.g., VFIO or VDPA device) or devices + * with different backend (e.g., VFIO legacy container or IOMMUFD backend) + * will have different implementations of the HostIOMMUDeviceClass. + */ +struct HostIOMMUDeviceClass { + ObjectClass parent_class; + + /** + * @realize: initialize host IOMMU device instance further. + * + * Mandatory callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * + * Optional callback, if not implemented, hint not supporting query + * of @cap. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @cap: capability to check. 
+ * + * @errp: pass an Error out when fails to query capability. + * + * Returns: <0 on failure, 0 if a @cap is unsupported, or else + * 1 or some positive value for some special @cap, + * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS. + */ + int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); + /** + * @get_iova_ranges: Return the list of usable iova_ranges along with + * @hiod Host IOMMU device + * + * @hiod: handle to the host IOMMU device + */ + GList* (*get_iova_ranges)(HostIOMMUDevice *hiod); + /** + * + * @get_page_size_mask: Return the page size mask supported along this + * @hiod Host IOMMU device + * + * @hiod: handle to the host IOMMU device + */ + uint64_t (*get_page_size_mask)(HostIOMMUDevice *hiod); +}; + +/* + * Host IOMMU device capability list. + */ +#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 +#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 + +#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 +#endif diff --git a/include/system/hostmem.h b/include/system/hostmem.h new file mode 100644 index 0000000..88fa791 --- /dev/null +++ b/include/system/hostmem.h @@ -0,0 +1,99 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Igor Mammedov <imammedo@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef SYSTEM_HOSTMEM_H +#define SYSTEM_HOSTMEM_H + +#include "system/numa.h" +#include "qapi/qapi-types-machine.h" +#include "qom/object.h" +#include "system/memory.h" +#include "qemu/bitmap.h" +#include "qemu/thread-context.h" + +#define TYPE_MEMORY_BACKEND "memory-backend" +OBJECT_DECLARE_TYPE(HostMemoryBackend, HostMemoryBackendClass, + MEMORY_BACKEND) + +/* hostmem-ram.c */ +/** + * @TYPE_MEMORY_BACKEND_RAM: + * name of backend that uses mmap on the anonymous RAM + */ + +#define TYPE_MEMORY_BACKEND_RAM "memory-backend-ram" + +/* hostmem-file.c */ +/** + * @TYPE_MEMORY_BACKEND_FILE: + * name of backend that uses mmap on a file descriptor + */ +#define TYPE_MEMORY_BACKEND_FILE "memory-backend-file" + +#define TYPE_MEMORY_BACKEND_MEMFD "memory-backend-memfd" + + +/** + * HostMemoryBackendClass: + * @parent_class: opaque parent class container + */ +struct HostMemoryBackendClass { + ObjectClass parent_class; + + /** + * alloc: Allocate memory from backend. + * + * @backend: the #HostMemoryBackend. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. 
+ */ + bool (*alloc)(HostMemoryBackend *backend, Error **errp); +}; + +/** + * @HostMemoryBackend + * + * @parent: opaque parent object container + * @size: amount of memory backend provides + * @mr: MemoryRegion representing host memory belonging to backend + * @prealloc_threads: number of threads to be used for preallocatining RAM + */ +struct HostMemoryBackend { + /* private */ + Object parent; + + /* protected */ + uint64_t size; + bool merge, dump, use_canonical_path; + bool prealloc, is_mapped, share, reserve; + bool guest_memfd, aligned; + uint32_t prealloc_threads; + ThreadContext *prealloc_context; + DECLARE_BITMAP(host_nodes, MAX_NODES + 1); + HostMemPolicy policy; + + MemoryRegion mr; +}; + +bool host_memory_backend_mr_inited(HostMemoryBackend *backend); +MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend); + +void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped); +bool host_memory_backend_is_mapped(HostMemoryBackend *backend); +size_t host_memory_backend_pagesize(HostMemoryBackend *memdev); +char *host_memory_backend_get_name(HostMemoryBackend *backend); + +long qemu_minrampagesize(void); +long qemu_maxrampagesize(void); + +#endif diff --git a/include/system/hvf.h b/include/system/hvf.h new file mode 100644 index 0000000..a9a502f --- /dev/null +++ b/include/system/hvf.h @@ -0,0 +1,77 @@ +/* + * QEMU Hypervisor.framework (HVF) support + * + * Copyright Google Inc., 2017 + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* header to be included in non-HVF-specific code */ + +#ifndef HVF_H +#define HVF_H + +#include "qemu/accel.h" +#include "qemu/queue.h" +#include "exec/vaddr.h" +#include "qom/object.h" +#include "exec/vaddr.h" + +#ifdef COMPILING_PER_TARGET +# ifdef CONFIG_HVF +# define CONFIG_HVF_IS_POSSIBLE +# endif /* !CONFIG_HVF */ +#else +# define CONFIG_HVF_IS_POSSIBLE +#endif /* COMPILING_PER_TARGET */ + +#ifdef CONFIG_HVF_IS_POSSIBLE +extern bool hvf_allowed; +#define hvf_enabled() (hvf_allowed) +#else /* !CONFIG_HVF_IS_POSSIBLE */ +#define hvf_enabled() 0 +#endif /* !CONFIG_HVF_IS_POSSIBLE */ + +#define TYPE_HVF_ACCEL ACCEL_CLASS_NAME("hvf") + +typedef struct HVFState HVFState; +DECLARE_INSTANCE_CHECKER(HVFState, HVF_STATE, + TYPE_HVF_ACCEL) + +#ifdef COMPILING_PER_TARGET +struct hvf_sw_breakpoint { + vaddr pc; + vaddr saved_insn; + int use_count; + QTAILQ_ENTRY(hvf_sw_breakpoint) entry; +}; + +struct hvf_sw_breakpoint *hvf_find_sw_breakpoint(CPUState *cpu, + vaddr pc); +int hvf_sw_breakpoints_active(CPUState *cpu); + +int hvf_arch_insert_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); +int hvf_arch_remove_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); +int hvf_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type); +int hvf_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type); +void hvf_arch_remove_all_hw_breakpoints(void); + +/* + * hvf_update_guest_debug: + * @cs: CPUState for the CPU to update + * + * Update guest to enable or disable debugging. Per-arch specifics will be + * handled by calling down to hvf_arch_update_guest_debug. + */ +int hvf_update_guest_debug(CPUState *cpu); +void hvf_arch_update_guest_debug(CPUState *cpu); + +/* + * Return whether the guest supports debugging. 
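Referring back to the hostmem.h declarations above, a hedged sketch (not part of either header) of how a device might claim a memory backend; error_setg() from qapi/error.h and the function name are assumptions:

static MemoryRegion *example_claim_backend(HostMemoryBackend *backend,
                                           Error **errp)
{
    if (!host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "memory backend has no RAM allocated yet");
        return NULL;
    }
    if (host_memory_backend_is_mapped(backend)) {
        error_setg(errp, "memory backend is already in use");
        return NULL;
    }
    host_memory_backend_set_mapped(backend, true);
    return host_memory_backend_get_memory(backend);
}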
+ */ +bool hvf_arch_supports_guest_debug(void); +#endif /* COMPILING_PER_TARGET */ + +#endif diff --git a/include/system/hvf_int.h b/include/system/hvf_int.h new file mode 100644 index 0000000..d774e58 --- /dev/null +++ b/include/system/hvf_int.h @@ -0,0 +1,80 @@ +/* + * QEMU Hypervisor.framework (HVF) support + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* header to be included in HVF-specific code */ + +#ifndef HVF_INT_H +#define HVF_INT_H + +#include "qemu/queue.h" + +#ifdef __aarch64__ +#include <Hypervisor/Hypervisor.h> +typedef hv_vcpu_t hvf_vcpuid; +#else +#include <Hypervisor/hv.h> +typedef hv_vcpuid_t hvf_vcpuid; +#endif + +/* hvf_slot flags */ +#define HVF_SLOT_LOG (1 << 0) + +typedef struct hvf_slot { + uint64_t start; + uint64_t size; + uint8_t *mem; + int slot_id; + uint32_t flags; + MemoryRegion *region; +} hvf_slot; + +typedef struct hvf_vcpu_caps { + uint64_t vmx_cap_pinbased; + uint64_t vmx_cap_procbased; + uint64_t vmx_cap_procbased2; + uint64_t vmx_cap_entry; + uint64_t vmx_cap_exit; + uint64_t vmx_cap_preemption_timer; +} hvf_vcpu_caps; + +struct HVFState { + AccelState parent; + + hvf_slot slots[32]; + int num_slots; + + hvf_vcpu_caps *hvf_caps; + uint64_t vtimer_offset; + QTAILQ_HEAD(, hvf_sw_breakpoint) hvf_sw_breakpoints; +}; +extern HVFState *hvf_state; + +struct AccelCPUState { + hvf_vcpuid fd; + void *exit; + bool vtimer_masked; + sigset_t unblock_ipi_mask; + bool guest_debug_enabled; + bool dirty; +}; + +void assert_hvf_ok_impl(hv_return_t ret, const char *file, unsigned int line, + const char *exp); +#define assert_hvf_ok(EX) assert_hvf_ok_impl((EX), __FILE__, __LINE__, #EX) +const char *hvf_return_string(hv_return_t ret); +int hvf_arch_init(void); +hv_return_t hvf_arch_vm_create(MachineState *ms, uint32_t pa_range); +int hvf_arch_init_vcpu(CPUState *cpu); +void hvf_arch_vcpu_destroy(CPUState *cpu); +int hvf_vcpu_exec(CPUState *); +hvf_slot *hvf_find_overlap_slot(uint64_t, uint64_t); +int hvf_put_registers(CPUState *); +int hvf_get_registers(CPUState *); +void hvf_kick_vcpu_thread(CPUState *cpu); + +#endif diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h new file mode 100644 index 0000000..380e9e6 --- /dev/null +++ b/include/system/hw_accel.h @@ -0,0 +1,25 @@ +/* + * QEMU Hardware accelerators support + * + * Copyright 2016 Google, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_HW_ACCEL_H +#define QEMU_HW_ACCEL_H + +#include "hw/core/cpu.h" +#include "system/kvm.h" +#include "system/hvf.h" +#include "system/whpx.h" +#include "system/nvmm.h" + +void cpu_synchronize_state(CPUState *cpu); +void cpu_synchronize_post_reset(CPUState *cpu); +void cpu_synchronize_post_init(CPUState *cpu); +void cpu_synchronize_pre_loadvm(CPUState *cpu); + +#endif /* QEMU_HW_ACCEL_H */ diff --git a/include/system/iommufd.h b/include/system/iommufd.h new file mode 100644 index 0000000..283861b --- /dev/null +++ b/include/system/iommufd.h @@ -0,0 +1,120 @@ +/* + * iommufd container backend declaration + * + * Copyright (C) 2024 Intel Corporation. + * Copyright Red Hat, Inc. 
2024 + * + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> + * Zhenzhong Duan <zhenzhong.duan@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSTEM_IOMMUFD_H +#define SYSTEM_IOMMUFD_H + +#include "qom/object.h" +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" +#include "system/host_iommu_device.h" + +#define TYPE_IOMMUFD_BACKEND "iommufd" +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) + +struct IOMMUFDBackendClass { + ObjectClass parent_class; +}; + +struct IOMMUFDBackend { + Object parent; + + /*< protected >*/ + int fd; /* /dev/iommu file descriptor */ + bool owned; /* is the /dev/iommu opened internally */ + uint32_t users; + + /*< public >*/ +}; + +bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +void iommufd_backend_disconnect(IOMMUFDBackend *be); + +bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp); + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Overload of the host IOMMU device for the iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t hwpt_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. 
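For illustration (not part of iommufd.h), a minimal sketch of bringing up an IOMMUFD backend, allocating an IOAS and mapping one range; the helper name and error message are placeholders:

static bool example_iommufd_setup(IOMMUFDBackend *be, void *hostptr,
                                  hwaddr iova, ram_addr_t size, Error **errp)
{
    uint32_t ioas_id;

    if (!iommufd_backend_connect(be, errp)) {
        return false;
    }
    if (!iommufd_backend_alloc_ioas(be, &ioas_id, errp)) {
        iommufd_backend_disconnect(be);
        return false;
    }
    if (iommufd_backend_map_dma(be, ioas_id, iova, size, hostptr,
                                false /* readonly */) < 0) {
        error_setg(errp, "failed to map the IOVA range");
        iommufd_backend_free_id(be, ioas_id);
        iommufd_backend_disconnect(be);
        return false;
    }
    return true;
}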
+ */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); +#endif diff --git a/include/system/ioport.h b/include/system/ioport.h new file mode 100644 index 0000000..780ea5a --- /dev/null +++ b/include/system/ioport.h @@ -0,0 +1,75 @@ +/* + * defines ioport related functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/************************************************************************** + * IO ports API + */ + +#ifndef SYSTEM_IOPORT_H +#define SYSTEM_IOPORT_H + +#include "system/memory.h" + +#define MAX_IOPORTS (64 * 1024) +#define IOPORTS_MASK (MAX_IOPORTS - 1) + +typedef struct MemoryRegionPortio { + uint32_t offset; + uint32_t len; + unsigned size; + uint32_t (*read)(void *opaque, uint32_t address); + void (*write)(void *opaque, uint32_t address, uint32_t data); +} MemoryRegionPortio; + +#define PORTIO_END_OF_LIST() { } + +extern const MemoryRegionOps unassigned_io_ops; + +void cpu_outb(uint32_t addr, uint8_t val); +void cpu_outw(uint32_t addr, uint16_t val); +void cpu_outl(uint32_t addr, uint32_t val); +uint8_t cpu_inb(uint32_t addr); +uint16_t cpu_inw(uint32_t addr); +uint32_t cpu_inl(uint32_t addr); + +typedef struct PortioList { + const struct MemoryRegionPortio *ports; + Object *owner; + struct MemoryRegion *address_space; + uint32_t addr; + unsigned nr; + struct MemoryRegion **regions; + void *opaque; + const char *name; + bool flush_coalesced_mmio; +} PortioList; + +void portio_list_init(PortioList *piolist, Object *owner, + const struct MemoryRegionPortio *callbacks, + void *opaque, const char *name); +void portio_list_set_flush_coalesced(PortioList *piolist); +void portio_list_destroy(PortioList *piolist); +void portio_list_add(PortioList *piolist, + struct MemoryRegion *address_space, + uint32_t addr); +void portio_list_del(PortioList *piolist); +void portio_list_set_enabled(PortioList *piolist, bool enabled); +void portio_list_set_address(PortioList *piolist, uint32_t addr); + +#endif /* IOPORT_H */ diff --git a/include/system/iothread.h b/include/system/iothread.h new file mode 100644 index 0000000..d95c17a --- /dev/null +++ b/include/system/iothread.h @@ -0,0 +1,67 @@ +/* + * Event loop thread + * + * Copyright Red Hat Inc., 2013 + * + * Authors: + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
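Looking back at the ioport.h helpers above, a hedged sketch (not part of that header) of registering a small port window with a PortioList; the handlers, the base address 0x3f8 and the region name are illustrative:

static uint32_t example_ioport_read(void *opaque, uint32_t addr);
static void example_ioport_write(void *opaque, uint32_t addr, uint32_t data);

static const MemoryRegionPortio example_portio[] = {
    { .offset = 0, .len = 8, .size = 1,
      .read = example_ioport_read, .write = example_ioport_write },
    PORTIO_END_OF_LIST(),
};

static void example_register_ports(Object *owner, MemoryRegion *isa_io,
                                   void *opaque, PortioList *piolist)
{
    portio_list_init(piolist, owner, example_portio, opaque, "example");
    portio_list_add(piolist, isa_io, 0x3f8);
}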
+ * + */ + +#ifndef IOTHREAD_H +#define IOTHREAD_H + +#include "block/aio.h" +#include "qemu/thread.h" +#include "qom/object.h" +#include "system/event-loop-base.h" + +#define TYPE_IOTHREAD "iothread" + +struct IOThread { + EventLoopBase parent_obj; + + QemuThread thread; + AioContext *ctx; + bool run_gcontext; /* whether we should run gcontext */ + GMainContext *worker_context; + GMainLoop *main_loop; + QemuSemaphore init_done_sem; /* is thread init done? */ + bool stopping; /* has iothread_stop() been called? */ + bool running; /* should iothread_run() continue? */ + int thread_id; + + /* AioContext poll parameters */ + int64_t poll_max_ns; + int64_t poll_grow; + int64_t poll_shrink; +}; +typedef struct IOThread IOThread; + +DECLARE_INSTANCE_CHECKER(IOThread, IOTHREAD, + TYPE_IOTHREAD) + +char *iothread_get_id(IOThread *iothread); +IOThread *iothread_by_id(const char *id); +AioContext *iothread_get_aio_context(IOThread *iothread); +GMainContext *iothread_get_g_main_context(IOThread *iothread); + +/* + * Helpers used to allocate iothreads for internal use. These + * iothreads will not be seen by monitor clients when query using + * "query-iothreads". + */ +IOThread *iothread_create(const char *id, Error **errp); +void iothread_stop(IOThread *iothread); +void iothread_destroy(IOThread *iothread); + +/* + * Returns true if executing within IOThread context, + * false otherwise. + */ +bool qemu_in_iothread(void); + +#endif /* IOTHREAD_H */ diff --git a/include/system/kvm.h b/include/system/kvm.h new file mode 100644 index 0000000..7cc60d2 --- /dev/null +++ b/include/system/kvm.h @@ -0,0 +1,589 @@ +/* + * QEMU KVM support + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* header to be included in non-KVM-specific code */ + +#ifndef QEMU_KVM_H +#define QEMU_KVM_H + +#include "exec/memattrs.h" +#include "qemu/accel.h" +#include "qom/object.h" + +#ifdef COMPILING_PER_TARGET +# ifdef CONFIG_KVM +# include <linux/kvm.h> +# define CONFIG_KVM_IS_POSSIBLE +# endif +#else +# define CONFIG_KVM_IS_POSSIBLE +#endif + +#ifdef CONFIG_KVM_IS_POSSIBLE + +extern bool kvm_allowed; +extern bool kvm_kernel_irqchip; +extern bool kvm_split_irqchip; +extern bool kvm_async_interrupts_allowed; +extern bool kvm_halt_in_kernel_allowed; +extern bool kvm_resamplefds_allowed; +extern bool kvm_msi_via_irqfd_allowed; +extern bool kvm_gsi_routing_allowed; +extern bool kvm_gsi_direct_mapping; +extern bool kvm_readonly_mem_allowed; +extern bool kvm_msi_use_devid; +extern bool kvm_pre_fault_memory_supported; + +#define kvm_enabled() (kvm_allowed) +/** + * kvm_irqchip_in_kernel: + * + * Returns: true if an in-kernel irqchip was created. + * What this actually means is architecture and machine model + * specific: on PC, for instance, it means that the LAPIC + * is in kernel. This function should never be used from generic + * target-independent code: use one of the following functions or + * some other specific check instead. + */ +#define kvm_irqchip_in_kernel() (kvm_kernel_irqchip) + +/** + * kvm_irqchip_is_split: + * + * Returns: true if the irqchip implementation is split between + * user and kernel space. The details are architecture and + * machine specific. On PC, it means that the PIC, IOAPIC, and + * PIT are in user space while the LAPIC is in the kernel. 
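A minimal sketch (not part of iothread.h) of creating, using and tearing down an internal-use iothread with the helpers declared above; the id string is a placeholder:

static void example_run_internal_iothread(Error **errp)
{
    IOThread *iot = iothread_create("example-worker", errp);
    AioContext *ctx;

    if (!iot) {
        return;
    }
    ctx = iothread_get_aio_context(iot);
    /* ... schedule bottom halves or timers on ctx here ... */
    (void)ctx;

    iothread_stop(iot);
    iothread_destroy(iot);
}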
+ */ +#define kvm_irqchip_is_split() (kvm_split_irqchip) + +/** + * kvm_async_interrupts_enabled: + * + * Returns: true if we can deliver interrupts to KVM + * asynchronously (ie by ioctl from any thread at any time) + * rather than having to do interrupt delivery synchronously + * (where the vcpu must be stopped at a suitable point first). + */ +#define kvm_async_interrupts_enabled() (kvm_async_interrupts_allowed) + +/** + * kvm_halt_in_kernel + * + * Returns: true if halted cpus should still get a KVM_RUN ioctl to run + * inside of kernel space. This only works if MP state is implemented. + */ +#define kvm_halt_in_kernel() (kvm_halt_in_kernel_allowed) + +/** + * kvm_irqfds_enabled: + * + * Returns: true if we can use irqfds to inject interrupts into + * a KVM CPU (ie the kernel supports irqfds and we are running + * with a configuration where it is meaningful to use them). + * + * Always available if running with in-kernel irqchip. + */ +#define kvm_irqfds_enabled() kvm_irqchip_in_kernel() + +/** + * kvm_resamplefds_enabled: + * + * Returns: true if we can use resamplefds to inject interrupts into + * a KVM CPU (ie the kernel supports resamplefds and we are running + * with a configuration where it is meaningful to use them). + */ +#define kvm_resamplefds_enabled() (kvm_resamplefds_allowed) + +/** + * kvm_msi_via_irqfd_enabled: + * + * Returns: true if we can route a PCI MSI (Message Signaled Interrupt) + * to a KVM CPU via an irqfd. This requires that the kernel supports + * this and that we're running in a configuration that permits it. + */ +#define kvm_msi_via_irqfd_enabled() (kvm_msi_via_irqfd_allowed) + +/** + * kvm_gsi_routing_enabled: + * + * Returns: true if GSI routing is enabled (ie the kernel supports + * it and we're running in a configuration that permits it). + */ +#define kvm_gsi_routing_enabled() (kvm_gsi_routing_allowed) + +/** + * kvm_gsi_direct_mapping: + * + * Returns: true if GSI direct mapping is enabled. + */ +#define kvm_gsi_direct_mapping() (kvm_gsi_direct_mapping) + +/** + * kvm_readonly_mem_enabled: + * + * Returns: true if KVM readonly memory is enabled (ie the kernel + * supports it and we're running in a configuration that permits it). + */ +#define kvm_readonly_mem_enabled() (kvm_readonly_mem_allowed) + +/** + * kvm_msi_devid_required: + * Returns: true if KVM requires a device id to be provided while + * defining an MSI routing entry. 
+ */ +#define kvm_msi_devid_required() (kvm_msi_use_devid) + +#else + +#define kvm_enabled() (0) +#define kvm_irqchip_in_kernel() (false) +#define kvm_irqchip_is_split() (false) +#define kvm_async_interrupts_enabled() (false) +#define kvm_halt_in_kernel() (false) +#define kvm_irqfds_enabled() (false) +#define kvm_resamplefds_enabled() (false) +#define kvm_msi_via_irqfd_enabled() (false) +#define kvm_gsi_routing_allowed() (false) +#define kvm_gsi_direct_mapping() (false) +#define kvm_readonly_mem_enabled() (false) +#define kvm_msi_devid_required() (false) + +#endif /* CONFIG_KVM_IS_POSSIBLE */ + +struct kvm_run; +struct kvm_irq_routing_entry; + +typedef struct KVMCapabilityInfo { + const char *name; + int value; +} KVMCapabilityInfo; + +#define KVM_CAP_INFO(CAP) { "KVM_CAP_" stringify(CAP), KVM_CAP_##CAP } +#define KVM_CAP_LAST_INFO { NULL, 0 } + +struct KVMState; + +#define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm") +typedef struct KVMState KVMState; +DECLARE_INSTANCE_CHECKER(KVMState, KVM_STATE, + TYPE_KVM_ACCEL) + +extern KVMState *kvm_state; +typedef struct Notifier Notifier; + +typedef struct KVMRouteChange { + KVMState *s; + int changes; +} KVMRouteChange; + +/* external API */ + +unsigned int kvm_get_max_memslots(void); +unsigned int kvm_get_free_memslots(void); +bool kvm_has_sync_mmu(void); +int kvm_has_vcpu_events(void); +int kvm_max_nested_state_length(void); +int kvm_has_gsi_routing(void); + +/** + * kvm_arm_supports_user_irq + * + * Not all KVM implementations support notifications for kernel generated + * interrupt events to user space. This function indicates whether the current + * KVM implementation does support them. + * + * Returns: true if KVM supports using kernel generated IRQs from user space + */ +bool kvm_arm_supports_user_irq(void); + + +int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); +int kvm_on_sigbus(int code, void *addr); + +int kvm_check_extension(KVMState *s, unsigned int extension); + +int kvm_vm_ioctl(KVMState *s, unsigned long type, ...); + +void kvm_flush_coalesced_mmio_buffer(void); + +#ifdef COMPILING_PER_TARGET +#include "cpu.h" + +/** + * kvm_update_guest_debug(): ensure KVM debug structures updated + * @cs: the CPUState for this cpu + * @reinject_trap: KVM trap injection control + * + * There are usually per-arch specifics which will be handled by + * calling down to kvm_arch_update_guest_debug after the generic + * fields have been set. 
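+ *
+ * Example (illustrative sketch, assumed usage): after changing breakpoints
+ * or single-step state, a caller refreshes the in-kernel debug state with
+ *
+ *     if (kvm_update_guest_debug(cs, 0) < 0) {
+ *         /* the vCPU could not be put into the requested debug state */
+ *     }
+ *
+ * passing 0 when there is no trap to reinject.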
+ */ +#ifdef TARGET_KVM_HAVE_GUEST_DEBUG +int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); +#else +static inline int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) +{ + return -EINVAL; +} +#endif + +/* internal API */ + +int kvm_ioctl(KVMState *s, unsigned long type, ...); + +int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...); + +/** + * kvm_device_ioctl - call an ioctl on a kvm device + * @fd: The KVM device file descriptor as returned from KVM_CREATE_DEVICE + * @type: The device-ctrl ioctl number + * + * Returns: -errno on error, nonnegative on success + */ +int kvm_device_ioctl(int fd, unsigned long type, ...); + +/** + * kvm_vm_check_attr - check for existence of a specific vm attribute + * @s: The KVMState pointer + * @group: the group + * @attr: the attribute of that group to query for + * + * Returns: 1 if the attribute exists + * 0 if the attribute either does not exist or if the vm device + * interface is unavailable + */ +int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr); + +/** + * kvm_device_check_attr - check for existence of a specific device attribute + * @fd: The device file descriptor + * @group: the group + * @attr: the attribute of that group to query for + * + * Returns: 1 if the attribute exists + * 0 if the attribute either does not exist or if the vm device + * interface is unavailable + */ +int kvm_device_check_attr(int fd, uint32_t group, uint64_t attr); + +/** + * kvm_device_access - set or get value of a specific device attribute + * @fd: The device file descriptor + * @group: the group + * @attr: the attribute of that group to set or get + * @val: pointer to a storage area for the value + * @write: true for set and false for get operation + * @errp: error object handle + * + * Returns: 0 on success + * < 0 on error + * Use kvm_device_check_attr() in order to check for the availability + * of optional attributes. + */ +int kvm_device_access(int fd, int group, uint64_t attr, + void *val, bool write, Error **errp); + +/** + * kvm_create_device - create a KVM device for the device control API + * @KVMState: The KVMState pointer + * @type: The KVM device type (see Documentation/virtual/kvm/devices in the + * kernel source) + * @test: If true, only test if device can be created, but don't actually + * create the device. + * + * Returns: -errno on error, nonnegative on success: @test ? 0 : device fd; + */ +int kvm_create_device(KVMState *s, uint64_t type, bool test); + +/** + * kvm_device_supported - probe whether KVM supports specific device + * + * @vmfd: The fd handler for VM + * @type: type of device + * + * @return: true if supported, otherwise false. + */ +bool kvm_device_supported(int vmfd, uint64_t type); + +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. 
+ * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + +/** + * kvm_create_and_park_vcpu - Create and park a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be created and parked. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_and_park_vcpu(CPUState *cpu); + +/* Arch specific hooks */ + +extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; + +void kvm_arch_accel_class_init(ObjectClass *oc); + +void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run); +MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run); + +int kvm_arch_handle_exit(CPUState *cpu, struct kvm_run *run); + +int kvm_arch_process_async_events(CPUState *cpu); + +int kvm_arch_get_registers(CPUState *cpu, Error **errp); + +/* state subset only touched by the VCPU itself during runtime */ +#define KVM_PUT_RUNTIME_STATE 1 +/* state subset modified during VCPU reset */ +#define KVM_PUT_RESET_STATE 2 +/* full state set, modified during initialization or on vmload */ +#define KVM_PUT_FULL_STATE 3 + +int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp); + +int kvm_arch_get_default_type(MachineState *ms); + +int kvm_arch_init(MachineState *ms, KVMState *s); + +int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp); +int kvm_arch_init_vcpu(CPUState *cpu); +int kvm_arch_destroy_vcpu(CPUState *cpu); + +#ifdef TARGET_KVM_HAVE_RESET_PARKED_VCPU +void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd); +#else +static inline void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd) +{ +} +#endif + +bool kvm_vcpu_id_is_valid(int vcpu_id); + +/* Returns VCPU ID to be used on KVM_CREATE_VCPU ioctl() */ +unsigned long kvm_arch_vcpu_id(CPUState *cpu); + +void kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); + +void kvm_arch_init_irq_routing(KVMState *s); + +int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data, PCIDevice *dev); + +/* Notify arch about newly added MSI routes */ +int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, + int vector, PCIDevice *dev); +/* Notify arch about released MSI routes */ +int kvm_arch_release_virq_post(int virq); + +int kvm_arch_msi_data_to_gsi(uint32_t data); + +int kvm_set_irq(KVMState *s, int irq, int level); +int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg); + +void kvm_irqchip_add_irq_route(KVMState *s, int gsi, int irqchip, int pin); + +void kvm_irqchip_add_change_notifier(Notifier *n); +void kvm_irqchip_remove_change_notifier(Notifier *n); +void kvm_irqchip_change_notify(void); + +struct kvm_guest_debug; +struct kvm_debug_exit_arch; + +struct kvm_sw_breakpoint { + vaddr pc; + vaddr saved_insn; + int use_count; + QTAILQ_ENTRY(kvm_sw_breakpoint) entry; +}; + +struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, + vaddr pc); + +int kvm_sw_breakpoints_active(CPUState *cpu); + +int kvm_arch_insert_sw_breakpoint(CPUState *cpu, + struct kvm_sw_breakpoint *bp); +int kvm_arch_remove_sw_breakpoint(CPUState *cpu, + struct kvm_sw_breakpoint *bp); +int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type); +int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type); +void kvm_arch_remove_all_hw_breakpoints(void); + +void kvm_arch_update_guest_debug(CPUState *cpu, struct 
kvm_guest_debug *dbg); + +bool kvm_arch_stop_on_emulation_error(CPUState *cpu); + +int kvm_vm_check_extension(KVMState *s, unsigned int extension); + +#define kvm_vm_enable_cap(s, capability, cap_flags, ...) \ + ({ \ + struct kvm_enable_cap cap = { \ + .cap = capability, \ + .flags = cap_flags, \ + }; \ + uint64_t args_tmp[] = { __VA_ARGS__ }; \ + size_t n = MIN(ARRAY_SIZE(args_tmp), ARRAY_SIZE(cap.args)); \ + memcpy(cap.args, args_tmp, n * sizeof(cap.args[0])); \ + kvm_vm_ioctl(s, KVM_ENABLE_CAP, &cap); \ + }) + +#define kvm_vcpu_enable_cap(cpu, capability, cap_flags, ...) \ + ({ \ + struct kvm_enable_cap cap = { \ + .cap = capability, \ + .flags = cap_flags, \ + }; \ + uint64_t args_tmp[] = { __VA_ARGS__ }; \ + size_t n = MIN(ARRAY_SIZE(args_tmp), ARRAY_SIZE(cap.args)); \ + memcpy(cap.args, args_tmp, n * sizeof(cap.args[0])); \ + kvm_vcpu_ioctl(cpu, KVM_ENABLE_CAP, &cap); \ + }) + +void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len); + +int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr, + hwaddr *phys_addr); + +#endif /* COMPILING_PER_TARGET */ + +void kvm_cpu_synchronize_state(CPUState *cpu); + +void kvm_init_cpu_signals(CPUState *cpu); + +/** + * kvm_irqchip_add_msi_route - Add MSI route for specific vector + * @c: KVMRouteChange instance. + * @vector: which vector to add. This can be either MSI/MSIX + * vector. The function will automatically detect whether + * MSI/MSIX is enabled, and fetch corresponding MSI + * message. + * @dev: Owner PCI device to add the route. If @dev is specified + * as @NULL, an empty MSI message will be inited. + * @return: virq (>=0) when success, errno (<0) when failed. + */ +int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); +int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, + PCIDevice *dev); +void kvm_irqchip_commit_routes(KVMState *s); + +static inline KVMRouteChange kvm_irqchip_begin_route_changes(KVMState *s) +{ + return (KVMRouteChange) { .s = s, .changes = 0 }; +} + +static inline void kvm_irqchip_commit_route_changes(KVMRouteChange *c) +{ + if (c->changes) { + kvm_irqchip_commit_routes(c->s); + c->changes = 0; + } +} + +int kvm_irqchip_get_virq(KVMState *s); +void kvm_irqchip_release_virq(KVMState *s, int virq); + +void kvm_add_routing_entry(KVMState *s, + struct kvm_irq_routing_entry *entry); + +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq); +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq); +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, + EventNotifier *rn, qemu_irq irq); +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq); +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi); +void kvm_init_irq_routing(KVMState *s); + +bool kvm_kernel_irqchip_allowed(void); +bool kvm_kernel_irqchip_required(void); +bool kvm_kernel_irqchip_split(void); + +/** + * kvm_arch_irqchip_create: + * @KVMState: The KVMState pointer + * + * Allow architectures to create an in-kernel irq chip themselves. + * + * Returns: < 0: error + * 0: irq chip was not created + * > 0: irq chip was created + */ +int kvm_arch_irqchip_create(KVMState *s); + +/** + * kvm_set_one_reg - set a register value in KVM via KVM_SET_ONE_REG ioctl + * @id: The register ID + * @source: The pointer to the value to be set. It must point to a variable + * of the correct type/size for the register being accessed. 
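+ *
+ * Example (illustrative sketch, assumed usage): the variable must match the
+ * width encoded in @id, e.g. for a 64-bit register
+ *
+ *     uint64_t val = 0;
+ *     int ret = kvm_set_one_reg(cs, id, &val);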
+ * + * Returns: 0 on success, or a negative errno on failure. + */ +int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source); + +/** + * kvm_get_one_reg - get a register value from KVM via KVM_GET_ONE_REG ioctl + * @id: The register ID + * @target: The pointer where the value is to be stored. It must point to a + * variable of the correct type/size for the register being accessed. + * + * Returns: 0 on success, or a negative errno on failure. + */ +int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target); + +/* Notify resamplefd for EOI of specific interrupts. */ +void kvm_resample_fd_notify(int gsi); + +bool kvm_dirty_ring_enabled(void); + +uint32_t kvm_dirty_ring_size(void); + +void kvm_mark_guest_state_protected(void); + +/** + * kvm_hwpoisoned_mem - indicate if there is any hwpoisoned page + * reported for the VM. + */ +bool kvm_hwpoisoned_mem(void); + +int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp); + +int kvm_set_memory_attributes_private(hwaddr start, uint64_t size); +int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size); + +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private); + +#endif diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h new file mode 100644 index 0000000..756a3c0 --- /dev/null +++ b/include/system/kvm_int.h @@ -0,0 +1,187 @@ +/* + * Internal definitions for a target's KVM support + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_KVM_INT_H +#define QEMU_KVM_INT_H + +#include "system/memory.h" +#include "qapi/qapi-types-common.h" +#include "qemu/accel.h" +#include "qemu/queue.h" +#include "system/kvm.h" +#include "hw/boards.h" +#include "hw/i386/topology.h" +#include "io/channel-socket.h" + +typedef struct KVMSlot +{ + hwaddr start_addr; + ram_addr_t memory_size; + void *ram; + int slot; + int flags; + int old_flags; + /* Dirty bitmap cache for the slot */ + unsigned long *dirty_bmap; + unsigned long dirty_bmap_size; + /* Cache of the address space ID */ + int as_id; + /* Cache of the offset in ram address space */ + ram_addr_t ram_start_offset; + int guest_memfd; + hwaddr guest_memfd_offset; +} KVMSlot; + +typedef struct KVMMemoryUpdate { + QSIMPLEQ_ENTRY(KVMMemoryUpdate) next; + MemoryRegionSection section; +} KVMMemoryUpdate; + +typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; + unsigned int nr_slots_used; + unsigned int nr_slots_allocated; + int as_id; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del; +} KVMMemoryListener; + +#define KVM_MSI_HASHTAB_SIZE 256 + +typedef struct KVMHostTopoInfo { + /* Number of package on the Host */ + unsigned int maxpkgs; + /* Number of cpus on the Host */ + unsigned int maxcpus; + /* Number of cpus on each different package */ + unsigned int *pkg_cpu_count; + /* Each package can have different maxticks */ + unsigned int *maxticks; +} KVMHostTopoInfo; + +struct KVMMsrEnergy { + pid_t pid; + bool enable; + char *socket_path; + QIOChannelSocket *sioc; + QemuThread msr_thr; + unsigned int guest_vcpus; + unsigned int guest_vsockets; + X86CPUTopoInfo guest_topo_info; + KVMHostTopoInfo host_topo; + const CPUArchIdList *guest_cpu_list; + uint64_t *msr_value; + uint64_t msr_unit; + uint64_t msr_limit; + uint64_t msr_info; +}; + +enum KVMDirtyRingReaperState { + KVM_DIRTY_RING_REAPER_NONE = 0, + /* The reaper is sleeping */ + KVM_DIRTY_RING_REAPER_WAIT, + /* The reaper is 
reaping for dirty pages */ + KVM_DIRTY_RING_REAPER_REAPING, +}; + +/* + * KVM reaper instance, responsible for collecting the KVM dirty bits + * via the dirty ring. + */ +struct KVMDirtyRingReaper { + /* The reaper thread */ + QemuThread reaper_thr; + volatile uint64_t reaper_iteration; /* iteration number of reaper thr */ + volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */ +}; +struct KVMState +{ + AccelState parent_obj; + /* Max number of KVM slots supported */ + int nr_slots_max; + int fd; + int vmfd; + int coalesced_mmio; + int coalesced_pio; + struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; + bool coalesced_flush_in_progress; + int vcpu_events; +#ifdef TARGET_KVM_HAVE_GUEST_DEBUG + QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; +#endif + int max_nested_state_len; + int kvm_shadow_mem; + bool kernel_irqchip_allowed; + bool kernel_irqchip_required; + OnOffAuto kernel_irqchip_split; + bool sync_mmu; + bool guest_state_protected; + uint64_t manual_dirty_log_protect; + /* + * Older POSIX says that ioctl numbers are signed int, but in + * practice they are not. (Newer POSIX doesn't specify ioctl + * at all.) Linux, glibc and *BSD all treat ioctl numbers as + * unsigned, and real-world ioctl values like KVM_GET_XSAVE have + * bit 31 set, which means that passing them via an 'int' will + * result in sign-extension when they get converted back to the + * 'unsigned long' which the ioctl() prototype uses. Luckily Linux + * always treats the argument as an unsigned 32-bit int, so any + * possible sign-extension is deliberately ignored, but for + * consistency we keep to the same type that glibc is using. + */ + unsigned long irq_set_ioctl; + unsigned int sigmask_len; + GHashTable *gsimap; +#ifdef KVM_CAP_IRQ_ROUTING + struct kvm_irq_routing *irq_routes; + int nr_allocated_irq_routes; + unsigned long *used_gsi_bitmap; + unsigned int gsi_count; +#endif + KVMMemoryListener memory_listener; + QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; + + /* For "info mtree -f" to tell if an MR is registered in KVM */ + int nr_as; + struct KVMAs { + KVMMemoryListener *ml; + AddressSpace *as; + } *as; + uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ + uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ + bool kvm_dirty_ring_with_bitmap; + uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */ + struct KVMDirtyRingReaper reaper; + struct KVMMsrEnergy msr_energy; + NotifyVmexitOption notify_vmexit; + uint32_t notify_window; + uint32_t xen_version; + uint32_t xen_caps; + uint16_t xen_gnttab_max_frames; + uint16_t xen_evtchn_max_pirq; + char *device; +}; + +void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, + AddressSpace *as, int as_id, const char *name); + +void kvm_set_max_memslot_size(hwaddr max_slot_size); + +/** + * kvm_hwpoison_page_add: + * + * Parameters: + * @ram_addr: the address in the RAM for the poisoned page + * + * Add a poisoned page to the list + * + * Return: None. + */ +void kvm_hwpoison_page_add(ram_addr_t ram_addr); +#endif diff --git a/include/system/kvm_xen.h b/include/system/kvm_xen.h new file mode 100644 index 0000000..7d0e69f --- /dev/null +++ b/include/system/kvm_xen.h @@ -0,0 +1,44 @@ +/* + * Xen HVM emulation support in KVM + * + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. 
+ * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_SYSTEM_KVM_XEN_H +#define QEMU_SYSTEM_KVM_XEN_H + +/* The KVM API uses these to indicate "no GPA" or "no GFN" */ +#define INVALID_GPA UINT64_MAX +#define INVALID_GFN UINT64_MAX + +/* QEMU plays the rĂ´le of dom0 for "interdomain" communication. */ +#define DOMID_QEMU 0 + +int kvm_xen_soft_reset(void); +uint32_t kvm_xen_get_caps(void); +void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id); +bool kvm_xen_has_vcpu_callback_vector(void); +void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type); +void kvm_xen_set_callback_asserted(void); +int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port); +uint16_t kvm_xen_get_gnttab_max_frames(void); +uint16_t kvm_xen_get_evtchn_max_pirq(void); + +#define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() & \ + KVM_XEN_HVM_CONFIG_ ## cap)) + +#define XEN_SPECIAL_AREA_ADDR 0xfeff8000UL +#define XEN_SPECIAL_AREA_SIZE 0x4000UL + +#define XEN_SPECIALPAGE_CONSOLE 0 +#define XEN_SPECIALPAGE_XENSTORE 1 + +#define XEN_SPECIAL_PFN(x) ((XEN_SPECIAL_AREA_ADDR >> TARGET_PAGE_BITS) + \ + XEN_SPECIALPAGE_##x) + +#endif /* QEMU_SYSTEM_KVM_XEN_H */ diff --git a/include/system/memory.h b/include/system/memory.h new file mode 100644 index 0000000..46248d4 --- /dev/null +++ b/include/system/memory.h @@ -0,0 +1,3267 @@ +/* + * Physical memory management API + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Avi Kivity <avi@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef SYSTEM_MEMORY_H +#define SYSTEM_MEMORY_H + +#include "exec/cpu-common.h" +#include "exec/hwaddr.h" +#include "exec/memattrs.h" +#include "exec/memop.h" +#include "exec/ramlist.h" +#include "exec/tswap.h" +#include "qemu/bswap.h" +#include "qemu/queue.h" +#include "qemu/int128.h" +#include "qemu/range.h" +#include "qemu/notify.h" +#include "qom/object.h" +#include "qemu/rcu.h" + +#define RAM_ADDR_INVALID (~(ram_addr_t)0) + +#define MAX_PHYS_ADDR_SPACE_BITS 62 +#define MAX_PHYS_ADDR (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1) + +#define TYPE_MEMORY_REGION "memory-region" +DECLARE_INSTANCE_CHECKER(MemoryRegion, MEMORY_REGION, + TYPE_MEMORY_REGION) + +#define TYPE_IOMMU_MEMORY_REGION "iommu-memory-region" +typedef struct IOMMUMemoryRegionClass IOMMUMemoryRegionClass; +DECLARE_OBJ_CHECKERS(IOMMUMemoryRegion, IOMMUMemoryRegionClass, + IOMMU_MEMORY_REGION, TYPE_IOMMU_MEMORY_REGION) + +#define TYPE_RAM_DISCARD_MANAGER "ram-discard-manager" +typedef struct RamDiscardManagerClass RamDiscardManagerClass; +typedef struct RamDiscardManager RamDiscardManager; +DECLARE_OBJ_CHECKERS(RamDiscardManager, RamDiscardManagerClass, + RAM_DISCARD_MANAGER, TYPE_RAM_DISCARD_MANAGER); + +#ifdef CONFIG_FUZZ +void fuzz_dma_read_cb(size_t addr, + size_t len, + MemoryRegion *mr); +#else +static inline void fuzz_dma_read_cb(size_t addr, + size_t len, + MemoryRegion *mr) +{ + /* Do Nothing */ +} +#endif + +/* Possible bits for global_dirty_log_{start|stop} */ + +/* Dirty tracking enabled because migration is running */ +#define GLOBAL_DIRTY_MIGRATION (1U << 0) + +/* Dirty tracking enabled because measuring dirty rate */ +#define GLOBAL_DIRTY_DIRTY_RATE (1U << 1) + +/* Dirty tracking enabled because dirty limit */ +#define GLOBAL_DIRTY_LIMIT (1U << 2) + +#define GLOBAL_DIRTY_MASK (0x7) + +extern unsigned int global_dirty_tracking; + +typedef struct MemoryRegionOps MemoryRegionOps; + +struct ReservedRegion 
{ + Range range; + unsigned type; +}; + +/** + * struct MemoryRegionSection: describes a fragment of a #MemoryRegion + * + * @mr: the region, or %NULL if empty + * @fv: the flat view of the address space the region is mapped in + * @offset_within_region: the beginning of the section, relative to @mr's start + * @size: the size of the section; will not exceed @mr's boundaries + * @offset_within_address_space: the address of the first byte of the section + * relative to the region's address space + * @readonly: writes to this section are ignored + * @nonvolatile: this section is non-volatile + * @unmergeable: this section should not get merged with adjacent sections + */ +struct MemoryRegionSection { + Int128 size; + MemoryRegion *mr; + FlatView *fv; + hwaddr offset_within_region; + hwaddr offset_within_address_space; + bool readonly; + bool nonvolatile; + bool unmergeable; +}; + +typedef struct IOMMUTLBEntry IOMMUTLBEntry; + +/* See address_space_translate: bit 0 is read, bit 1 is write. */ +typedef enum { + IOMMU_NONE = 0, + IOMMU_RO = 1, + IOMMU_WO = 2, + IOMMU_RW = 3, +} IOMMUAccessFlags; + +#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) + +struct IOMMUTLBEntry { + AddressSpace *target_as; + hwaddr iova; + hwaddr translated_addr; + hwaddr addr_mask; /* 0xfff = 4k translation */ + IOMMUAccessFlags perm; +}; + +/* + * Bitmap for different IOMMUNotifier capabilities. Each notifier can + * register with one or multiple IOMMU Notifier capability bit(s). + * + * Normally there're two use cases for the notifiers: + * + * (1) When the device needs accurate synchronizations of the vIOMMU page + * tables, it needs to register with both MAP|UNMAP notifies (which + * is defined as IOMMU_NOTIFIER_IOTLB_EVENTS below). + * + * Regarding to accurate synchronization, it's when the notified + * device maintains a shadow page table and must be notified on each + * guest MAP (page table entry creation) and UNMAP (invalidation) + * events (e.g. VFIO). Both notifications must be accurate so that + * the shadow page table is fully in sync with the guest view. + * + * (2) When the device doesn't need accurate synchronizations of the + * vIOMMU page tables, it needs to register only with UNMAP or + * DEVIOTLB_UNMAP notifies. + * + * It's when the device maintains a cache of IOMMU translations + * (IOTLB) and is able to fill that cache by requesting translations + * from the vIOMMU through a protocol similar to ATS (Address + * Translation Service). + * + * Note that in this mode the vIOMMU will not maintain a shadowed + * page table for the address space, and the UNMAP messages can cover + * more than the pages that used to get mapped. The IOMMU notifiee + * should be able to take care of over-sized invalidations. 
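+ *
+ * Example (illustrative sketch, assumed usage): a shadow-page-table consumer
+ * registers for both MAP and UNMAP events,
+ *
+ *     iommu_notifier_init(&n, my_map_notify,
+ *                         IOMMU_NOTIFIER_IOTLB_EVENTS,
+ *                         start, end, iommu_idx);
+ *
+ * while an IOTLB-caching consumer passes IOMMU_NOTIFIER_UNMAP or
+ * IOMMU_NOTIFIER_DEVIOTLB_UNMAP instead; my_map_notify stands for a
+ * hypothetical IOMMUNotify callback.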
+ */ +typedef enum { + IOMMU_NOTIFIER_NONE = 0, + /* Notify cache invalidations */ + IOMMU_NOTIFIER_UNMAP = 0x1, + /* Notify entry changes (newly created entries) */ + IOMMU_NOTIFIER_MAP = 0x2, + /* Notify changes on device IOTLB entries */ + IOMMU_NOTIFIER_DEVIOTLB_UNMAP = 0x04, +} IOMMUNotifierFlag; + +#define IOMMU_NOTIFIER_IOTLB_EVENTS (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) +#define IOMMU_NOTIFIER_DEVIOTLB_EVENTS IOMMU_NOTIFIER_DEVIOTLB_UNMAP +#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_IOTLB_EVENTS | \ + IOMMU_NOTIFIER_DEVIOTLB_EVENTS) + +struct IOMMUNotifier; +typedef void (*IOMMUNotify)(struct IOMMUNotifier *notifier, + IOMMUTLBEntry *data); + +struct IOMMUNotifier { + IOMMUNotify notify; + IOMMUNotifierFlag notifier_flags; + /* Notify for address space range start <= addr <= end */ + hwaddr start; + hwaddr end; + int iommu_idx; + void *opaque; + QLIST_ENTRY(IOMMUNotifier) node; +}; +typedef struct IOMMUNotifier IOMMUNotifier; + +typedef struct IOMMUTLBEvent { + IOMMUNotifierFlag type; + IOMMUTLBEntry entry; +} IOMMUTLBEvent; + +/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ +#define RAM_PREALLOC (1 << 0) + +/* RAM is mmap-ed with MAP_SHARED */ +#define RAM_SHARED (1 << 1) + +/* Only a portion of RAM (used_length) is actually used, and migrated. + * Resizing RAM while migrating can result in the migration being canceled. + */ +#define RAM_RESIZEABLE (1 << 2) + +/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically + * zero the page and wake waiting processes. + * (Set during postcopy) + */ +#define RAM_UF_ZEROPAGE (1 << 3) + +/* RAM can be migrated */ +#define RAM_MIGRATABLE (1 << 4) + +/* RAM is a persistent kind memory */ +#define RAM_PMEM (1 << 5) + + +/* + * UFFDIO_WRITEPROTECT is used on this RAMBlock to + * support 'write-tracking' migration type. + * Implies ram_state->ram_wt_enabled. + */ +#define RAM_UF_WRITEPROTECT (1 << 6) + +/* + * RAM is mmap-ed with MAP_NORESERVE. When set, reserving swap space (or huge + * pages if applicable) is skipped: will bail out if not supported. When not + * set, the OS will do the reservation, if supported for the memory type. + */ +#define RAM_NORESERVE (1 << 7) + +/* RAM that isn't accessible through normal means. */ +#define RAM_PROTECTED (1 << 8) + +/* RAM is an mmap-ed named file */ +#define RAM_NAMED_FILE (1 << 9) + +/* RAM is mmap-ed read-only */ +#define RAM_READONLY (1 << 10) + +/* RAM FD is opened read-only */ +#define RAM_READONLY_FD (1 << 11) + +/* RAM can be private that has kvm guest memfd backend */ +#define RAM_GUEST_MEMFD (1 << 12) + +/* + * In RAMBlock creation functions, if MAP_SHARED is 0 in the flags parameter, + * the implementation may still create a shared mapping if other conditions + * require it. Callers who specifically want a private mapping, eg objects + * specified by the user, must pass RAM_PRIVATE. + * After RAMBlock creation, MAP_SHARED in the block's flags indicates whether + * the block is shared or private, and MAP_PRIVATE is omitted. + */ +#define RAM_PRIVATE (1 << 13) + +static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, + IOMMUNotifierFlag flags, + hwaddr start, hwaddr end, + int iommu_idx) +{ + n->notify = fn; + n->notifier_flags = flags; + n->start = start; + n->end = end; + n->iommu_idx = iommu_idx; +} + +/* + * Memory region callbacks + */ +struct MemoryRegionOps { + /* Read from the memory region. @addr is relative to @mr; @size is + * in bytes. 
*/ + uint64_t (*read)(void *opaque, + hwaddr addr, + unsigned size); + /* Write to the memory region. @addr is relative to @mr; @size is + * in bytes. */ + void (*write)(void *opaque, + hwaddr addr, + uint64_t data, + unsigned size); + + MemTxResult (*read_with_attrs)(void *opaque, + hwaddr addr, + uint64_t *data, + unsigned size, + MemTxAttrs attrs); + MemTxResult (*write_with_attrs)(void *opaque, + hwaddr addr, + uint64_t data, + unsigned size, + MemTxAttrs attrs); + + enum device_endian endianness; + /* Guest-visible constraints: */ + struct { + /* If nonzero, specify bounds on access sizes beyond which a machine + * check is thrown. + */ + unsigned min_access_size; + unsigned max_access_size; + /* If true, unaligned accesses are supported. Otherwise unaligned + * accesses throw machine checks. + */ + bool unaligned; + /* + * If present, and returns #false, the transaction is not accepted + * by the device (and results in machine dependent behaviour such + * as a machine check exception). + */ + bool (*accepts)(void *opaque, hwaddr addr, + unsigned size, bool is_write, + MemTxAttrs attrs); + } valid; + /* Internal implementation constraints: */ + struct { + /* If nonzero, specifies the minimum size implemented. Smaller sizes + * will be rounded upwards and a partial result will be returned. + */ + unsigned min_access_size; + /* If nonzero, specifies the maximum size implemented. Larger sizes + * will be done as a series of accesses with smaller sizes. + */ + unsigned max_access_size; + /* If true, unaligned accesses are supported. Otherwise all accesses + * are converted to (possibly multiple) naturally aligned accesses. + */ + bool unaligned; + } impl; +}; + +typedef struct MemoryRegionClass { + /* private */ + ObjectClass parent_class; +} MemoryRegionClass; + + +enum IOMMUMemoryRegionAttr { + IOMMU_ATTR_SPAPR_TCE_FD +}; + +/* + * IOMMUMemoryRegionClass: + * + * All IOMMU implementations need to subclass TYPE_IOMMU_MEMORY_REGION + * and provide an implementation of at least the @translate method here + * to handle requests to the memory region. Other methods are optional. + * + * The IOMMU implementation must use the IOMMU notifier infrastructure + * to report whenever mappings are changed, by calling + * memory_region_notify_iommu() (or, if necessary, by calling + * memory_region_notify_iommu_one() for each registered notifier). + * + * Conceptually an IOMMU provides a mapping from input address + * to an output TLB entry. If the IOMMU is aware of memory transaction + * attributes and the output TLB entry depends on the transaction + * attributes, we represent this using IOMMU indexes. Each index + * selects a particular translation table that the IOMMU has: + * + * @attrs_to_index returns the IOMMU index for a set of transaction attributes + * + * @translate takes an input address and an IOMMU index + * + * and the mapping returned can only depend on the input address and the + * IOMMU index. + * + * Most IOMMUs don't care about the transaction attributes and support + * only a single IOMMU index. A more complex IOMMU might have one index + * for secure transactions and one for non-secure transactions. + */ +struct IOMMUMemoryRegionClass { + /* private: */ + MemoryRegionClass parent_class; + + /* public: */ + /** + * @translate: + * + * Return a TLB entry that contains a given address. + * + * The IOMMUAccessFlags indicated via @flag are optional and may + * be specified as IOMMU_NONE to indicate that the caller needs + * the full translation information for both reads and writes. 
If + * the access flags are specified then the IOMMU implementation + * may use this as an optimization, to stop doing a page table + * walk as soon as it knows that the requested permissions are not + * allowed. If IOMMU_NONE is passed then the IOMMU must do the + * full page table walk and report the permissions in the returned + * IOMMUTLBEntry. (Note that this implies that an IOMMU may not + * return different mappings for reads and writes.) + * + * The returned information remains valid while the caller is + * holding the big QEMU lock or is inside an RCU critical section; + * if the caller wishes to cache the mapping beyond that it must + * register an IOMMU notifier so it can invalidate its cached + * information when the IOMMU mapping changes. + * + * @iommu: the IOMMUMemoryRegion + * + * @hwaddr: address to be translated within the memory region + * + * @flag: requested access permission + * + * @iommu_idx: IOMMU index for the translation + */ + IOMMUTLBEntry (*translate)(IOMMUMemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag, int iommu_idx); + /** + * @get_min_page_size: + * + * Returns minimum supported page size in bytes. + * + * If this method is not provided then the minimum is assumed to + * be TARGET_PAGE_SIZE. + * + * @iommu: the IOMMUMemoryRegion + */ + uint64_t (*get_min_page_size)(IOMMUMemoryRegion *iommu); + /** + * @notify_flag_changed: + * + * Called when IOMMU Notifier flag changes (ie when the set of + * events which IOMMU users are requesting notification for changes). + * Optional method -- need not be provided if the IOMMU does not + * need to know exactly which events must be notified. + * + * @iommu: the IOMMUMemoryRegion + * + * @old_flags: events which previously needed to be notified + * + * @new_flags: events which now need to be notified + * + * Returns 0 on success, or a negative errno; in particular + * returns -EINVAL if the new flag bitmap is not supported by the + * IOMMU memory region. In case of failure, the error object + * must be created + */ + int (*notify_flag_changed)(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old_flags, + IOMMUNotifierFlag new_flags, + Error **errp); + /** + * @replay: + * + * Called to handle memory_region_iommu_replay(). + * + * The default implementation of memory_region_iommu_replay() is to + * call the IOMMU translate method for every page in the address space + * with flag == IOMMU_NONE and then call the notifier if translate + * returns a valid mapping. If this method is implemented then it + * overrides the default behaviour, and must provide the full semantics + * of memory_region_iommu_replay(), by calling @notifier for every + * translation present in the IOMMU. + * + * Optional method -- an IOMMU only needs to provide this method + * if the default is inefficient or produces undesirable side effects. + * + * Note: this is not related to record-and-replay functionality. + */ + void (*replay)(IOMMUMemoryRegion *iommu, IOMMUNotifier *notifier); + + /** + * @get_attr: + * + * Get IOMMU misc attributes. This is an optional method that + * can be used to allow users of the IOMMU to get implementation-specific + * information. The IOMMU implements this method to handle calls + * by IOMMU users to memory_region_iommu_get_attr() by filling in + * the arbitrary data pointer for any IOMMUMemoryRegionAttr values that + * the IOMMU supports. If the method is unimplemented then + * memory_region_iommu_get_attr() will always return -EINVAL. 
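+ *
+ * Example (illustrative sketch, assumed usage):
+ *
+ *     int fd;
+ *     if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
+ *                                       &fd)) {
+ *         /* fd now holds the implementation-specific attribute value */
+ *     }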
+ * + * @iommu: the IOMMUMemoryRegion + * + * @attr: attribute being queried + * + * @data: memory to fill in with the attribute data + * + * Returns 0 on success, or a negative errno; in particular + * returns -EINVAL for unrecognized or unimplemented attribute types. + */ + int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr attr, + void *data); + + /** + * @attrs_to_index: + * + * Return the IOMMU index to use for a given set of transaction attributes. + * + * Optional method: if an IOMMU only supports a single IOMMU index then + * the default implementation of memory_region_iommu_attrs_to_index() + * will return 0. + * + * The indexes supported by an IOMMU must be contiguous, starting at 0. + * + * @iommu: the IOMMUMemoryRegion + * @attrs: memory transaction attributes + */ + int (*attrs_to_index)(IOMMUMemoryRegion *iommu, MemTxAttrs attrs); + + /** + * @num_indexes: + * + * Return the number of IOMMU indexes this IOMMU supports. + * + * Optional method: if this method is not provided, then + * memory_region_iommu_num_indexes() will return 1, indicating that + * only a single IOMMU index is supported. + * + * @iommu: the IOMMUMemoryRegion + */ + int (*num_indexes)(IOMMUMemoryRegion *iommu); +}; + +typedef struct RamDiscardListener RamDiscardListener; +typedef int (*NotifyRamPopulate)(RamDiscardListener *rdl, + MemoryRegionSection *section); +typedef void (*NotifyRamDiscard)(RamDiscardListener *rdl, + MemoryRegionSection *section); + +struct RamDiscardListener { + /* + * @notify_populate: + * + * Notification that previously discarded memory is about to get populated. + * Listeners are able to object. If any listener objects, already + * successfully notified listeners are notified about a discard again. + * + * @rdl: the #RamDiscardListener getting notified + * @section: the #MemoryRegionSection to get populated. The section + * is aligned within the memory region to the minimum granularity + * unless it would exceed the registered section. + * + * Returns 0 on success. If the notification is rejected by the listener, + * an error is returned. + */ + NotifyRamPopulate notify_populate; + + /* + * @notify_discard: + * + * Notification that previously populated memory was discarded successfully + * and listeners should drop all references to such memory and prevent + * new population (e.g., unmap). + * + * @rdl: the #RamDiscardListener getting notified + * @section: the #MemoryRegionSection to get populated. The section + * is aligned within the memory region to the minimum granularity + * unless it would exceed the registered section. + */ + NotifyRamDiscard notify_discard; + + /* + * @double_discard_supported: + * + * The listener suppors getting @notify_discard notifications that span + * already discarded parts. + */ + bool double_discard_supported; + + MemoryRegionSection *section; + QLIST_ENTRY(RamDiscardListener) next; +}; + +static inline void ram_discard_listener_init(RamDiscardListener *rdl, + NotifyRamPopulate populate_fn, + NotifyRamDiscard discard_fn, + bool double_discard_supported) +{ + rdl->notify_populate = populate_fn; + rdl->notify_discard = discard_fn; + rdl->double_discard_supported = double_discard_supported; +} + +/** + * typedef ReplayRamDiscardState: + * + * The callback handler for #RamDiscardManagerClass.replay_populated/ + * #RamDiscardManagerClass.replay_discarded to invoke on populated/discarded + * parts. 
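+ *
+ * Example (illustrative sketch, assumed usage): a matching callback looks
+ * like
+ *
+ *     static int my_replay_cb(MemoryRegionSection *section, void *opaque)
+ *     {
+ *         /* act on one populated (or discarded) piece of the section */
+ *         return 0;
+ *     }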
+ * + * @section: the #MemoryRegionSection of populated/discarded part + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if failed. + */ +typedef int (*ReplayRamDiscardState)(MemoryRegionSection *section, + void *opaque); + +/* + * RamDiscardManagerClass: + * + * A #RamDiscardManager coordinates which parts of specific RAM #MemoryRegion + * regions are currently populated to be used/accessed by the VM, notifying + * after parts were discarded (freeing up memory) and before parts will be + * populated (consuming memory), to be used/accessed by the VM. + * + * A #RamDiscardManager can only be set for a RAM #MemoryRegion while the + * #MemoryRegion isn't mapped into an address space yet (either directly + * or via an alias); it cannot change while the #MemoryRegion is + * mapped into an address space. + * + * The #RamDiscardManager is intended to be used by technologies that are + * incompatible with discarding of RAM (e.g., VFIO, which may pin all + * memory inside a #MemoryRegion), and require proper coordination to only + * map the currently populated parts, to hinder parts that are expected to + * remain discarded from silently getting populated and consuming memory. + * Technologies that support discarding of RAM don't have to bother and can + * simply map the whole #MemoryRegion. + * + * An example #RamDiscardManager is virtio-mem, which logically (un)plugs + * memory within an assigned RAM #MemoryRegion, coordinated with the VM. + * Logically unplugging memory consists of discarding RAM. The VM agreed to not + * access unplugged (discarded) memory - especially via DMA. virtio-mem will + * properly coordinate with listeners before memory is plugged (populated), + * and after memory is unplugged (discarded). + * + * Listeners are called in multiples of the minimum granularity (unless it + * would exceed the registered range) and changes are aligned to the minimum + * granularity within the #MemoryRegion. Listeners have to prepare for memory + * becoming discarded in a different granularity than it was populated and the + * other way around. + */ +struct RamDiscardManagerClass { + /* private */ + InterfaceClass parent_class; + + /* public */ + + /** + * @get_min_granularity: + * + * Get the minimum granularity in which listeners will get notified + * about changes within the #MemoryRegion via the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @mr: the #MemoryRegion + * + * Returns the minimum granularity. + */ + uint64_t (*get_min_granularity)(const RamDiscardManager *rdm, + const MemoryRegion *mr); + + /** + * @is_populated: + * + * Check whether the given #MemoryRegionSection is completely populated + * (i.e., no parts are currently discarded) via the #RamDiscardManager. + * There are no alignment requirements. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * + * Returns whether the given range is completely populated. + */ + bool (*is_populated)(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + + /** + * @replay_populated: + * + * Call the #ReplayRamDiscardState callback for all populated parts within + * the #MemoryRegionSection via the #RamDiscardManager. + * + * In case any call fails, no further calls are made. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. 
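+ *
+ * Example (illustrative sketch, assumed usage): callers normally go through
+ * the ram_discard_manager_replay_populated() wrapper rather than invoking
+ * the class method directly,
+ *
+ *     ret = ram_discard_manager_replay_populated(rdm, &section,
+ *                                                my_replay_cb, opaque);
+ *
+ * with my_replay_cb being a hypothetical #ReplayRamDiscardState callback.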
+ */ + int (*replay_populated)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, void *opaque); + + /** + * @replay_discarded: + * + * Call the #ReplayRamDiscardState callback for all discarded parts within + * the #MemoryRegionSection via the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ + int (*replay_discarded)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, void *opaque); + + /** + * @register_listener: + * + * Register a #RamDiscardListener for the given #MemoryRegionSection and + * immediately notify the #RamDiscardListener about all populated parts + * within the #MemoryRegionSection via the #RamDiscardManager. + * + * In case any notification fails, no further notifications are triggered + * and an error is logged. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + * @section: the #MemoryRegionSection + */ + void (*register_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + + /** + * @unregister_listener: + * + * Unregister a previously registered #RamDiscardListener via the + * #RamDiscardManager after notifying the #RamDiscardListener about all + * populated parts becoming unpopulated within the registered + * #MemoryRegionSection. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + */ + void (*unregister_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl); +}; + +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr); + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + +/** + * ram_discard_manager_replay_populated: + * + * A wrapper to call the #RamDiscardManagerClass.replay_populated callback + * of the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, + void *opaque); + +/** + * ram_discard_manager_replay_discarded: + * + * A wrapper to call the #RamDiscardManagerClass.replay_discarded callback + * of the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ +int ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, + void *opaque); + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + +void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl); + +/** + * memory_translate_iotlb: Extract addresses from a TLB entry. + * Called with rcu_read_lock held. 
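+ *
+ * Example (illustrative sketch, assumed usage; mr, xlat and err are
+ * hypothetical locals):
+ *
+ *     RCU_READ_LOCK_GUARD();
+ *     mr = memory_translate_iotlb(iotlb, &xlat, &err);
+ *     if (!mr) {
+ *         /* translation failed, err describes why */
+ *     }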
+ * + * @iotlb: pointer to an #IOMMUTLBEntry + * @xlat_p: return the offset of the entry from the start of the returned + * MemoryRegion. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: On success, return the MemoryRegion containing the @iotlb translated + * addr. The MemoryRegion must not be accessed after rcu_read_unlock. + * On failure, return NULL, setting @errp with error. + */ +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp); + +typedef struct CoalescedMemoryRange CoalescedMemoryRange; +typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; + +/** MemoryRegion: + * + * A struct representing a memory region. + */ +struct MemoryRegion { + Object parent_obj; + + /* private: */ + + /* The following fields should fit in a cache line */ + bool romd_mode; + bool ram; + bool subpage; + bool readonly; /* For RAM regions */ + bool nonvolatile; + bool rom_device; + bool flush_coalesced_mmio; + bool unmergeable; + uint8_t dirty_log_mask; + bool is_iommu; + RAMBlock *ram_block; + Object *owner; + /* owner as TYPE_DEVICE. Used for re-entrancy checks in MR access hotpath */ + DeviceState *dev; + + const MemoryRegionOps *ops; + void *opaque; + MemoryRegion *container; + int mapped_via_alias; /* Mapped via an alias, container might be NULL */ + Int128 size; + hwaddr addr; + void (*destructor)(MemoryRegion *mr); + uint64_t align; + bool terminates; + bool ram_device; + bool enabled; + uint8_t vga_logging_count; + MemoryRegion *alias; + hwaddr alias_offset; + int32_t priority; + QTAILQ_HEAD(, MemoryRegion) subregions; + QTAILQ_ENTRY(MemoryRegion) subregions_link; + QTAILQ_HEAD(, CoalescedMemoryRange) coalesced; + const char *name; + unsigned ioeventfd_nb; + MemoryRegionIoeventfd *ioeventfds; + RamDiscardManager *rdm; /* Only for RAM */ + + /* For devices designed to perform re-entrant IO into their own IO MRs */ + bool disable_reentrancy_guard; +}; + +struct IOMMUMemoryRegion { + MemoryRegion parent_obj; + + QLIST_HEAD(, IOMMUNotifier) iommu_notify; + IOMMUNotifierFlag iommu_notify_flags; +}; + +#define IOMMU_NOTIFIER_FOREACH(n, mr) \ + QLIST_FOREACH((n), &(mr)->iommu_notify, node) + +#define MEMORY_LISTENER_PRIORITY_MIN 0 +#define MEMORY_LISTENER_PRIORITY_ACCEL 10 +#define MEMORY_LISTENER_PRIORITY_DEV_BACKEND 10 + +/** + * struct MemoryListener: callbacks structure for updates to the physical memory map + * + * Allows a component to adjust to changes in the guest-visible memory map. + * Use with memory_listener_register() and memory_listener_unregister(). + */ +struct MemoryListener { + /** + * @begin: + * + * Called at the beginning of an address space update transaction. + * Followed by calls to #MemoryListener.region_add(), + * #MemoryListener.region_del(), #MemoryListener.region_nop(), + * #MemoryListener.log_start() and #MemoryListener.log_stop() in + * increasing address order. + * + * @listener: The #MemoryListener. + */ + void (*begin)(MemoryListener *listener); + + /** + * @commit: + * + * Called at the end of an address space update transaction, + * after the last call to #MemoryListener.region_add(), + * #MemoryListener.region_del() or #MemoryListener.region_nop(), + * #MemoryListener.log_start() and #MemoryListener.log_stop(). + * + * @listener: The #MemoryListener. + */ + void (*commit)(MemoryListener *listener); + + /** + * @region_add: + * + * Called during an address space update transaction, + * for a section of the address space that is new in this address space + * space since the last transaction. 
+ * + * @listener: The #MemoryListener. + * @section: The new #MemoryRegionSection. + */ + void (*region_add)(MemoryListener *listener, MemoryRegionSection *section); + + /** + * @region_del: + * + * Called during an address space update transaction, + * for a section of the address space that has disappeared in the address + * space since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The old #MemoryRegionSection. + */ + void (*region_del)(MemoryListener *listener, MemoryRegionSection *section); + + /** + * @region_nop: + * + * Called during an address space update transaction, + * for a section of the address space that is in the same place in the address + * space as in the last transaction. + * + * @listener: The #MemoryListener. + * @section: The #MemoryRegionSection. + */ + void (*region_nop)(MemoryListener *listener, MemoryRegionSection *section); + + /** + * @log_start: + * + * Called during an address space update transaction, after + * one of #MemoryListener.region_add(), #MemoryListener.region_del() or + * #MemoryListener.region_nop(), if dirty memory logging clients have + * become active since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The #MemoryRegionSection. + * @old: A bitmap of dirty memory logging clients that were active in + * the previous transaction. + * @new: A bitmap of dirty memory logging clients that are active in + * the current transaction. + */ + void (*log_start)(MemoryListener *listener, MemoryRegionSection *section, + int old_val, int new_val); + + /** + * @log_stop: + * + * Called during an address space update transaction, after + * one of #MemoryListener.region_add(), #MemoryListener.region_del() or + * #MemoryListener.region_nop() and possibly after + * #MemoryListener.log_start(), if dirty memory logging clients have + * become inactive since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The #MemoryRegionSection. + * @old: A bitmap of dirty memory logging clients that were active in + * the previous transaction. + * @new: A bitmap of dirty memory logging clients that are active in + * the current transaction. + */ + void (*log_stop)(MemoryListener *listener, MemoryRegionSection *section, + int old_val, int new_val); + + /** + * @log_sync: + * + * Called by memory_region_snapshot_and_clear_dirty() and + * memory_global_dirty_log_sync(), before accessing QEMU's "official" + * copy of the dirty memory bitmap for a #MemoryRegionSection. + * + * @listener: The #MemoryListener. + * @section: The #MemoryRegionSection. + */ + void (*log_sync)(MemoryListener *listener, MemoryRegionSection *section); + + /** + * @log_sync_global: + * + * This is the global version of @log_sync when the listener does + * not have a way to synchronize the log with finer granularity. + * When the listener registers with @log_sync_global defined, then + * its @log_sync must be NULL. Vice versa. + * + * @listener: The #MemoryListener. + * @last_stage: The last stage to synchronize the log during migration. + * The caller should guarantee that the synchronization with true for + * @last_stage is triggered for once after all VCPUs have been stopped. + */ + void (*log_sync_global)(MemoryListener *listener, bool last_stage); + + /** + * @log_clear: + * + * Called before reading the dirty memory bitmap for a + * #MemoryRegionSection. + * + * @listener: The #MemoryListener. + * @section: The #MemoryRegionSection. 
+ */ + void (*log_clear)(MemoryListener *listener, MemoryRegionSection *section); + + /** + * @log_global_start: + * + * Called by memory_global_dirty_log_start(), which + * enables the %DIRTY_LOG_MIGRATION client on all memory regions in + * the address space. #MemoryListener.log_global_start() is also + * called when a #MemoryListener is added, if global dirty logging is + * active at that time. + * + * @listener: The #MemoryListener. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ + bool (*log_global_start)(MemoryListener *listener, Error **errp); + + /** + * @log_global_stop: + * + * Called by memory_global_dirty_log_stop(), which + * disables the %DIRTY_LOG_MIGRATION client on all memory regions in + * the address space. + * + * @listener: The #MemoryListener. + */ + void (*log_global_stop)(MemoryListener *listener); + + /** + * @log_global_after_sync: + * + * Called after reading the dirty memory bitmap + * for any #MemoryRegionSection. + * + * @listener: The #MemoryListener. + */ + void (*log_global_after_sync)(MemoryListener *listener); + + /** + * @eventfd_add: + * + * Called during an address space update transaction, + * for a section of the address space that has had a new ioeventfd + * registration since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The new #MemoryRegionSection. + * @match_data: The @match_data parameter for the new ioeventfd. + * @data: The @data parameter for the new ioeventfd. + * @e: The #EventNotifier parameter for the new ioeventfd. + */ + void (*eventfd_add)(MemoryListener *listener, MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e); + + /** + * @eventfd_del: + * + * Called during an address space update transaction, + * for a section of the address space that has dropped an ioeventfd + * registration since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The new #MemoryRegionSection. + * @match_data: The @match_data parameter for the dropped ioeventfd. + * @data: The @data parameter for the dropped ioeventfd. + * @e: The #EventNotifier parameter for the dropped ioeventfd. + */ + void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e); + + /** + * @coalesced_io_add: + * + * Called during an address space update transaction, + * for a section of the address space that has had a new coalesced + * MMIO range registration since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The new #MemoryRegionSection. + * @addr: The starting address for the coalesced MMIO range. + * @len: The length of the coalesced MMIO range. + */ + void (*coalesced_io_add)(MemoryListener *listener, MemoryRegionSection *section, + hwaddr addr, hwaddr len); + + /** + * @coalesced_io_del: + * + * Called during an address space update transaction, + * for a section of the address space that has dropped a coalesced + * MMIO range since the last transaction. + * + * @listener: The #MemoryListener. + * @section: The new #MemoryRegionSection. + * @addr: The starting address for the coalesced MMIO range. + * @len: The length of the coalesced MMIO range. + */ + void (*coalesced_io_del)(MemoryListener *listener, MemoryRegionSection *section, + hwaddr addr, hwaddr len); + /** + * @priority: + * + * Govern the order in which memory listeners are invoked. 
Lower priorities + * are invoked earlier for "add" or "start" callbacks, and later for "delete" + * or "stop" callbacks. + */ + unsigned priority; + + /** + * @name: + * + * Name of the listener. It can be used in contexts where we'd like to + * identify one memory listener with the rest. + */ + const char *name; + + /* private: */ + AddressSpace *address_space; + QTAILQ_ENTRY(MemoryListener) link; + QTAILQ_ENTRY(MemoryListener) link_as; +}; + +typedef struct AddressSpaceMapClient { + QEMUBH *bh; + QLIST_ENTRY(AddressSpaceMapClient) link; +} AddressSpaceMapClient; + +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096) + +/** + * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects + */ +struct AddressSpace { + /* private: */ + struct rcu_head rcu; + char *name; + MemoryRegion *root; + + /* Accessed via RCU. */ + struct FlatView *current_map; + + int ioeventfd_nb; + int ioeventfd_notifiers; + struct MemoryRegionIoeventfd *ioeventfds; + QTAILQ_HEAD(, MemoryListener) listeners; + QTAILQ_ENTRY(AddressSpace) address_spaces_link; + + /* + * Maximum DMA bounce buffer size used for indirect memory map requests. + * This limits the total size of bounce buffer allocations made for + * DMA requests to indirect memory regions within this AddressSpace. DMA + * requests that exceed the limit (e.g. due to overly large requested size + * or concurrent DMA requests having claimed too much buffer space) will be + * rejected and left to the caller to handle. + */ + size_t max_bounce_buffer_size; + /* Total size of bounce buffers currently allocated, atomically accessed */ + size_t bounce_buffer_size; + /* List of callbacks to invoke when buffers free up */ + QemuMutex map_client_list_lock; + QLIST_HEAD(, AddressSpaceMapClient) map_client_list; +}; + +typedef struct AddressSpaceDispatch AddressSpaceDispatch; +typedef struct FlatRange FlatRange; + +/* Flattened global view of current active memory hierarchy. Kept in sorted + * order. + */ +struct FlatView { + struct rcu_head rcu; + unsigned ref; + FlatRange *ranges; + unsigned nr; + unsigned nr_allocated; + struct AddressSpaceDispatch *dispatch; + MemoryRegion *root; +}; + +static inline FlatView *address_space_to_flatview(AddressSpace *as) +{ + return qatomic_rcu_read(&as->current_map); +} + +/** + * typedef flatview_cb: callback for flatview_for_each_range() + * + * @start: start address of the range within the FlatView + * @len: length of the range in bytes + * @mr: MemoryRegion covering this range + * @offset_in_region: offset of the first byte of the range within @mr + * @opaque: data pointer passed to flatview_for_each_range() + * + * Returns: true to stop the iteration, false to keep going. + */ +typedef bool (*flatview_cb)(Int128 start, + Int128 len, + const MemoryRegion *mr, + hwaddr offset_in_region, + void *opaque); + +/** + * flatview_for_each_range: Iterate through a FlatView + * @fv: the FlatView to iterate through + * @cb: function to call for each range + * @opaque: opaque data pointer to pass to @cb + * + * A FlatView is made up of a list of non-overlapping ranges, each of + * which is a slice of a MemoryRegion. This function iterates through + * each range in @fv, calling @cb. The callback function can terminate + * iteration early by returning 'true'. 
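+ *
+ * An illustrative sketch (the callback name, the @as variable and the use
+ * of an RCU critical section around the FlatView lookup are assumptions,
+ * not part of this header): a callback that stops at the first RAM-backed
+ * range and records where it starts.
+ *
+ *     static bool first_ram_range(Int128 start, Int128 len,
+ *                                 const MemoryRegion *mr,
+ *                                 hwaddr offset_in_region, void *opaque)
+ *     {
+ *         if (memory_region_is_ram((MemoryRegion *)mr)) {
+ *             *(Int128 *)opaque = start;
+ *             return true;
+ *         }
+ *         return false;
+ *     }
+ *
+ *     Int128 where;
+ *     flatview_for_each_range(address_space_to_flatview(as),
+ *                             first_ram_range, &where);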
+ */ +void flatview_for_each_range(FlatView *fv, flatview_cb cb, void *opaque); + +static inline bool MemoryRegionSection_eq(MemoryRegionSection *a, + MemoryRegionSection *b) +{ + return a->mr == b->mr && + a->fv == b->fv && + a->offset_within_region == b->offset_within_region && + a->offset_within_address_space == b->offset_within_address_space && + int128_eq(a->size, b->size) && + a->readonly == b->readonly && + a->nonvolatile == b->nonvolatile; +} + +/** + * memory_region_section_new_copy: Copy a memory region section + * + * Allocate memory for a new copy, copy the memory region section, and + * properly take a reference on all relevant members. + * + * @s: the #MemoryRegionSection to copy + */ +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); + +/** + * memory_region_section_free_copy: Free a copied memory region section + * + * Free a copy of a memory section created via memory_region_section_new_copy(), + * properly dropping references on all relevant members. + * + * @s: the #MemoryRegionSection to free + */ +void memory_region_section_free_copy(MemoryRegionSection *s); + +/** + * memory_region_section_intersect_range: Adjust the memory section to cover + * the intersection with the given range. + * + * @s: the #MemoryRegionSection to be adjusted + * @offset: the offset of the given range in the memory region + * @size: the size of the given range + * + * Returns false if the intersection is empty, otherwise returns true. + */ +static inline bool memory_region_section_intersect_range(MemoryRegionSection *s, + uint64_t offset, + uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + Int128 end = int128_min(int128_add(int128_make64(s->offset_within_region), + s->size), + int128_add(int128_make64(offset), + int128_make64(size))); + + if (int128_le(end, int128_make64(start))) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_sub(end, int128_make64(start)); + return true; +} + +/** + * memory_region_init: Initialize a memory region + * + * The region typically acts as a container for other memory regions. Use + * memory_region_add_subregion() to add subregions. + * + * @mr: the #MemoryRegion to be initialized + * @owner: the object that tracks the region's reference count + * @name: used for debugging; not visible to the user or ABI + * @size: size of the region; any subregions beyond this size will be clipped + */ +void memory_region_init(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size); + +/** + * memory_region_ref: Add 1 to a memory region's reference count + * + * Whenever memory regions are accessed outside the BQL, they need to be + * preserved against hot-unplug. MemoryRegions actually do not have their + * own reference count; they piggyback on a QOM object, their "owner". + * This function adds a reference to the owner. + * + * All MemoryRegions must have an owner if they can disappear, even if the + * device they belong to operates exclusively under the BQL. This is because + * the region could be returned at any time by memory_region_find, and this + * is usually under guest control. + * + * @mr: the #MemoryRegion + */ +void memory_region_ref(MemoryRegion *mr); + +/** + * memory_region_unref: Subtract 1 from a memory region's reference count + * + * Whenever memory regions are accessed outside the BQL, they need to be + * preserved against hot-unplug.
MemoryRegions actually do not have their + * own reference count; they piggyback on a QOM object, their "owner". + * This function removes a reference to the owner and possibly destroys it. + * + * @mr: the #MemoryRegion + */ +void memory_region_unref(MemoryRegion *mr); + +/** + * memory_region_init_io: Initialize an I/O memory region. + * + * Accesses into the region will cause the callbacks in @ops to be called. + * if @size is nonzero, subregions will be clipped to @size. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @ops: a structure containing read and write callbacks to be used when + * I/O is performed on the region. + * @opaque: passed to the read and write callbacks of the @ops structure. + * @name: used for debugging; not visible to the user or ABI + * @size: size of the region. + */ +void memory_region_init_io(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size); + +/** + * memory_region_init_ram_nomigrate: Initialize RAM memory region. Accesses + * into the region will modify memory + * directly. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @errp: pointer to Error*, to store an error if it happens. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_ram_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + +/** + * memory_region_init_ram_flags_nomigrate: Initialize RAM memory region. + * Accesses into the region will + * modify memory directly. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_NORESERVE, + * RAM_GUEST_MEMFD. + * @errp: pointer to Error*, to store an error if it happens. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint32_t ram_flags, + Error **errp); + +/** + * memory_region_init_resizeable_ram: Initialize memory region with resizable + * RAM. Accesses into the region will + * modify memory directly. Only an initial + * portion of this RAM is actually used. + * Changing the size while migrating + * can result in the migration being + * canceled. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: used size of the region. + * @max_size: max size of the region. + * @resized: callback to notify owner about used size change. + * @errp: pointer to Error*, to store an error if it happens. 
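+ *
+ * An illustrative sketch only (the device state @s, the region name and the
+ * resize callback are hypothetical; KiB, MiB and error_fatal come from the
+ * usual QEMU headers): create a region that starts small and grow it later
+ * with memory_region_ram_resize(), documented further below.
+ *
+ *     static void blob_resized(const char *id, uint64_t length, void *host)
+ *     {
+ *     }
+ *
+ *     memory_region_init_resizeable_ram(&s->blob, OBJECT(s), "blob",
+ *                                       64 * KiB, 16 * MiB,
+ *                                       blob_resized, &error_fatal);
+ *     ...
+ *     memory_region_ram_resize(&s->blob, 128 * KiB, &error_fatal);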
+ * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_resizeable_ram(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint64_t max_size, + void (*resized)(const char*, + uint64_t length, + void *host), + Error **errp); +#ifdef CONFIG_POSIX + +/** + * memory_region_init_ram_from_file: Initialize RAM memory region with a + * mmap-ed backend. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @align: alignment of the region base address; if 0, the default alignment + * (getpagesize()) will be used. + * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM, + * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY, + * RAM_READONLY_FD, RAM_GUEST_MEMFD + * @path: the path in which to allocate the RAM. + * @offset: offset within the file referenced by path + * @errp: pointer to Error*, to store an error if it happens. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_ram_from_file(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint64_t align, + uint32_t ram_flags, + const char *path, + ram_addr_t offset, + Error **errp); + +/** + * memory_region_init_ram_from_fd: Initialize RAM memory region with a + * mmap-ed backend. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: the name of the region. + * @size: size of the region. + * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM, + * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY, + * RAM_READONLY_FD, RAM_GUEST_MEMFD + * @fd: the fd to mmap. + * @offset: offset within the file referenced by fd + * @errp: pointer to Error*, to store an error if it happens. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_ram_from_fd(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint32_t ram_flags, + int fd, + ram_addr_t offset, + Error **errp); +#endif + +/** + * memory_region_init_ram_ptr: Initialize RAM memory region from a + * user-provided pointer. Accesses into the + * region will modify memory directly. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @ptr: memory to be mapped; must contain at least @size bytes. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. 
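+ *
+ * An illustrative sketch (the device state @s, @shm_ptr, @shm_size and
+ * SHM_BASE are hypothetical): wrap a page-aligned buffer that the caller
+ * already owns and map it into the system address space; the caller keeps
+ * ownership of the memory.
+ *
+ *     memory_region_init_ram_ptr(&s->shm, OBJECT(s), "shm",
+ *                                shm_size, shm_ptr);
+ *     memory_region_add_subregion(get_system_memory(), SHM_BASE, &s->shm);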
+ */ +void memory_region_init_ram_ptr(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + void *ptr); + +/** + * memory_region_init_ram_device_ptr: Initialize RAM device memory region from + * a user-provided pointer. + * + * A RAM device represents a mapping to a physical device, such as to a PCI + * MMIO BAR of an vfio-pci assigned device. The memory region may be mapped + * into the VM address space and access to the region will modify memory + * directly. However, the memory region should not be included in a memory + * dump (device may not be enabled/mapped at the time of the dump), and + * operations incompatible with manipulating MMIO should be avoided. Replaces + * skip_dump flag. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: the name of the region. + * @size: size of the region. + * @ptr: memory to be mapped; must contain at least @size bytes. + * + * Note that this function does not do anything to cause the data in the + * RAM memory region to be migrated; that is the responsibility of the caller. + * (For RAM device memory regions, migrating the contents rarely makes sense.) + */ +void memory_region_init_ram_device_ptr(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + void *ptr); + +/** + * memory_region_init_alias: Initialize a memory region that aliases all or a + * part of another memory region. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: used for debugging; not visible to the user or ABI + * @orig: the region to be referenced; @mr will be equivalent to + * @orig between @offset and @offset + @size - 1. + * @offset: start of the section in @orig to be referenced. + * @size: size of the region. + */ +void memory_region_init_alias(MemoryRegion *mr, + Object *owner, + const char *name, + MemoryRegion *orig, + hwaddr offset, + uint64_t size); + +/** + * memory_region_init_rom_nomigrate: Initialize a ROM memory region. + * + * This has the same effect as calling memory_region_init_ram_nomigrate() + * and then marking the resulting region read-only with + * memory_region_set_readonly(). + * + * Note that this function does not do anything to cause the data in the + * RAM side of the memory region to be migrated; that is the responsibility + * of the caller. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_rom_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + +/** + * memory_region_init_rom_device_nomigrate: Initialize a ROM memory region. + * Writes are handled via callbacks. + * + * Note that this function does not do anything to cause the data in the + * RAM side of the memory region to be migrated; that is the responsibility + * of the caller. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @ops: callbacks for write access handling (must not be NULL). + * @opaque: passed to the read and write callbacks of the @ops structure. 
+ * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp); + +/** + * memory_region_init_iommu: Initialize a memory region of a custom type + * that translates addresses + * + * An IOMMU region translates addresses and forwards accesses to a target + * memory region. + * + * The IOMMU implementation must define a subclass of TYPE_IOMMU_MEMORY_REGION. + * @_iommu_mr should be a pointer to enough memory for an instance of + * that subclass, @instance_size is the size of that subclass, and + * @mrtypename is its name. This function will initialize @_iommu_mr as an + * instance of the subclass, and its methods will then be called to handle + * accesses to the memory region. See the documentation of + * #IOMMUMemoryRegionClass for further details. + * + * @_iommu_mr: the #IOMMUMemoryRegion to be initialized + * @instance_size: the IOMMUMemoryRegion subclass instance size + * @mrtypename: the type name of the #IOMMUMemoryRegion + * @owner: the object that tracks the region's reference count + * @name: used for debugging; not visible to the user or ABI + * @size: size of the region. + */ +void memory_region_init_iommu(void *_iommu_mr, + size_t instance_size, + const char *mrtypename, + Object *owner, + const char *name, + uint64_t size); + +/** + * memory_region_init_ram - Initialize RAM memory region. Accesses into the + * region will modify memory directly. + * + * @mr: the #MemoryRegion to be initialized + * @owner: the object that tracks the region's reference count (must be + * TYPE_DEVICE or a subclass of TYPE_DEVICE, or NULL) + * @name: name of the memory region + * @size: size of the region in bytes + * @errp: pointer to Error*, to store an error if it happens. + * + * This function allocates RAM for a board model or device, and + * arranges for it to be migrated (by calling vmstate_register_ram() + * if @owner is a DeviceState, or vmstate_register_ram_global() if + * @owner is NULL). + * + * TODO: Currently we restrict @owner to being either NULL (for + * global RAM regions with no owner) or devices, so that we can + * give the RAM block a unique name for migration purposes. + * We should lift this restriction and allow arbitrary Objects. + * If you pass a non-NULL non-device @owner then we will assert. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_ram(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + +bool memory_region_init_ram_guest_memfd(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + +/** + * memory_region_init_rom: Initialize a ROM memory region. + * + * This has the same effect as calling memory_region_init_ram() + * and then marking the resulting region read-only with + * memory_region_set_readonly(). This includes arranging for the + * contents to be migrated. + * + * TODO: Currently we restrict @owner to being either NULL (for + * global RAM regions with no owner) or devices, so that we can + * give the RAM block a unique name for migration purposes. + * We should lift this restriction and allow arbitrary Objects. 
+ * If you pass a non-NULL non-device @owner then we will assert. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_rom(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + +/** + * memory_region_init_rom_device: Initialize a ROM memory region. + * Writes are handled via callbacks. + * + * This function initializes a memory region backed by RAM for reads + * and callbacks for writes, and arranges for the RAM backing to + * be migrated (by calling vmstate_register_ram() + * if @owner is a DeviceState, or vmstate_register_ram_global() if + * @owner is NULL). + * + * TODO: Currently we restrict @owner to being either NULL (for + * global RAM regions with no owner) or devices, so that we can + * give the RAM block a unique name for migration purposes. + * We should lift this restriction and allow arbitrary Objects. + * If you pass a non-NULL non-device @owner then we will assert. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @ops: callbacks for write access handling (must not be NULL). + * @opaque: passed to the read and write callbacks of the @ops structure. + * @name: Region name, becomes part of RAMBlock name used in migration stream + * must be unique within any device + * @size: size of the region. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_region_init_rom_device(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp); + + +/** + * memory_region_owner: get a memory region's owner. + * + * @mr: the memory region being queried. + */ +Object *memory_region_owner(MemoryRegion *mr); + +/** + * memory_region_size: get a memory region's size. + * + * @mr: the memory region being queried. + */ +uint64_t memory_region_size(MemoryRegion *mr); + +/** + * memory_region_is_ram: check whether a memory region is random access + * + * Returns %true if a memory region is random access. + * + * @mr: the memory region being queried + */ +static inline bool memory_region_is_ram(MemoryRegion *mr) +{ + return mr->ram; +} + +/** + * memory_region_is_ram_device: check whether a memory region is a ram device + * + * Returns %true if a memory region is a device backed ram region + * + * @mr: the memory region being queried + */ +bool memory_region_is_ram_device(MemoryRegion *mr); + +/** + * memory_region_is_romd: check whether a memory region is in ROMD mode + * + * Returns %true if a memory region is a ROM device and currently set to allow + * direct reads. + * + * @mr: the memory region being queried + */ +static inline bool memory_region_is_romd(MemoryRegion *mr) +{ + return mr->rom_device && mr->romd_mode; +} + +/** + * memory_region_is_protected: check whether a memory region is protected + * + * Returns %true if a memory region is protected RAM and cannot be accessed + * via standard mechanisms, e.g. DMA. 
+ * + * @mr: the memory region being queried + */ +bool memory_region_is_protected(MemoryRegion *mr); + +/** + * memory_region_has_guest_memfd: check whether a memory region has guest_memfd + * associated + * + * Returns %true if a memory region's ram_block has valid guest_memfd assigned. + * + * @mr: the memory region being queried + */ +bool memory_region_has_guest_memfd(MemoryRegion *mr); + +/** + * memory_region_get_iommu: check whether a memory region is an iommu + * + * Returns pointer to IOMMUMemoryRegion if a memory region is an iommu, + * otherwise NULL. + * + * @mr: the memory region being queried + */ +static inline IOMMUMemoryRegion *memory_region_get_iommu(MemoryRegion *mr) +{ + if (mr->alias) { + return memory_region_get_iommu(mr->alias); + } + if (mr->is_iommu) { + return (IOMMUMemoryRegion *) mr; + } + return NULL; +} + +/** + * memory_region_get_iommu_class_nocheck: returns iommu memory region class + * if an iommu or NULL if not + * + * Returns pointer to IOMMUMemoryRegionClass if a memory region is an iommu, + * otherwise NULL. This is fast path avoiding QOM checking, use with caution. + * + * @iommu_mr: the memory region being queried + */ +static inline IOMMUMemoryRegionClass *memory_region_get_iommu_class_nocheck( + IOMMUMemoryRegion *iommu_mr) +{ + return (IOMMUMemoryRegionClass *) (((Object *)iommu_mr)->class); +} + +#define memory_region_is_iommu(mr) (memory_region_get_iommu(mr) != NULL) + +/** + * memory_region_iommu_get_min_page_size: get minimum supported page size + * for an iommu + * + * Returns minimum supported page size for an iommu. + * + * @iommu_mr: the memory region being queried + */ +uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr); + +/** + * memory_region_notify_iommu: notify a change in an IOMMU translation entry. + * + * Note: for any IOMMU implementation, an in-place mapping change + * should be notified with an UNMAP followed by a MAP. + * + * @iommu_mr: the memory region that was changed + * @iommu_idx: the IOMMU index for the translation table which has changed + * @event: TLB event with the new entry in the IOMMU translation table. + * The entry replaces all old entries for the same virtual I/O address + * range. + */ +void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, + const IOMMUTLBEvent event); + +/** + * memory_region_notify_iommu_one: notify a change in an IOMMU translation + * entry to a single notifier + * + * This works just like memory_region_notify_iommu(), but it only + * notifies a specific notifier, not all of them. + * + * @notifier: the notifier to be notified + * @event: TLB event with the new entry in the IOMMU translation table. + * The entry replaces all old entries for the same virtual I/O address + * range. + */ +void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + const IOMMUTLBEvent *event); + +/** + * memory_region_unmap_iommu_notifier_range: notify a unmap for an IOMMU + * translation that covers the + * range of a notifier + * + * @notifier: the notifier to be notified + */ +void memory_region_unmap_iommu_notifier_range(IOMMUNotifier *notifier); + + +/** + * memory_region_register_iommu_notifier: register a notifier for changes to + * IOMMU translation entries. + * + * Returns 0 on success, or a negative errno otherwise. In particular, + * -EINVAL indicates that at least one of the attributes of the notifier + * is not supported (flag/range) by the IOMMU memory region. In case of error + * the error object must be created. 
+ * + * @mr: the memory region to observe + * @n: the IOMMUNotifier to be added; the notify callback receives a + * pointer to an #IOMMUTLBEntry as the opaque value; the pointer + * ceases to be valid on exit from the notifier. + * @errp: pointer to Error*, to store an error if it happens. + */ +int memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n, Error **errp); + +/** + * memory_region_iommu_replay: replay existing IOMMU translations to + * a notifier with the minimum page granularity returned by + * mr->iommu_ops->get_page_size(). + * + * Note: this is not related to record-and-replay functionality. + * + * @iommu_mr: the memory region to observe + * @n: the notifier to which to replay iommu mappings + */ +void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n); + +/** + * memory_region_unregister_iommu_notifier: unregister a notifier for + * changes to IOMMU translation entries. + * + * @mr: the memory region which was observed and for which notify_stopped() + * needs to be called + * @n: the notifier to be removed. + */ +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n); + +/** + * memory_region_iommu_get_attr: return an IOMMU attr if get_attr() is + * defined on the IOMMU. + * + * Returns 0 on success, or a negative errno otherwise. In particular, + * -EINVAL indicates that the IOMMU does not support the requested + * attribute. + * + * @iommu_mr: the memory region + * @attr: the requested attribute + * @data: a pointer to the requested attribute data + */ +int memory_region_iommu_get_attr(IOMMUMemoryRegion *iommu_mr, + enum IOMMUMemoryRegionAttr attr, + void *data); + +/** + * memory_region_iommu_attrs_to_index: return the IOMMU index to + * use for translations with the given memory transaction attributes. + * + * @iommu_mr: the memory region + * @attrs: the memory transaction attributes + */ +int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, + MemTxAttrs attrs); + +/** + * memory_region_iommu_num_indexes: return the total number of IOMMU + * indexes that this IOMMU supports. + * + * @iommu_mr: the memory region + */ +int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); + +/** + * memory_region_name: get a memory region's name + * + * Returns the string that was used to initialize the memory region. + * + * @mr: the memory region being queried + */ +const char *memory_region_name(const MemoryRegion *mr); + +/** + * memory_region_is_logging: return whether a memory region is logging writes + * + * Returns %true if the memory region is logging writes for the given client + * + * @mr: the memory region being queried + * @client: the client being queried + */ +bool memory_region_is_logging(MemoryRegion *mr, uint8_t client); + +/** + * memory_region_get_dirty_log_mask: return the clients for which a + * memory region is logging writes. + * + * Returns a bitmap of clients, in which the DIRTY_MEMORY_* constants + * are the bit indices. + * + * @mr: the memory region being queried + */ +uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr); + +/** + * memory_region_is_rom: check whether a memory region is ROM + * + * Returns %true if a memory region is read-only memory. + * + * @mr: the memory region being queried + */ +static inline bool memory_region_is_rom(MemoryRegion *mr) +{ + return mr->ram && mr->readonly; +} + +/** + * memory_region_is_nonvolatile: check whether a memory region is non-volatile + * + * Returns %true if a memory region is non-volatile memory.
+ * + * @mr: the memory region being queried + */ +static inline bool memory_region_is_nonvolatile(MemoryRegion *mr) +{ + return mr->nonvolatile; +} + +/** + * memory_region_get_fd: Get a file descriptor backing a RAM memory region. + * + * Returns a file descriptor backing a file-based RAM memory region, + * or -1 if the region is not a file-based RAM memory region. + * + * @mr: the RAM or alias memory region being queried. + */ +int memory_region_get_fd(MemoryRegion *mr); + +/** + * memory_region_from_host: Convert a pointer into a RAM memory region + * and an offset within it. + * + * Given a host pointer inside a RAM memory region (created with + * memory_region_init_ram() or memory_region_init_ram_ptr()), return + * the MemoryRegion and the offset within it. + * + * Use with care; by the time this function returns, the returned pointer is + * not protected by RCU anymore. If the caller is not within an RCU critical + * section and does not hold the BQL, it must have other means of + * protecting the pointer, such as a reference to the region that includes + * the incoming ram_addr_t. + * + * @ptr: the host pointer to be converted + * @offset: the offset within memory region + */ +MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset); + +/** + * memory_region_get_ram_ptr: Get a pointer into a RAM memory region. + * + * Returns a host pointer to a RAM memory region (created with + * memory_region_init_ram() or memory_region_init_ram_ptr()). + * + * Use with care; by the time this function returns, the returned pointer is + * not protected by RCU anymore. If the caller is not within an RCU critical + * section and does not hold the BQL, it must have other means of + * protecting the pointer, such as a reference to the region that includes + * the incoming ram_addr_t. + * + * @mr: the memory region being queried. + */ +void *memory_region_get_ram_ptr(MemoryRegion *mr); + +/* memory_region_ram_resize: Resize a RAM region. + * + * Resizing RAM while migrating can result in the migration being canceled. + * Care has to be taken if the guest might have already detected the memory. + * + * @mr: a memory region created with @memory_region_init_resizeable_ram. + * @newsize: the new size the region + * @errp: pointer to Error*, to store an error if it happens. + */ +void memory_region_ram_resize(MemoryRegion *mr, ram_addr_t newsize, + Error **errp); + +/** + * memory_region_msync: Synchronize selected address range of + * a memory mapped region + * + * @mr: the memory region to be msync + * @addr: the initial address of the range to be sync + * @size: the size of the range to be sync + */ +void memory_region_msync(MemoryRegion *mr, hwaddr addr, hwaddr size); + +/** + * memory_region_writeback: Trigger cache writeback for + * selected address range + * + * @mr: the memory region to be updated + * @addr: the initial address of the range to be written back + * @size: the size of the range to be written back + */ +void memory_region_writeback(MemoryRegion *mr, hwaddr addr, hwaddr size); + +/** + * memory_region_set_log: Turn dirty logging on or off for a region. + * + * Turns dirty logging on or off for a specified client (display, migration). + * Only meaningful for RAM regions. + * + * @mr: the memory region being updated. + * @log: whether dirty logging is to be enabled or disabled. + * @client: the user of the logging information; %DIRTY_MEMORY_VGA only. 
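+ *
+ * For example (illustrative; @s->vram is a hypothetical video RAM region),
+ * a display device would typically enable VGA dirty tracking with:
+ *
+ *     memory_region_set_log(&s->vram, true, DIRTY_MEMORY_VGA);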
+ */ +void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client); + +/** + * memory_region_set_dirty: Mark a range of bytes as dirty in a memory region. + * + * Marks a range of bytes as dirty, after it has been dirtied outside + * guest code. + * + * @mr: the memory region being dirtied. + * @addr: the address (relative to the start of the region) being dirtied. + * @size: size of the range being dirtied. + */ +void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, + hwaddr size); + +/** + * memory_region_clear_dirty_bitmap - clear dirty bitmap for memory range + * + * This function is called when the caller wants to clear the remote + * dirty bitmap of a memory range within the memory region. This can + * be used by e.g. KVM to manually clear dirty log when + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is declared support by the host + * kernel. + * + * @mr: the memory region to clear the dirty log upon + * @start: start address offset within the memory region + * @len: length of the memory region to clear dirty bitmap + */ +void memory_region_clear_dirty_bitmap(MemoryRegion *mr, hwaddr start, + hwaddr len); + +/** + * memory_region_snapshot_and_clear_dirty: Get a snapshot of the dirty + * bitmap and clear it. + * + * Creates a snapshot of the dirty bitmap, clears the dirty bitmap and + * returns the snapshot. The snapshot can then be used to query dirty + * status, using memory_region_snapshot_get_dirty. Snapshotting allows + * querying the same page multiple times, which is especially useful for + * display updates where the scanlines often are not page aligned. + * + * The dirty bitmap region which gets copied into the snapshot (and + * cleared afterwards) can be larger than requested. The boundaries + * are rounded up/down so complete bitmap longs (covering 64 pages on + * 64bit hosts) can be copied over into the bitmap snapshot. Which + * isn't a problem for display updates as the extra pages are outside + * the visible area, and in case the visible area changes a full + * display redraw is due anyway. Should other use cases for this + * function emerge we might have to revisit this implementation + * detail. + * + * Use g_free to release DirtyBitmapSnapshot. + * + * @mr: the memory region being queried. + * @addr: the address (relative to the start of the region) being queried. + * @size: the size of the range being queried. + * @client: the user of the logging information; typically %DIRTY_MEMORY_VGA. + */ +DirtyBitmapSnapshot *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr, + hwaddr addr, + hwaddr size, + unsigned client); + +/** + * memory_region_snapshot_get_dirty: Check whether a range of bytes is dirty + * in the specified dirty bitmap snapshot. + * + * @mr: the memory region being queried. + * @snap: the dirty bitmap snapshot + * @addr: the address (relative to the start of the region) being queried. + * @size: the size of the range being queried. + */ +bool memory_region_snapshot_get_dirty(MemoryRegion *mr, + DirtyBitmapSnapshot *snap, + hwaddr addr, hwaddr size); + +/** + * memory_region_reset_dirty: Mark a range of pages as clean, for a specified + * client. + * + * Marks a range of pages as no longer dirty. + * + * @mr: the region being updated. + * @addr: the start of the subrange being cleaned. + * @size: the size of the subrange being cleaned. + * @client: the user of the logging information; %DIRTY_MEMORY_MIGRATION or + * %DIRTY_MEMORY_VGA. 
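+ *
+ * For display updates the snapshot API above is usually more convenient
+ * than clearing bits by hand. An illustrative scan loop (@s->vram,
+ * @height, @stride and redraw_scanline() are hypothetical):
+ *
+ *     DirtyBitmapSnapshot *snap;
+ *     int y;
+ *
+ *     snap = memory_region_snapshot_and_clear_dirty(&s->vram, 0,
+ *                                                   memory_region_size(&s->vram),
+ *                                                   DIRTY_MEMORY_VGA);
+ *     for (y = 0; y < height; y++) {
+ *         if (memory_region_snapshot_get_dirty(&s->vram, snap,
+ *                                              y * stride, stride)) {
+ *             redraw_scanline(y);
+ *         }
+ *     }
+ *     g_free(snap);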
+ */ +void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr, + hwaddr size, unsigned client); + +/** + * memory_region_flush_rom_device: Mark a range of pages dirty and invalidate + * TBs (for self-modifying code). + * + * The MemoryRegionOps->write() callback of a ROM device must use this function + * to mark byte ranges that have been modified internally, such as by directly + * accessing the memory returned by memory_region_get_ram_ptr(). + * + * This function marks the range dirty and invalidates TBs so that TCG can + * detect self-modifying code. + * + * @mr: the region being flushed. + * @addr: the start, relative to the start of the region, of the range being + * flushed. + * @size: the size, in bytes, of the range being flushed. + */ +void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size); + +/** + * memory_region_set_readonly: Turn a memory region read-only (or read-write) + * + * Allows a memory region to be marked as read-only (turning it into a ROM). + * Only useful on RAM regions. + * + * @mr: the region being updated. + * @readonly: whether the region is to be ROM or RAM. + */ +void memory_region_set_readonly(MemoryRegion *mr, bool readonly); + +/** + * memory_region_set_nonvolatile: Turn a memory region non-volatile + * + * Allows a memory region to be marked as non-volatile. + * Only useful on RAM regions. + * + * @mr: the region being updated. + * @nonvolatile: whether the region is to be non-volatile. + */ +void memory_region_set_nonvolatile(MemoryRegion *mr, bool nonvolatile); + +/** + * memory_region_rom_device_set_romd: enable/disable ROMD mode + * + * Allows a ROM device (initialized with memory_region_init_rom_device()) to be + * set to ROMD mode (default) or MMIO mode. When it is in ROMD mode, the + * device is mapped to guest memory and satisfies read access directly. + * When in MMIO mode, reads are forwarded to the #MemoryRegion.read function. + * Writes are always handled by the #MemoryRegion.write function. + * + * @mr: the memory region to be updated + * @romd_mode: %true to put the region into ROMD mode + */ +void memory_region_rom_device_set_romd(MemoryRegion *mr, bool romd_mode); + +/** + * memory_region_set_coalescing: Enable memory coalescing for the region. + * + * Enables writes to a region to be queued for later processing. MMIO ->write + * callbacks may be delayed until a non-coalesced MMIO is issued. + * Only useful for IO regions. Roughly similar to write-combining hardware. + * + * @mr: the memory region to be write coalesced + */ +void memory_region_set_coalescing(MemoryRegion *mr); + +/** + * memory_region_add_coalescing: Enable memory coalescing for a sub-range of + * a region. + * + * Like memory_region_set_coalescing(), but works on a sub-range of a region. + * Multiple calls can be issued to coalesce disjoint ranges. + * + * @mr: the memory region to be updated. + * @offset: the start of the range within the region to be coalesced. + * @size: the size of the subrange to be coalesced. + */ +void memory_region_add_coalescing(MemoryRegion *mr, + hwaddr offset, + uint64_t size); + +/** + * memory_region_clear_coalescing: Disable MMIO coalescing for the region. + * + * Disables any coalescing caused by memory_region_set_coalescing() or + * memory_region_add_coalescing(). Roughly equivalent to uncacheable memory + * hardware. + * + * @mr: the memory region to be updated.
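+ *
+ * Putting the coalescing calls above together, an illustrative sketch
+ * (the device state @s, foo_mmio_ops and the offsets are hypothetical):
+ *
+ *     memory_region_init_io(&s->mmio, OBJECT(s), &foo_mmio_ops, s,
+ *                           "foo-mmio", 0x1000);
+ *     memory_region_add_coalescing(&s->mmio, 0x100, 0x100);
+ *     ...
+ *     memory_region_clear_coalescing(&s->mmio);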
+ */ +void memory_region_clear_coalescing(MemoryRegion *mr); + +/** + * memory_region_set_flush_coalesced: Enforce memory coalescing flush before + * accesses. + * + * Ensure that pending coalesced MMIO request are flushed before the memory + * region is accessed. This property is automatically enabled for all regions + * passed to memory_region_set_coalescing() and memory_region_add_coalescing(). + * + * @mr: the memory region to be updated. + */ +void memory_region_set_flush_coalesced(MemoryRegion *mr); + +/** + * memory_region_clear_flush_coalesced: Disable memory coalescing flush before + * accesses. + * + * Clear the automatic coalesced MMIO flushing enabled via + * memory_region_set_flush_coalesced. Note that this service has no effect on + * memory regions that have MMIO coalescing enabled for themselves. For them, + * automatic flushing will stop once coalescing is disabled. + * + * @mr: the memory region to be updated. + */ +void memory_region_clear_flush_coalesced(MemoryRegion *mr); + +/** + * memory_region_add_eventfd: Request an eventfd to be triggered when a word + * is written to a location. + * + * Marks a word in an IO region (initialized with memory_region_init_io()) + * as a trigger for an eventfd event. The I/O callback will not be called. + * The caller must be prepared to handle failure (that is, take the required + * action if the callback _is_ called). + * + * @mr: the memory region being updated. + * @addr: the address within @mr that is to be monitored + * @size: the size of the access to trigger the eventfd + * @match_data: whether to match against @data, instead of just @addr + * @data: the data to match against the guest write + * @e: event notifier to be triggered when @addr, @size, and @data all match. + **/ +void memory_region_add_eventfd(MemoryRegion *mr, + hwaddr addr, + unsigned size, + bool match_data, + uint64_t data, + EventNotifier *e); + +/** + * memory_region_del_eventfd: Cancel an eventfd. + * + * Cancels an eventfd trigger requested by a previous + * memory_region_add_eventfd() call. + * + * @mr: the memory region being updated. + * @addr: the address within @mr that is to be monitored + * @size: the size of the access to trigger the eventfd + * @match_data: whether to match against @data, instead of just @addr + * @data: the data to match against the guest write + * @e: event notifier to be triggered when @addr, @size, and @data all match. + */ +void memory_region_del_eventfd(MemoryRegion *mr, + hwaddr addr, + unsigned size, + bool match_data, + uint64_t data, + EventNotifier *e); + +/** + * memory_region_add_subregion: Add a subregion to a container. + * + * Adds a subregion at @offset. The subregion may not overlap with other + * subregions (except for those explicitly marked as overlapping). A region + * may only be added once as a subregion (unless removed with + * memory_region_del_subregion()); use memory_region_init_alias() if you + * want a region to be a subregion in multiple locations. + * + * @mr: the region to contain the new subregion; must be a container + * initialized with memory_region_init(). + * @offset: the offset relative to @mr where @subregion is added. + * @subregion: the subregion to be added. + */ +void memory_region_add_subregion(MemoryRegion *mr, + hwaddr offset, + MemoryRegion *subregion); +/** + * memory_region_add_subregion_overlap: Add a subregion to a container + * with overlap. + * + * Adds a subregion at @offset. The subregion may overlap with other + * subregions. 
Conflicts are resolved by having a higher @priority hide a + * lower @priority. Subregions without priority are taken as @priority 0. + * A region may only be added once as a subregion (unless removed with + * memory_region_del_subregion()); use memory_region_init_alias() if you + * want a region to be a subregion in multiple locations. + * + * @mr: the region to contain the new subregion; must be a container + * initialized with memory_region_init(). + * @offset: the offset relative to @mr where @subregion is added. + * @subregion: the subregion to be added. + * @priority: used for resolving overlaps; highest priority wins. + */ +void memory_region_add_subregion_overlap(MemoryRegion *mr, + hwaddr offset, + MemoryRegion *subregion, + int priority); + +/** + * memory_region_get_ram_addr: Get the ram address associated with a memory + * region + * + * @mr: the region to be queried + */ +ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr); + +uint64_t memory_region_get_alignment(const MemoryRegion *mr); +/** + * memory_region_del_subregion: Remove a subregion. + * + * Removes a subregion from its container. + * + * @mr: the container to be updated. + * @subregion: the region being removed; must be a current subregion of @mr. + */ +void memory_region_del_subregion(MemoryRegion *mr, + MemoryRegion *subregion); + +/* + * memory_region_set_enabled: dynamically enable or disable a region + * + * Enables or disables a memory region. A disabled memory region + * ignores all accesses to itself and its subregions. It does not + * obscure sibling subregions with lower priority - it simply behaves as + * if it was removed from the hierarchy. + * + * Regions default to being enabled. + * + * @mr: the region to be updated + * @enabled: whether to enable or disable the region + */ +void memory_region_set_enabled(MemoryRegion *mr, bool enabled); + +/* + * memory_region_set_address: dynamically update the address of a region + * + * Dynamically updates the address of a region, relative to its container. + * May be used on regions that are currently part of a memory hierarchy. + * + * @mr: the region to be updated + * @addr: new address, relative to container region + */ +void memory_region_set_address(MemoryRegion *mr, hwaddr addr); + +/* + * memory_region_set_size: dynamically update the size of a region. + * + * Dynamically updates the size of a region. + * + * @mr: the region to be updated + * @size: used size of the region. + */ +void memory_region_set_size(MemoryRegion *mr, uint64_t size); + +/* + * memory_region_set_alias_offset: dynamically update a memory alias's offset + * + * Dynamically updates the offset into the target region that an alias points + * to, as if the fourth argument to memory_region_init_alias() has changed. + * + * @mr: the #MemoryRegion to be updated; should be an alias. + * @offset: the new offset into the target memory region + */ +void memory_region_set_alias_offset(MemoryRegion *mr, + hwaddr offset); + +/* + * memory_region_set_unmergeable: Set a memory region unmergeable + * + * Mark a memory region unmergeable, resulting in the memory region (or + * everything contained in a memory region container) not getting merged when + * simplifying the address space and notifying memory listeners. Consequently, + * memory listeners will never get notified about ranges that are larger than + * the original memory regions.
+ * + * This is primarily useful when multiple aliases to a RAM memory region are + * mapped into a memory region container, and updates (e.g., enable/disable or + * map/unmap) of individual memory region aliases are not supposed to affect + * other memory regions in the same container. + * + * @mr: the #MemoryRegion to be updated + * @unmergeable: whether to mark the #MemoryRegion unmergeable + */ +void memory_region_set_unmergeable(MemoryRegion *mr, bool unmergeable); + +/** + * memory_region_present: checks if an address relative to a @container + * translates into #MemoryRegion within @container + * + * Answer whether a #MemoryRegion within @container covers the address + * @addr. + * + * @container: a #MemoryRegion within which @addr is a relative address + * @addr: the area within @container to be searched + */ +bool memory_region_present(MemoryRegion *container, hwaddr addr); + +/** + * memory_region_is_mapped: returns true if #MemoryRegion is mapped + * into another memory region, which does not necessarily imply that it is + * mapped into an address space. + * + * @mr: a #MemoryRegion which should be checked if it's mapped + */ +bool memory_region_is_mapped(MemoryRegion *mr); + +/** + * memory_region_get_ram_discard_manager: get the #RamDiscardManager for a + * #MemoryRegion + * + * The #RamDiscardManager cannot change while a memory region is mapped. + * + * @mr: the #MemoryRegion + */ +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr); + +/** + * memory_region_has_ram_discard_manager: check whether a #MemoryRegion has a + * #RamDiscardManager assigned + * + * @mr: the #MemoryRegion + */ +static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) +{ + return !!memory_region_get_ram_discard_manager(mr); +} + +/** + * memory_region_set_ram_discard_manager: set the #RamDiscardManager for a + * #MemoryRegion + * + * This function must not be called for a mapped #MemoryRegion, a #MemoryRegion + * that does not cover RAM, or a #MemoryRegion that already has a + * #RamDiscardManager assigned. Return 0 if the rdm is set successfully. + * + * @mr: the #MemoryRegion + * @rdm: #RamDiscardManager to set + */ +int memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm); + +/** + * memory_region_find: translate an address/size relative to a + * MemoryRegion into a #MemoryRegionSection. + * + * Locates the first #MemoryRegion within @mr that overlaps the range + * given by @addr and @size. + * + * Returns a #MemoryRegionSection that describes a contiguous overlap. + * It will have the following characteristics: + * - @size = 0 iff no overlap was found + * - @mr is non-%NULL iff an overlap was found + * + * Remember that in the return value the @offset_within_region is + * relative to the returned region (in the .@mr field), not to the + * @mr argument. + * + * Similarly, the .@offset_within_address_space is relative to the + * address space that contains both regions, the passed and the + * returned one. 
However, in the special case where the @mr argument + * has no container (and thus is the root of the address space), the + * following will hold: + * - @offset_within_address_space >= @addr + * - @offset_within_address_space + .@size <= @addr + @size + * + * @mr: a MemoryRegion within which @addr is a relative address + * @addr: start of the area within @as to be searched + * @size: size of the area to be searched + */ +MemoryRegionSection memory_region_find(MemoryRegion *mr, + hwaddr addr, uint64_t size); + +/** + * memory_global_dirty_log_sync: synchronize the dirty log for all memory + * + * Synchronizes the dirty page log for all address spaces. + * + * @last_stage: whether this is the last stage of live migration + */ +void memory_global_dirty_log_sync(bool last_stage); + +/** + * memory_global_after_dirty_log_sync: synchronize the dirty log for all memory + * + * Synchronizes the vCPUs with a thread that is reading the dirty bitmap. + * This function must be called after the dirty log bitmap is cleared, and + * before dirty guest memory pages are read. If you are using + * #DirtyBitmapSnapshot, memory_region_snapshot_and_clear_dirty() takes + * care of doing this. + */ +void memory_global_after_dirty_log_sync(void); + +/** + * memory_region_transaction_begin: Start a transaction. + * + * During a transaction, changes will be accumulated and made visible + * only when the transaction ends (is committed). + */ +void memory_region_transaction_begin(void); + +/** + * memory_region_transaction_commit: Commit a transaction and make changes + * visible to the guest. + */ +void memory_region_transaction_commit(void); + +/** + * memory_listener_register: register callbacks to be called when memory + * sections are mapped or unmapped into an address + * space + * + * @listener: an object containing the callbacks to be called + * @filter: if non-%NULL, only regions in this address space will be observed + */ +void memory_listener_register(MemoryListener *listener, AddressSpace *filter); + +/** + * memory_listener_unregister: undo the effect of memory_listener_register() + * + * @listener: an object containing the callbacks to be removed + */ +void memory_listener_unregister(MemoryListener *listener); + +/** + * memory_global_dirty_log_start: begin dirty logging for all regions + * + * @flags: purpose of starting dirty log, migration or dirty rate + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: true on success, else false setting @errp with error. + */ +bool memory_global_dirty_log_start(unsigned int flags, Error **errp); + +/** + * memory_global_dirty_log_stop: end dirty logging for all regions + * + * @flags: purpose of stopping dirty log, migration or dirty rate + */ +void memory_global_dirty_log_stop(unsigned int flags); + +void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled); + +bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, + unsigned size, bool is_write, + MemTxAttrs attrs); + +/** + * memory_region_dispatch_read: perform a read directly to the specified + * MemoryRegion. 
+ * + * @mr: #MemoryRegion to access + * @addr: address within that region + * @pval: pointer to uint64_t which the data is written to + * @op: size, sign, and endianness of the memory operation + * @attrs: memory transaction attributes to use for the access + */ +MemTxResult memory_region_dispatch_read(MemoryRegion *mr, + hwaddr addr, + uint64_t *pval, + MemOp op, + MemTxAttrs attrs); +/** + * memory_region_dispatch_write: perform a write directly to the specified + * MemoryRegion. + * + * @mr: #MemoryRegion to access + * @addr: address within that region + * @data: data to write + * @op: size, sign, and endianness of the memory operation + * @attrs: memory transaction attributes to use for the access + */ +MemTxResult memory_region_dispatch_write(MemoryRegion *mr, + hwaddr addr, + uint64_t data, + MemOp op, + MemTxAttrs attrs); + +/** + * address_space_init: initializes an address space + * + * @as: an uninitialized #AddressSpace + * @root: a #MemoryRegion that routes addresses for the address space + * @name: an address space name. The name is only used for debugging + * output. + */ +void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name); + +/** + * address_space_destroy: destroy an address space + * + * Releases all resources associated with an address space. After an address space + * is destroyed, its root memory region (given by address_space_init()) may be destroyed + * as well. + * + * @as: address space to be destroyed + */ +void address_space_destroy(AddressSpace *as); + +/** + * address_space_remove_listeners: unregister all listeners of an address space + * + * Removes all callbacks previously registered with memory_listener_register() + * for @as. + * + * @as: an initialized #AddressSpace + */ +void address_space_remove_listeners(AddressSpace *as); + +/** + * address_space_rw: read from or write to an address space. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @attrs: memory transaction attributes + * @buf: buffer with the data transferred + * @len: the number of bytes to read or write + * @is_write: indicates the transfer direction + */ +MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, void *buf, + hwaddr len, bool is_write); + +/** + * address_space_write: write to address space. + * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @attrs: memory transaction attributes + * @buf: buffer with the data transferred + * @len: the number of bytes to write + */ +MemTxResult address_space_write(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const void *buf, hwaddr len); + +/** + * address_space_write_rom: write to address space, including ROM. + * + * This function writes to the specified address space, but will + * write data to both ROM and RAM. This is used for non-guest + * writes like writes from the gdb debug stub or initial loading + * of ROM contents. + * + * Note that portions of the write which attempt to write data to + * a device will be silently ignored -- only real RAM and ROM will + * be written to. 
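+ *
+ * For instance (illustrative; @addr, @insn_buf and @insn_len are
+ * hypothetical, and address_space_memory is the usual system address
+ * space), a debug stub patching an instruction that may live in ROM
+ * would use:
+ *
+ *     address_space_write_rom(&address_space_memory, addr,
+ *                             MEMTXATTRS_UNSPECIFIED, insn_buf, insn_len);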
+ * + * Return a MemTxResult indicating whether the operation succeeded + * or failed (eg unassigned memory, device rejected the transaction, + * IOMMU fault). + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @attrs: memory transaction attributes + * @buf: buffer with the data transferred + * @len: the number of bytes to write + */ +MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const void *buf, hwaddr len); + +/* address_space_ld*: load from an address space + * address_space_st*: store to an address space + * + * These functions perform a load or store of the byte, word, + * longword or quad to the specified address within the AddressSpace. + * The _le suffixed functions treat the data as little endian; + * _be indicates big endian; no suffix indicates "same endianness + * as guest CPU". + * + * The "guest CPU endianness" accessors are deprecated for use outside + * target-* code; devices should be CPU-agnostic and use either the LE + * or the BE accessors. + * + * @as #AddressSpace to be accessed + * @addr: address within that address space + * @val: data value, for stores + * @attrs: memory transaction attributes + * @result: location to write the success/failure of the transaction; + * if NULL, this information is discarded + */ + +#define SUFFIX +#define ARG1 as +#define ARG1_DECL AddressSpace *as +#include "exec/memory_ldst.h.inc" + +static inline void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val) +{ + address_space_stl_notdirty(as, addr, val, + MEMTXATTRS_UNSPECIFIED, NULL); +} + +#define SUFFIX +#define ARG1 as +#define ARG1_DECL AddressSpace *as +#include "exec/memory_ldst_phys.h.inc" + +struct MemoryRegionCache { + uint8_t *ptr; + hwaddr xlat; + hwaddr len; + FlatView *fv; + MemoryRegionSection mrs; + bool is_write; +}; + +/* address_space_ld*_cached: load from a cached #MemoryRegion + * address_space_st*_cached: store into a cached #MemoryRegion + * + * These functions perform a load or store of the byte, word, + * longword or quad to the specified address. The address is + * a physical address in the AddressSpace, but it must lie within + * a #MemoryRegion that was mapped with address_space_cache_init. + * + * The _le suffixed functions treat the data as little endian; + * _be indicates big endian; no suffix indicates "same endianness + * as guest CPU". + * + * The "guest CPU endianness" accessors are deprecated for use outside + * target-* code; devices should be CPU-agnostic and use either the LE + * or the BE accessors. + * + * @cache: previously initialized #MemoryRegionCache to be accessed + * @addr: address within the address space + * @val: data value, for stores + * @attrs: memory transaction attributes + * @result: location to write the success/failure of the transaction; + * if NULL, this information is discarded + */ + +#define SUFFIX _cached_slow +#define ARG1 cache +#define ARG1_DECL MemoryRegionCache *cache +#include "exec/memory_ldst.h.inc" + +/* Inline fast path for direct RAM access. 
*/ +static inline uint8_t address_space_ldub_cached(MemoryRegionCache *cache, + hwaddr addr, MemTxAttrs attrs, MemTxResult *result) +{ + assert(addr < cache->len); + if (likely(cache->ptr)) { + return ldub_p(cache->ptr + addr); + } else { + return address_space_ldub_cached_slow(cache, addr, attrs, result); + } +} + +static inline void address_space_stb_cached(MemoryRegionCache *cache, + hwaddr addr, uint8_t val, MemTxAttrs attrs, MemTxResult *result) +{ + assert(addr < cache->len); + if (likely(cache->ptr)) { + stb_p(cache->ptr + addr, val); + } else { + address_space_stb_cached_slow(cache, addr, val, attrs, result); + } +} + +#define ENDIANNESS +#include "exec/memory_ldst_cached.h.inc" + +#define ENDIANNESS _le +#include "exec/memory_ldst_cached.h.inc" + +#define ENDIANNESS _be +#include "exec/memory_ldst_cached.h.inc" + +#define SUFFIX _cached +#define ARG1 cache +#define ARG1_DECL MemoryRegionCache *cache +#include "exec/memory_ldst_phys.h.inc" + +/* address_space_cache_init: prepare for repeated access to a physical + * memory region + * + * @cache: #MemoryRegionCache to be filled + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @len: length of buffer + * @is_write: indicates the transfer direction + * + * Will only work with RAM, and may map a subset of the requested range by + * returning a value that is less than @len. On failure, return a negative + * errno value. + * + * Because it only works with RAM, this function can be used for + * read-modify-write operations. In this case, is_write should be %true. + * + * Note that addresses passed to the address_space_*_cached functions + * are relative to @addr. + */ +int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpace *as, + hwaddr addr, + hwaddr len, + bool is_write); + +/** + * address_space_cache_init_empty: Initialize empty #MemoryRegionCache + * + * @cache: The #MemoryRegionCache to operate on. + * + * Initializes #MemoryRegionCache structure without memory region attached. + * Cache initialized this way can only be safely destroyed, but not used. + */ +static inline void address_space_cache_init_empty(MemoryRegionCache *cache) +{ + cache->mrs.mr = NULL; + /* There is no real need to initialize fv, but it makes Coverity happy. */ + cache->fv = NULL; +} + +/** + * address_space_cache_invalidate: complete a write to a #MemoryRegionCache + * + * @cache: The #MemoryRegionCache to operate on. + * @addr: The first physical address that was written, relative to the + * address that was passed to @address_space_cache_init. + * @access_len: The number of bytes that were written starting at @addr. + */ +void address_space_cache_invalidate(MemoryRegionCache *cache, + hwaddr addr, + hwaddr access_len); + +/** + * address_space_cache_destroy: free a #MemoryRegionCache + * + * @cache: The #MemoryRegionCache whose memory should be released. + */ +void address_space_cache_destroy(MemoryRegionCache *cache); + +/* address_space_get_iotlb_entry: translate an address into an IOTLB + * entry. Should be called from an RCU critical section. + */ +IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, + bool is_write, MemTxAttrs attrs); + +/* address_space_translate: translate an address range into an address space + * into a MemoryRegion and an address range into that section. Should be + * called from an RCU critical section, to avoid that the last reference + * to the returned region disappears after address_space_translate returns. 
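A usage sketch for the cache API above (illustrative only, not part of the header): initialize a cache over a small guest-RAM window, read a little-endian 16-bit value through it, then tear it down. address_space_lduw_le_cached() comes from the memory_ldst_cached template included above; calling address_space_cache_init_empty() first keeps the error path safe to destroy.

static int example_cached_read_u16(AddressSpace *as, hwaddr gpa, uint16_t *out)
{
    MemoryRegionCache cache;
    int64_t len;

    address_space_cache_init_empty(&cache);
    len = address_space_cache_init(&cache, as, gpa, sizeof(*out), false);
    if (len < (int64_t)sizeof(*out)) {
        /* failed outright, or mapped less than we need */
        address_space_cache_destroy(&cache);
        return -1;
    }
    *out = address_space_lduw_le_cached(&cache, 0, MEMTXATTRS_UNSPECIFIED, NULL);
    address_space_cache_destroy(&cache);
    return 0;
}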
+ * + * @fv: #FlatView to be accessed + * @addr: address within that address space + * @xlat: pointer to address within the returned memory region section's + * #MemoryRegion. + * @len: pointer to length + * @is_write: indicates the transfer direction + * @attrs: memory attributes + */ +MemoryRegion *flatview_translate(FlatView *fv, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write, + MemTxAttrs attrs); + +static inline MemoryRegion *address_space_translate(AddressSpace *as, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write, + MemTxAttrs attrs) +{ + return flatview_translate(address_space_to_flatview(as), + addr, xlat, len, is_write, attrs); +} + +/* address_space_access_valid: check for validity of accessing an address + * space range + * + * Check whether memory is assigned to the given address space range, and + * access is permitted by any IOMMU regions that are active for the address + * space. + * + * For now, addr and len should be aligned to a page size. This limitation + * will be lifted in the future. + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @len: length of the area to be checked + * @is_write: indicates the transfer direction + * @attrs: memory attributes + */ +bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len, + bool is_write, MemTxAttrs attrs); + +/* address_space_map: map a physical memory region into a host virtual address + * + * May map a subset of the requested range, given by and returned in @plen. + * May return %NULL and set *@plen to zero(0), if resources needed to perform + * the mapping are exhausted. + * Use only for reads OR writes - not for read-modify-write operations. + * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @plen: pointer to length of buffer; updated on return + * @is_write: indicates the transfer direction + * @attrs: memory attributes + */ +void *address_space_map(AddressSpace *as, hwaddr addr, + hwaddr *plen, bool is_write, MemTxAttrs attrs); + +/* address_space_unmap: Unmaps a memory region previously mapped by address_space_map() + * + * Will also mark the memory as dirty if @is_write == %true. @access_len gives + * the amount of memory that was actually read or written by the caller. + * + * @as: #AddressSpace used + * @buffer: host pointer as returned by address_space_map() + * @len: buffer length as returned by address_space_map() + * @access_len: amount of data actually transferred + * @is_write: indicates the transfer direction + */ +void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, + bool is_write, hwaddr access_len); + +/* + * address_space_register_map_client: Register a callback to invoke when + * resources for address_space_map() are available again. + * + * address_space_map may fail when there are not enough resources available, + * such as when bounce buffer memory would exceed the limit. The callback can + * be used to retry the address_space_map operation. Note that the callback + * gets automatically removed after firing. + * + * @as: #AddressSpace to be accessed + * @bh: callback to invoke when address_space_map() retry is appropriate + */ +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh); + +/* + * address_space_unregister_map_client: Unregister a callback that has + * previously been registered and not fired yet. 
+ *
+ * @as: #AddressSpace to be accessed
+ * @bh: callback to unregister
+ */
+void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh);
+
+/* Internal functions, part of the implementation of address_space_read. */
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, void *buf, hwaddr len);
+MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
+                                   MemTxAttrs attrs, void *buf,
+                                   hwaddr len, hwaddr addr1, hwaddr l,
+                                   MemoryRegion *mr);
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
+
+/* Internal functions, part of the implementation of address_space_read_cached
+ * and address_space_write_cached. */
+MemTxResult address_space_read_cached_slow(MemoryRegionCache *cache,
+                                           hwaddr addr, void *buf, hwaddr len);
+MemTxResult address_space_write_cached_slow(MemoryRegionCache *cache,
+                                            hwaddr addr, const void *buf,
+                                            hwaddr len);
+
+int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr);
+bool prepare_mmio_access(MemoryRegion *mr);
+
+static inline bool memory_region_supports_direct_access(MemoryRegion *mr)
+{
+    /* ROM DEVICE regions only allow direct access if in ROMD mode. */
+    if (memory_region_is_romd(mr)) {
+        return true;
+    }
+    if (!memory_region_is_ram(mr)) {
+        return false;
+    }
+    /*
+     * RAM DEVICE regions can be accessed directly using memcpy, but it might
+     * be MMIO and access using memcpy can be wrong (e.g., using instructions
+     * not intended for MMIO access). So we treat this as IO.
+     */
+    return !memory_region_is_ram_device(mr);
+}
+
+static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write,
+                                           MemTxAttrs attrs)
+{
+    if (!memory_region_supports_direct_access(mr)) {
+        return false;
+    }
+    /* Debug access can write to ROM. */
+    if (is_write && !attrs.debug) {
+        return !mr->readonly && !mr->rom_device;
+    }
+    return true;
+}
+
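A usage sketch for the map/unmap pair above (illustrative only, not part of the header): map a guest buffer for a read-only copy and fall back to address_space_rw() when direct mapping fails or is truncated.

static void example_copy_from_guest(AddressSpace *as, hwaddr gpa,
                                    void *dst, hwaddr len)
{
    hwaddr plen = len;
    void *p = address_space_map(as, gpa, &plen, false, MEMTXATTRS_UNSPECIFIED);

    if (p && plen == len) {
        memcpy(dst, p, len);
        address_space_unmap(as, p, plen, false, plen);
    } else {
        if (p) {
            /* mapped too little to be useful; nothing was consumed */
            address_space_unmap(as, p, plen, false, 0);
        }
        address_space_rw(as, gpa, MEMTXATTRS_UNSPECIFIED, dst, len, false);
    }
}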
+/**
+ * address_space_read: read from an address space.
+ *
+ * Return a MemTxResult indicating whether the operation succeeded
+ * or failed (eg unassigned memory, device rejected the transaction,
+ * IOMMU fault). Called within RCU critical section.
+ *
+ * @as: #AddressSpace to be accessed
+ * @addr: address within that address space
+ * @attrs: memory transaction attributes
+ * @buf: buffer with the data transferred
+ * @len: length of the data transferred
+ */
+static inline __attribute__((__always_inline__))
+MemTxResult address_space_read(AddressSpace *as, hwaddr addr,
+                               MemTxAttrs attrs, void *buf,
+                               hwaddr len)
+{
+    MemTxResult result = MEMTX_OK;
+    hwaddr l, addr1;
+    void *ptr;
+    MemoryRegion *mr;
+    FlatView *fv;
+
+    if (__builtin_constant_p(len)) {
+        if (len) {
+            RCU_READ_LOCK_GUARD();
+            fv = address_space_to_flatview(as);
+            l = len;
+            mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
+            if (len == l && memory_access_is_direct(mr, false, attrs)) {
+                ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+                memcpy(buf, ptr, len);
+            } else {
+                result = flatview_read_continue(fv, addr, attrs, buf, len,
+                                                addr1, l, mr);
+            }
+        }
+    } else {
+        result = address_space_read_full(as, addr, attrs, buf, len);
+    }
+    return result;
+}
+
+/**
+ * address_space_read_cached: read from a cached RAM region
+ *
+ * @cache: Cached region to be addressed
+ * @addr: address relative to the base of the RAM region
+ * @buf: buffer with the data transferred
+ * @len: length of the data transferred
+ */
+static inline MemTxResult
+address_space_read_cached(MemoryRegionCache *cache, hwaddr addr,
+                          void *buf, hwaddr len)
+{
+    assert(addr < cache->len && len <= cache->len - addr);
+    fuzz_dma_read_cb(cache->xlat + addr, len, cache->mrs.mr);
+    if (likely(cache->ptr)) {
+        memcpy(buf, cache->ptr + addr, len);
+        return MEMTX_OK;
+    } else {
+        return address_space_read_cached_slow(cache, addr, buf, len);
+    }
+}
+
+/**
+ * address_space_write_cached: write to a cached RAM region
+ *
+ * @cache: Cached region to be addressed
+ * @addr: address relative to the base of the RAM region
+ * @buf: buffer with the data transferred
+ * @len: length of the data transferred
+ */
+static inline MemTxResult
+address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
+                           const void *buf, hwaddr len)
+{
+    assert(addr < cache->len && len <= cache->len - addr);
+    if (likely(cache->ptr)) {
+        memcpy(cache->ptr + addr, buf, len);
+        return MEMTX_OK;
+    } else {
+        return address_space_write_cached_slow(cache, addr, buf, len);
+    }
+}
+
+/**
+ * address_space_set: Fill address space with a constant byte.
+ *
+ * Return a MemTxResult indicating whether the operation succeeded
+ * or failed (eg unassigned memory, device rejected the transaction,
+ * IOMMU fault).
+ *
+ * @as: #AddressSpace to be accessed
+ * @addr: address within that address space
+ * @c: constant byte to fill the memory
+ * @len: the number of bytes to fill with the constant byte
+ * @attrs: memory transaction attributes
+ */
+MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
+                              uint8_t c, hwaddr len, MemTxAttrs attrs);
+
+/*
+ * Inhibit technologies that require discarding of pages in RAM blocks, e.g.,
+ * to manage the actual amount of memory consumed by the VM (then, the memory
+ * provided by RAM blocks might be bigger than the desired memory consumption).
+ * This *must* be set if:
+ * - Discarding parts of a RAM block does not result in the change being
+ *   reflected in the VM and the pages getting freed.
+ * - All memory in RAM blocks is pinned or duplicated, invalidating any previous
+ *   discards blindly.
+ * - Discarding parts of a RAM block will result in integrity issues (e.g.,
+ *   encrypted VMs).
+ * Technologies that only temporarily pin the current working set of a + * driver are fine, because we don't expect such pages to be discarded + * (esp. based on guest action like balloon inflation). + * + * This is *not* to be used to protect from concurrent discards (esp., + * postcopy). + * + * Returns 0 if successful. Returns -EBUSY if a technology that relies on + * discards to work reliably is active. + */ +int ram_block_discard_disable(bool state); + +/* + * See ram_block_discard_disable(): only disable uncoordinated discards, + * keeping coordinated discards (via the RamDiscardManager) enabled. + */ +int ram_block_uncoordinated_discard_disable(bool state); + +/* + * Inhibit technologies that disable discarding of pages in RAM blocks. + * + * Returns 0 if successful. Returns -EBUSY if discards are already set to + * broken. + */ +int ram_block_discard_require(bool state); + +/* + * See ram_block_discard_require(): only inhibit technologies that disable + * uncoordinated discarding of pages in RAM blocks, allowing co-existence with + * technologies that only inhibit uncoordinated discards (via the + * RamDiscardManager). + */ +int ram_block_coordinated_discard_require(bool state); + +/* + * Test if any discarding of memory in ram blocks is disabled. + */ +bool ram_block_discard_is_disabled(void); + +/* + * Test if any discarding of memory in ram blocks is required to work reliably. + */ +bool ram_block_discard_is_required(void); + +void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp); +void ram_block_del_cpr_blocker(RAMBlock *rb); + +#endif diff --git a/include/system/memory_mapping.h b/include/system/memory_mapping.h new file mode 100644 index 0000000..021e0a6 --- /dev/null +++ b/include/system/memory_mapping.h @@ -0,0 +1,85 @@ +/* + * QEMU memory mapping + * + * Copyright Fujitsu, Corp. 2011, 2012 + * + * Authors: + * Wen Congyang <wency@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef MEMORY_MAPPING_H +#define MEMORY_MAPPING_H + +#include "qemu/queue.h" +#include "exec/cpu-common.h" + +typedef struct GuestPhysBlock { + /* visible to guest, reflects PCI hole, etc */ + hwaddr target_start; + + /* implies size */ + hwaddr target_end; + + /* points into host memory */ + uint8_t *host_addr; + + /* points to the MemoryRegion that this block belongs to */ + MemoryRegion *mr; + + QTAILQ_ENTRY(GuestPhysBlock) next; +} GuestPhysBlock; + +/* point-in-time snapshot of guest-visible physical mappings */ +typedef struct GuestPhysBlockList { + unsigned num; + QTAILQ_HEAD(, GuestPhysBlock) head; +} GuestPhysBlockList; + +/* The physical and virtual address in the memory mapping are contiguous. */ +typedef struct MemoryMapping { + hwaddr phys_addr; + vaddr virt_addr; + ram_addr_t length; + QTAILQ_ENTRY(MemoryMapping) next; +} MemoryMapping; + +struct MemoryMappingList { + unsigned int num; + MemoryMapping *last_mapping; + QTAILQ_HEAD(, MemoryMapping) head; +}; + +/* + * add or merge the memory region [phys_addr, phys_addr + length) into the + * memory mapping's list. The region's virtual address starts with virt_addr, + * and is contiguous. The list is sorted by phys_addr. 
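A usage sketch for the ram_block_discard_* helpers listed above (illustrative only, not part of the header; error_setg() is the usual qapi/error.h helper and the feature names are made up): a feature that pins or duplicates all guest RAM disables discarding for as long as it is active.

static int example_pinning_feature_start(Error **errp)
{
    int ret = ram_block_discard_disable(true);

    if (ret) {
        error_setg(errp, "RAM discarding (e.g. virtio-balloon) is in use");
        return ret;
    }
    /* ... pin guest RAM and set up the feature ... */
    return 0;
}

static void example_pinning_feature_stop(void)
{
    /* ... unpin guest RAM and tear down the feature ... */
    ram_block_discard_disable(false);
}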
+ */ +void memory_mapping_list_add_merge_sorted(MemoryMappingList *list, + hwaddr phys_addr, + hwaddr virt_addr, + ram_addr_t length); + +void memory_mapping_list_free(MemoryMappingList *list); + +void memory_mapping_list_init(MemoryMappingList *list); + +void guest_phys_blocks_free(GuestPhysBlockList *list); +void guest_phys_blocks_init(GuestPhysBlockList *list); +void guest_phys_blocks_append(GuestPhysBlockList *list); + +bool qemu_get_guest_memory_mapping(MemoryMappingList *list, + const GuestPhysBlockList *guest_phys_blocks, + Error **errp); + +/* get guest's memory mapping without do paging(virtual address is 0). */ +void qemu_get_guest_simple_memory_mapping(MemoryMappingList *list, + const GuestPhysBlockList *guest_phys_blocks); + +void memory_mapping_filter(MemoryMappingList *list, int64_t begin, + int64_t length); + +#endif diff --git a/include/system/numa.h b/include/system/numa.h new file mode 100644 index 0000000..1044b0e --- /dev/null +++ b/include/system/numa.h @@ -0,0 +1,113 @@ +#ifndef SYSTEM_NUMA_H +#define SYSTEM_NUMA_H + +#include "qemu/bitmap.h" +#include "qapi/qapi-types-machine.h" + +struct CPUArchId; + +#define MAX_NODES 128 +#define NUMA_NODE_UNASSIGNED MAX_NODES +#define NUMA_DISTANCE_MIN 10 +#define NUMA_DISTANCE_DEFAULT 20 +#define NUMA_DISTANCE_MAX 254 +#define NUMA_DISTANCE_UNREACHABLE 255 + +/* the value of AcpiHmatLBInfo flags */ +enum { + HMAT_LB_MEM_MEMORY = 0, + HMAT_LB_MEM_CACHE_1ST_LEVEL = 1, + HMAT_LB_MEM_CACHE_2ND_LEVEL = 2, + HMAT_LB_MEM_CACHE_3RD_LEVEL = 3, + HMAT_LB_LEVELS /* must be the last entry */ +}; + +/* the value of AcpiHmatLBInfo data type */ +enum { + HMAT_LB_DATA_ACCESS_LATENCY = 0, + HMAT_LB_DATA_READ_LATENCY = 1, + HMAT_LB_DATA_WRITE_LATENCY = 2, + HMAT_LB_DATA_ACCESS_BANDWIDTH = 3, + HMAT_LB_DATA_READ_BANDWIDTH = 4, + HMAT_LB_DATA_WRITE_BANDWIDTH = 5, + HMAT_LB_TYPES /* must be the last entry */ +}; + +#define UINT16_BITS 16 + +typedef struct NodeInfo { + uint64_t node_mem; + struct HostMemoryBackend *node_memdev; + bool present; + bool has_cpu; + bool has_gi; + uint8_t lb_info_provided; + uint16_t initiator; + uint8_t distance[MAX_NODES]; +} NodeInfo; + +typedef struct NumaNodeMem { + uint64_t node_mem; + uint64_t node_plugged_mem; +} NumaNodeMem; + +struct HMAT_LB_Data { + uint8_t initiator; + uint8_t target; + uint64_t data; +}; +typedef struct HMAT_LB_Data HMAT_LB_Data; + +struct HMAT_LB_Info { + /* Indicates it's memory or the specified level memory side cache. */ + uint8_t hierarchy; + + /* Present the type of data, access/read/write latency or bandwidth. */ + uint8_t data_type; + + /* The range bitmap of bandwidth for calculating common base */ + uint64_t range_bitmap; + + /* The common base unit for latencies or bandwidths */ + uint64_t base; + + /* Array to store the latencies or bandwidths */ + GArray *list; +}; +typedef struct HMAT_LB_Info HMAT_LB_Info; + +struct NumaState { + /* Number of NUMA nodes */ + int num_nodes; + + /* Allow setting NUMA distance for different NUMA nodes */ + bool have_numa_distance; + + /* Detect if HMAT support is enabled. 
*/ + bool hmat_enabled; + + /* NUMA nodes information */ + NodeInfo nodes[MAX_NODES]; + + /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ + HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; + + /* Memory Side Cache Information Structure */ + NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS]; +}; +typedef struct NumaState NumaState; + +void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); +void parse_numa_opts(MachineState *ms); +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp); +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp); +void numa_complete_configuration(MachineState *ms); +void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); +extern QemuOptsList qemu_numa_opts; +void numa_cpu_pre_plug(const struct CPUArchId *slot, DeviceState *dev, + Error **errp); +bool numa_uses_legacy_mem(void); + +#endif diff --git a/include/system/nvmm.h b/include/system/nvmm.h new file mode 100644 index 0000000..6971ddb --- /dev/null +++ b/include/system/nvmm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018-2019 Maxime Villard, All rights reserved. + * + * NetBSD Virtual Machine Monitor (NVMM) accelerator support. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +/* header to be included in non-NVMM-specific code */ + +#ifndef QEMU_NVMM_H +#define QEMU_NVMM_H + +#ifdef COMPILING_PER_TARGET + +#ifdef CONFIG_NVMM + +int nvmm_enabled(void); + +#else /* CONFIG_NVMM */ + +#define nvmm_enabled() (0) + +#endif /* CONFIG_NVMM */ + +#endif /* COMPILING_PER_TARGET */ + +#endif /* QEMU_NVMM_H */ diff --git a/include/system/os-posix.h b/include/system/os-posix.h new file mode 100644 index 0000000..ce5b3bc --- /dev/null +++ b/include/system/os-posix.h @@ -0,0 +1,101 @@ +/* + * posix specific declarations + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010 Jes Sorensen <Jes.Sorensen@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef QEMU_OS_POSIX_H +#define QEMU_OS_POSIX_H + +#include <sys/mman.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <sys/un.h> + +#ifdef CONFIG_SYSMACROS +#include <sys/sysmacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +void os_set_line_buffering(void); +void os_setup_early_signal_handling(void); +void os_set_proc_name(const char *s); +void os_setup_signal_handling(void); +int os_set_daemonize(bool d); +bool is_daemonized(void); +void os_daemonize(void); +bool os_set_runas(const char *user_id); +void os_set_chroot(const char *path); +void os_setup_limits(void); +void os_setup_post(void); +int os_mlock(bool on_fault); + +/** + * qemu_alloc_stack: + * @sz: pointer to a size_t holding the requested usable stack size + * + * Allocate memory that can be used as a stack, for instance for + * coroutines. If the memory cannot be allocated, this function + * will abort (like g_malloc()). This function also inserts an + * additional guard page to catch a potential stack overflow. + * Note that the memory required for the guard page and alignment + * and minimal stack size restrictions will increase the value of sz. + * + * The allocated stack must be freed with qemu_free_stack(). + * + * Returns: pointer to (the lowest address of) the stack memory. + */ +void *qemu_alloc_stack(size_t *sz); + +/** + * qemu_free_stack: + * @stack: stack to free + * @sz: size of stack in bytes + * + * Free a stack allocated via qemu_alloc_stack(). Note that sz must + * be exactly the adjusted stack size returned by qemu_alloc_stack. + */ +void qemu_free_stack(void *stack, size_t sz); + +/* POSIX and Mingw32 differ in the name of the stdio lock functions. */ + +static inline void qemu_flockfile(FILE *f) +{ + flockfile(f); +} + +static inline void qemu_funlockfile(FILE *f) +{ + funlockfile(f); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/system/os-wasm.h b/include/system/os-wasm.h new file mode 100644 index 0000000..3abb3aa --- /dev/null +++ b/include/system/os-wasm.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: MIT */ +/* + * posix specific declarations forked from os-posix.h, removing functions not + * working on Emscripten + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010 Jes Sorensen <Jes.Sorensen@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef QEMU_OS_WASM_H +#define QEMU_OS_WASM_H + +#include <sys/mman.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <sys/un.h> + +#ifdef CONFIG_SYSMACROS +#include <sys/sysmacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +void os_set_line_buffering(void); +void os_setup_early_signal_handling(void); +void os_set_proc_name(const char *s); +void os_setup_signal_handling(void); +void os_setup_limits(void); +void os_setup_post(void); +int os_mlock(bool on_fault); +static inline int os_set_daemonize(bool d) +{ + return -1; +}; +bool is_daemonized(void); +static inline void os_daemonize(void) {} + +/** + * qemu_alloc_stack: + * @sz: pointer to a size_t holding the requested usable stack size + * + * Allocate memory that can be used as a stack, for instance for + * coroutines. If the memory cannot be allocated, this function + * will abort (like g_malloc()). This function also inserts an + * additional guard page to catch a potential stack overflow. + * Note that the memory required for the guard page and alignment + * and minimal stack size restrictions will increase the value of sz. + * + * The allocated stack must be freed with qemu_free_stack(). + * + * Returns: pointer to (the lowest address of) the stack memory. + */ +void *qemu_alloc_stack(size_t *sz); + +/** + * qemu_free_stack: + * @stack: stack to free + * @sz: size of stack in bytes + * + * Free a stack allocated via qemu_alloc_stack(). Note that sz must + * be exactly the adjusted stack size returned by qemu_alloc_stack. + */ +void qemu_free_stack(void *stack, size_t sz); + +/* POSIX and Mingw32 differ in the name of the stdio lock functions. */ + +static inline void qemu_flockfile(FILE *f) +{ + flockfile(f); +} + +static inline void qemu_funlockfile(FILE *f) +{ + funlockfile(f); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/system/os-win32.h b/include/system/os-win32.h new file mode 100644 index 0000000..3aa6cee --- /dev/null +++ b/include/system/os-win32.h @@ -0,0 +1,276 @@ +/* + * win32 specific declarations + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010 Jes Sorensen <Jes.Sorensen@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef QEMU_OS_WIN32_H +#define QEMU_OS_WIN32_H + +#include <winsock2.h> +#include <windows.h> +#include <ws2tcpip.h> +#include "qemu/typedefs.h" + +#ifdef HAVE_AFUNIX_H +#include <afunix.h> +#else +/* + * Fallback definitions of things we need in afunix.h, if not available from + * the used Windows SDK or MinGW headers. + */ +#define UNIX_PATH_MAX 108 + +typedef struct sockaddr_un { + ADDRESS_FAMILY sun_family; + char sun_path[UNIX_PATH_MAX]; +} SOCKADDR_UN, *PSOCKADDR_UN; + +#define SIO_AF_UNIX_GETPEERPID _WSAIOR(IOC_VENDOR, 256) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__aarch64__) +/* + * On windows-arm64, setjmp is available in only one variant, and longjmp always + * does stack unwinding. This crash with generated code. + * Thus, we use another implementation of setjmp (not windows one), coming from + * mingw, which never performs stack unwinding. + */ +#undef setjmp +#undef longjmp +/* + * These functions are not declared in setjmp.h because __aarch64__ defines + * setjmp to _setjmpex instead. However, they are still defined in libmingwex.a, + * which gets linked automatically. + */ +int __mingw_setjmp(jmp_buf); +void __attribute__((noreturn)) __mingw_longjmp(jmp_buf, int); +#define setjmp(env) __mingw_setjmp(env) +#define longjmp(env, val) __mingw_longjmp(env, val) +#elif defined(_WIN64) +/* + * On windows-x64, setjmp is implemented by _setjmp which needs a second parameter. + * If this parameter is NULL, longjump does no stack unwinding. + * That is what we need for QEMU. Passing the value of register rsp (default) + * lets longjmp try a stack unwinding which will crash with generated code. + */ +# undef setjmp +# define setjmp(env) _setjmp(env, NULL) +#endif /* __aarch64__ */ +/* QEMU uses sigsetjmp()/siglongjmp() as the portable way to specify + * "longjmp and don't touch the signal masks". Since we know that the + * savemask parameter will always be zero we can safely define these + * in terms of setjmp/longjmp on Win32. + */ +#define sigjmp_buf jmp_buf +#define sigsetjmp(env, savemask) setjmp(env) +#define siglongjmp(env, val) longjmp(env, val) + +/* Missing POSIX functions. Don't use MinGW-w64 macros. */ +#ifndef _POSIX_THREAD_SAFE_FUNCTIONS +#undef gmtime_r +struct tm *gmtime_r(const time_t *timep, struct tm *result); +#undef localtime_r +struct tm *localtime_r(const time_t *timep, struct tm *result); +#endif /* _POSIX_THREAD_SAFE_FUNCTIONS */ + +static inline void os_setup_signal_handling(void) {} +static inline void os_daemonize(void) {} +static inline void os_setup_post(void) {} +static inline void os_set_proc_name(const char *dummy) {} +void os_set_line_buffering(void); +void os_setup_early_signal_handling(void); + +int getpagesize(void); + +#if !defined(EPROTONOSUPPORT) +# define EPROTONOSUPPORT EINVAL +#endif + +static inline int os_set_daemonize(bool d) +{ + if (d) { + return -ENOTSUP; + } + return 0; +} + +static inline bool is_daemonized(void) +{ + return false; +} + +static inline int os_mlock(bool on_fault G_GNUC_UNUSED) +{ + return -ENOSYS; +} + +static inline void os_setup_limits(void) +{ +} + +#define fsync _commit + +#if !defined(lseek) +# define lseek _lseeki64 +#endif + +int qemu_ftruncate64(int, int64_t); + +#if !defined(ftruncate) +# define ftruncate qemu_ftruncate64 +#endif + +static inline char *realpath(const char *path, char *resolved_path) +{ + _fullpath(resolved_path, path, _MAX_PATH); + return resolved_path; +} + +/* + * Older versions of MinGW do not import _lock_file and _unlock_file properly. 
+ * This was fixed for v6.0.0 with commit b48e3ac8969d. + */ +static inline void qemu_flockfile(FILE *f) +{ +#ifdef HAVE__LOCK_FILE + _lock_file(f); +#endif +} + +static inline void qemu_funlockfile(FILE *f) +{ +#ifdef HAVE__LOCK_FILE + _unlock_file(f); +#endif +} + +/* Helper for WSAEventSelect, to report errors */ +bool qemu_socket_select(int sockfd, WSAEVENT hEventObject, + long lNetworkEvents, Error **errp); + +bool qemu_socket_unselect(int sockfd, Error **errp); + +/* We wrap all the sockets functions so that we can set errno based on + * WSAGetLastError(), and use file-descriptors instead of SOCKET. + */ + +/* + * qemu_close_socket_osfhandle: + * @fd: a file descriptor associated with a SOCKET + * + * Close only the C run-time file descriptor, leave the SOCKET opened. + * + * Returns zero on success. On error, -1 is returned, and errno is set to + * indicate the error. + */ +int qemu_close_socket_osfhandle(int fd); + +#undef close +#define close qemu_close_wrap +int qemu_close_wrap(int fd); + +#undef connect +#define connect qemu_connect_wrap +int qemu_connect_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + +#undef listen +#define listen qemu_listen_wrap +int qemu_listen_wrap(int sockfd, int backlog); + +#undef bind +#define bind qemu_bind_wrap +int qemu_bind_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + +#undef socket +#define socket qemu_socket_wrap +int qemu_socket_wrap(int domain, int type, int protocol); + +#undef accept +#define accept qemu_accept_wrap +int qemu_accept_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef shutdown +#define shutdown qemu_shutdown_wrap +int qemu_shutdown_wrap(int sockfd, int how); + +#undef ioctlsocket +#define ioctlsocket qemu_ioctlsocket_wrap +int qemu_ioctlsocket_wrap(int fd, int req, void *val); + +#undef getsockopt +#define getsockopt qemu_getsockopt_wrap +int qemu_getsockopt_wrap(int sockfd, int level, int optname, + void *optval, socklen_t *optlen); + +#undef setsockopt +#define setsockopt qemu_setsockopt_wrap +int qemu_setsockopt_wrap(int sockfd, int level, int optname, + const void *optval, socklen_t optlen); + +#undef getpeername +#define getpeername qemu_getpeername_wrap +int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef getsockname +#define getsockname qemu_getsockname_wrap +int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef send +#define send qemu_send_wrap +ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags); + +#undef sendto +#define sendto qemu_sendto_wrap +ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *addr, socklen_t addrlen); + +#undef recv +#define recv qemu_recv_wrap +ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags); + +#undef recvfrom +#define recvfrom qemu_recvfrom_wrap +ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *addr, socklen_t *addrlen); + +EXCEPTION_DISPOSITION +win32_close_exception_handler(struct _EXCEPTION_RECORD*, void*, + struct _CONTEXT*, void*); + +void *qemu_win32_map_alloc(size_t size, HANDLE *h, Error **errp); +void qemu_win32_map_free(void *ptr, HANDLE h, Error **errp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/system/qtest.h b/include/system/qtest.h new file mode 100644 index 0000000..84b1f8c --- /dev/null +++ b/include/system/qtest.h @@ -0,0 +1,36 @@ +/* + * Test Server + * + * Copyright IBM, 
Corp. 2011 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QTEST_H +#define QTEST_H + +#include "chardev/char.h" + +extern bool qtest_allowed; + +static inline bool qtest_enabled(void) +{ + return qtest_allowed; +} + +void G_GNUC_PRINTF(2, 3) qtest_sendf(CharBackend *chr, const char *fmt, ...); +void qtest_set_command_cb(bool (*pc_cb)(CharBackend *chr, gchar **words)); +bool qtest_driver(void); + +void qtest_server_init(const char *qtest_chrdev, const char *qtest_log, Error **errp); + +void qtest_server_set_send_handler(void (*send)(void *, const char *), + void *opaque); +void qtest_server_inproc_recv(void *opaque, const char *buf); + +#endif diff --git a/include/system/ram_addr.h b/include/system/ram_addr.h new file mode 100644 index 0000000..15a1b1a --- /dev/null +++ b/include/system/ram_addr.h @@ -0,0 +1,561 @@ +/* + * Declarations for cpu physical memory functions + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Avi Kivity <avi@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ + +/* + * This header is for use by exec.c and memory.c ONLY. Do not include it. + * The functions declared here will be removed soon. + */ + +#ifndef SYSTEM_RAM_ADDR_H +#define SYSTEM_RAM_ADDR_H + +#include "system/xen.h" +#include "system/tcg.h" +#include "exec/cputlb.h" +#include "exec/ramlist.h" +#include "system/ramblock.h" +#include "system/memory.h" +#include "exec/target_page.h" +#include "qemu/rcu.h" + +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" + +extern uint64_t total_dirty_pages; + +/** + * clear_bmap_size: calculate clear bitmap size + * + * @pages: number of guest pages + * @shift: guest page number shift + * + * Returns: number of bits for the clear bitmap + */ +static inline long clear_bmap_size(uint64_t pages, uint8_t shift) +{ + return DIV_ROUND_UP(pages, 1UL << shift); +} + +/** + * clear_bmap_set: set clear bitmap for the page range. Must be with + * bitmap_mutex held. + * + * @rb: the ramblock to operate on + * @start: the start page number + * @size: number of pages to set in the bitmap + * + * Returns: None + */ +static inline void clear_bmap_set(RAMBlock *rb, uint64_t start, + uint64_t npages) +{ + uint8_t shift = rb->clear_bmap_shift; + + bitmap_set(rb->clear_bmap, start >> shift, clear_bmap_size(npages, shift)); +} + +/** + * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set. + * Must be with bitmap_mutex held. + * + * @rb: the ramblock to operate on + * @page: the page number to check + * + * Returns: true if the bit was set, false otherwise + */ +static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page) +{ + uint8_t shift = rb->clear_bmap_shift; + + return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1); +} + +static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset) +{ + return (b && b->host && offset < b->used_length) ? 
true : false; +} + +static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset) +{ + assert(offset_in_ramblock(block, offset)); + return (char *)block->host + offset; +} + +static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, + RAMBlock *rb) +{ + uint64_t host_addr_offset = + (uint64_t)(uintptr_t)(host_addr - (void *)rb->host); + return host_addr_offset >> TARGET_PAGE_BITS; +} + +bool ramblock_is_pmem(RAMBlock *rb); + +/** + * qemu_ram_alloc_from_file, + * qemu_ram_alloc_from_fd: Allocate a ram block from the specified backing + * file or device + * + * Parameters: + * @size: the size in bytes of the ram block + * @max_size: the maximum size of the block after resizing + * @mr: the memory region where the ram block is + * @resized: callback after calls to qemu_ram_resize + * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM, + * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY, + * RAM_READONLY_FD, RAM_GUEST_MEMFD + * @mem_path or @fd: specify the backing file or device + * @offset: Offset into target file + * @grow: extend file if necessary (but an empty file is always extended). + * @errp: pointer to Error*, to store an error if it happens + * + * Return: + * On success, return a pointer to the ram block. + * On failure, return NULL. + */ +typedef void (*qemu_ram_resize_cb)(const char *, uint64_t length, void *host); + +RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + uint32_t ram_flags, const char *mem_path, + off_t offset, Error **errp); +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size, + qemu_ram_resize_cb resized, MemoryRegion *mr, + uint32_t ram_flags, int fd, off_t offset, + bool grow, + Error **errp); + +RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, + MemoryRegion *mr, Error **errp); +RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags, MemoryRegion *mr, + Error **errp); +RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t max_size, + qemu_ram_resize_cb resized, + MemoryRegion *mr, Error **errp); +void qemu_ram_free(RAMBlock *block); + +int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp); + +void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length); + +/* Clear whole block of mem */ +static inline void qemu_ram_block_writeback(RAMBlock *block) +{ + qemu_ram_msync(block, 0, block->used_length); +} + +#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1) +#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE)) + +static inline bool cpu_physical_memory_get_dirty(ram_addr_t start, + ram_addr_t length, + unsigned client) +{ + DirtyMemoryBlocks *blocks; + unsigned long end, page; + unsigned long idx, offset, base; + bool dirty = false; + + assert(client < DIRTY_MEMORY_NUM); + + end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; + page = start >> TARGET_PAGE_BITS; + + WITH_RCU_READ_LOCK_GUARD() { + blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); + + idx = page / DIRTY_MEMORY_BLOCK_SIZE; + offset = page % DIRTY_MEMORY_BLOCK_SIZE; + base = page - offset; + while (page < end) { + unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); + unsigned long num = next - base; + unsigned long found = find_next_bit(blocks->blocks[idx], + num, offset); + if (found < num) { + dirty = true; + break; + } + + page = next; + idx++; + offset = 0; + base += DIRTY_MEMORY_BLOCK_SIZE; + } + } + + return dirty; +} + +static inline bool cpu_physical_memory_all_dirty(ram_addr_t start, + 
ram_addr_t length, + unsigned client) +{ + DirtyMemoryBlocks *blocks; + unsigned long end, page; + unsigned long idx, offset, base; + bool dirty = true; + + assert(client < DIRTY_MEMORY_NUM); + + end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; + page = start >> TARGET_PAGE_BITS; + + RCU_READ_LOCK_GUARD(); + + blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); + + idx = page / DIRTY_MEMORY_BLOCK_SIZE; + offset = page % DIRTY_MEMORY_BLOCK_SIZE; + base = page - offset; + while (page < end) { + unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); + unsigned long num = next - base; + unsigned long found = find_next_zero_bit(blocks->blocks[idx], num, offset); + if (found < num) { + dirty = false; + break; + } + + page = next; + idx++; + offset = 0; + base += DIRTY_MEMORY_BLOCK_SIZE; + } + + return dirty; +} + +static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr, + unsigned client) +{ + return cpu_physical_memory_get_dirty(addr, 1, client); +} + +static inline bool cpu_physical_memory_is_clean(ram_addr_t addr) +{ + bool vga = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA); + bool code = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE); + bool migration = + cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION); + return !(vga && code && migration); +} + +static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start, + ram_addr_t length, + uint8_t mask) +{ + uint8_t ret = 0; + + if (mask & (1 << DIRTY_MEMORY_VGA) && + !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_VGA)) { + ret |= (1 << DIRTY_MEMORY_VGA); + } + if (mask & (1 << DIRTY_MEMORY_CODE) && + !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_CODE)) { + ret |= (1 << DIRTY_MEMORY_CODE); + } + if (mask & (1 << DIRTY_MEMORY_MIGRATION) && + !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_MIGRATION)) { + ret |= (1 << DIRTY_MEMORY_MIGRATION); + } + return ret; +} + +static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr, + unsigned client) +{ + unsigned long page, idx, offset; + DirtyMemoryBlocks *blocks; + + assert(client < DIRTY_MEMORY_NUM); + + page = addr >> TARGET_PAGE_BITS; + idx = page / DIRTY_MEMORY_BLOCK_SIZE; + offset = page % DIRTY_MEMORY_BLOCK_SIZE; + + RCU_READ_LOCK_GUARD(); + + blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); + + set_bit_atomic(offset, blocks->blocks[idx]); +} + +static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start, + ram_addr_t length, + uint8_t mask) +{ + DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM]; + unsigned long end, page; + unsigned long idx, offset, base; + int i; + + if (!mask && !xen_enabled()) { + return; + } + + end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; + page = start >> TARGET_PAGE_BITS; + + WITH_RCU_READ_LOCK_GUARD() { + for (i = 0; i < DIRTY_MEMORY_NUM; i++) { + blocks[i] = qatomic_rcu_read(&ram_list.dirty_memory[i]); + } + + idx = page / DIRTY_MEMORY_BLOCK_SIZE; + offset = page % DIRTY_MEMORY_BLOCK_SIZE; + base = page - offset; + while (page < end) { + unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); + + if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) { + bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx], + offset, next - page); + } + if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) { + bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx], + offset, next - page); + } + if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) { + bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx], + 
offset, next - page); + } + + page = next; + idx++; + offset = 0; + base += DIRTY_MEMORY_BLOCK_SIZE; + } + } + + if (xen_enabled()) { + xen_hvm_modified_memory(start, length); + } +} + +#if !defined(_WIN32) + +/* + * Contrary to cpu_physical_memory_sync_dirty_bitmap() this function returns + * the number of dirty pages in @bitmap passed as argument. On the other hand, + * cpu_physical_memory_sync_dirty_bitmap() returns newly dirtied pages that + * weren't set in the global migration bitmap. + */ +static inline +uint64_t cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap, + ram_addr_t start, + ram_addr_t pages) +{ + unsigned long i, j; + unsigned long page_number, c, nbits; + hwaddr addr; + ram_addr_t ram_addr; + uint64_t num_dirty = 0; + unsigned long len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS; + unsigned long hpratio = qemu_real_host_page_size() / TARGET_PAGE_SIZE; + unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); + + /* start address is aligned at the start of a word? */ + if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) && + (hpratio == 1)) { + unsigned long **blocks[DIRTY_MEMORY_NUM]; + unsigned long idx; + unsigned long offset; + long k; + long nr = BITS_TO_LONGS(pages); + + idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE; + offset = BIT_WORD((start >> TARGET_PAGE_BITS) % + DIRTY_MEMORY_BLOCK_SIZE); + + WITH_RCU_READ_LOCK_GUARD() { + for (i = 0; i < DIRTY_MEMORY_NUM; i++) { + blocks[i] = + qatomic_rcu_read(&ram_list.dirty_memory[i])->blocks; + } + + for (k = 0; k < nr; k++) { + if (bitmap[k]) { + unsigned long temp = leul_to_cpu(bitmap[k]); + + nbits = ctpopl(temp); + qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp); + + if (global_dirty_tracking) { + qatomic_or( + &blocks[DIRTY_MEMORY_MIGRATION][idx][offset], + temp); + if (unlikely( + global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) { + total_dirty_pages += nbits; + } + } + + num_dirty += nbits; + + if (tcg_enabled()) { + qatomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], + temp); + } + } + + if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) { + offset = 0; + idx++; + } + } + } + + if (xen_enabled()) { + xen_hvm_modified_memory(start, pages << TARGET_PAGE_BITS); + } + } else { + uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE; + + if (!global_dirty_tracking) { + clients &= ~(1 << DIRTY_MEMORY_MIGRATION); + } + + /* + * bitmap-traveling is faster than memory-traveling (for addr...) + * especially when most of the memory is not dirty. 
+ */ + for (i = 0; i < len; i++) { + if (bitmap[i] != 0) { + c = leul_to_cpu(bitmap[i]); + nbits = ctpopl(c); + if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) { + total_dirty_pages += nbits; + } + num_dirty += nbits; + do { + j = ctzl(c); + c &= ~(1ul << j); + page_number = (i * HOST_LONG_BITS + j) * hpratio; + addr = page_number * TARGET_PAGE_SIZE; + ram_addr = start + addr; + cpu_physical_memory_set_dirty_range(ram_addr, + TARGET_PAGE_SIZE * hpratio, clients); + } while (c != 0); + } + } + } + + return num_dirty; +} +#endif /* not _WIN32 */ + +static inline void cpu_physical_memory_dirty_bits_cleared(ram_addr_t start, + ram_addr_t length) +{ + if (tcg_enabled()) { + tlb_reset_dirty_range_all(start, length); + } + +} +bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, + ram_addr_t length, + unsigned client); + +DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty + (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client); + +bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, + ram_addr_t start, + ram_addr_t length); + +static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start, + ram_addr_t length) +{ + cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_MIGRATION); + cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_VGA); + cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_CODE); +} + + +/* Called with RCU critical section */ +static inline +uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, + ram_addr_t start, + ram_addr_t length) +{ + ram_addr_t addr; + unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS); + uint64_t num_dirty = 0; + unsigned long *dest = rb->bmap; + + /* start address and length is aligned at the start of a word? */ + if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == + (start + rb->offset) && + !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) { + int k; + int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS); + unsigned long * const *src; + unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE; + unsigned long offset = BIT_WORD((word * BITS_PER_LONG) % + DIRTY_MEMORY_BLOCK_SIZE); + unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); + + src = qatomic_rcu_read( + &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks; + + for (k = page; k < page + nr; k++) { + if (src[idx][offset]) { + unsigned long bits = qatomic_xchg(&src[idx][offset], 0); + unsigned long new_dirty; + new_dirty = ~dest[k]; + dest[k] |= bits; + new_dirty &= bits; + num_dirty += ctpopl(new_dirty); + } + + if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) { + offset = 0; + idx++; + } + } + if (num_dirty) { + cpu_physical_memory_dirty_bits_cleared(start, length); + } + + if (rb->clear_bmap) { + /* + * Postpone the dirty bitmap clear to the point before we + * really send the pages, also we will split the clear + * dirty procedure into smaller chunks. 
+ */ + clear_bmap_set(rb, start >> TARGET_PAGE_BITS, + length >> TARGET_PAGE_BITS); + } else { + /* Slow path - still do that in a huge chunk */ + memory_region_clear_dirty_bitmap(rb->mr, start, length); + } + } else { + ram_addr_t offset = rb->offset; + + for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { + if (cpu_physical_memory_test_and_clear_dirty( + start + addr + offset, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + long k = (start + addr) >> TARGET_PAGE_BITS; + if (!test_and_set_bit(k, dest)) { + num_dirty++; + } + } + } + } + + return num_dirty; +} + +#endif diff --git a/include/system/ramblock.h b/include/system/ramblock.h new file mode 100644 index 0000000..87e847e --- /dev/null +++ b/include/system/ramblock.h @@ -0,0 +1,116 @@ +/* + * Declarations for cpu physical memory functions + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Avi Kivity <avi@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ + +/* + * This header is for use by exec.c and memory.c ONLY. Do not include it. + * The functions declared here will be removed soon. + */ + +#ifndef SYSTEM_RAMBLOCK_H +#define SYSTEM_RAMBLOCK_H + +#include "exec/cpu-common.h" +#include "qemu/rcu.h" +#include "exec/ramlist.h" +#include "system/hostmem.h" + +#define TYPE_RAM_BLOCK_ATTRIBUTES "ram-block-attributes" +OBJECT_DECLARE_SIMPLE_TYPE(RamBlockAttributes, RAM_BLOCK_ATTRIBUTES) + +struct RAMBlock { + struct rcu_head rcu; + struct MemoryRegion *mr; + uint8_t *host; + uint8_t *colo_cache; /* For colo, VM's ram cache */ + ram_addr_t offset; + ram_addr_t used_length; + ram_addr_t max_length; + void (*resized)(const char*, uint64_t length, void *host); + uint32_t flags; + /* Protected by the BQL. */ + char idstr[256]; + /* RCU-enabled, writes protected by the ramlist lock */ + QLIST_ENTRY(RAMBlock) next; + QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; + Error *cpr_blocker; + int fd; + uint64_t fd_offset; + int guest_memfd; + RamBlockAttributes *attributes; + size_t page_size; + /* dirty bitmap used during migration */ + unsigned long *bmap; + + /* + * Below fields are only used by mapped-ram migration + */ + /* bitmap of pages present in the migration file */ + unsigned long *file_bmap; + /* + * offset in the file pages belonging to this ramblock are saved, + * used only during migration to a file. + */ + off_t bitmap_offset; + uint64_t pages_offset; + + /* Bitmap of already received pages. Only used on destination side. */ + unsigned long *receivedmap; + + /* + * bitmap to track already cleared dirty bitmap. When the bit is + * set, it means the corresponding memory chunk needs a log-clear. + * Set this up to non-NULL to enable the capability to postpone + * and split clearing of dirty bitmap on the remote node (e.g., + * KVM). The bitmap will be set only when doing global sync. + * + * It is only used during src side of ram migration, and it is + * protected by the global ram_state.bitmap_mutex. + * + * NOTE: this bitmap is different comparing to the other bitmaps + * in that one bit can represent multiple guest pages (which is + * decided by the `clear_bmap_shift' variable below). On + * destination side, this should always be NULL, and the variable + * `clear_bmap_shift' is meaningless. + */ + unsigned long *clear_bmap; + uint8_t clear_bmap_shift; + + /* + * RAM block length that corresponds to the used_length on the migration + * source (after RAM block sizes were synchronized). 
Especially, after + * starting to run the guest, used_length and postcopy_length can differ. + * Used to register/unregister uffd handlers and as the size of the received + * bitmap. Receiving any page beyond this length will bail out, as it + * could not have been valid on the source. + */ + ram_addr_t postcopy_length; +}; + +struct RamBlockAttributes { + Object parent; + + RAMBlock *ram_block; + + /* 1-setting of the bitmap represents ram is populated (shared) */ + unsigned bitmap_size; + unsigned long *bitmap; + + QLIST_HEAD(, RamDiscardListener) rdl_list; +}; + +RamBlockAttributes *ram_block_attributes_create(RAMBlock *ram_block); +void ram_block_attributes_destroy(RamBlockAttributes *attr); +int ram_block_attributes_state_change(RamBlockAttributes *attr, uint64_t offset, + uint64_t size, bool to_discard); + +#endif diff --git a/include/system/replay.h b/include/system/replay.h new file mode 100644 index 0000000..1c87c97 --- /dev/null +++ b/include/system/replay.h @@ -0,0 +1,179 @@ +/* + * QEMU replay (system interface) + * + * Copyright (c) 2010-2015 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#ifndef SYSTEM_REPLAY_H +#define SYSTEM_REPLAY_H + +#include "exec/replay-core.h" +#include "qapi/qapi-types-misc.h" +#include "qapi/qapi-types-run-state.h" +#include "qapi/qapi-types-ui.h" +#include "block/aio.h" + +/* replay clock kinds */ +enum ReplayClockKind { + /* host_clock */ + REPLAY_CLOCK_HOST, + /* virtual_rt_clock */ + REPLAY_CLOCK_VIRTUAL_RT, + REPLAY_CLOCK_COUNT +}; +typedef enum ReplayClockKind ReplayClockKind; + +/* IDs of the checkpoints */ +enum ReplayCheckpoint { + CHECKPOINT_CLOCK_WARP_START, + CHECKPOINT_CLOCK_WARP_ACCOUNT, + CHECKPOINT_RESET_REQUESTED, + CHECKPOINT_SUSPEND_REQUESTED, + CHECKPOINT_CLOCK_VIRTUAL, + CHECKPOINT_CLOCK_HOST, + CHECKPOINT_CLOCK_VIRTUAL_RT, + CHECKPOINT_INIT, + CHECKPOINT_RESET, + CHECKPOINT_COUNT +}; +typedef enum ReplayCheckpoint ReplayCheckpoint; + +typedef struct ReplayNetState ReplayNetState; + +/* Name of the initial VM snapshot */ +extern char *replay_snapshot; + +/* Replay locking + * + * The locks are needed to protect the shared structures and log file + * when doing record/replay. They also are the main sync-point between + * the main-loop thread and the vCPU thread. This was a role + * previously filled by the BQL which has been busy trying to reduce + * its impact across the code. This ensures blocks of events stay + * sequential and reproducible. + */ + +void replay_mutex_lock(void); +void replay_mutex_unlock(void); + +/* Processing the instructions */ + +/*! Returns number of executed instructions. */ +uint64_t replay_get_current_icount(void); +/*! Returns number of instructions to execute in replay mode. */ +int replay_get_instructions(void); +/*! Updates instructions counter in replay mode. */ +void replay_account_executed_instructions(void); + +/* Processing clocks and other time sources */ + +/*! Save the specified clock */ +int64_t replay_save_clock(ReplayClockKind kind, int64_t clock, + int64_t raw_icount); +/*! Read the specified clock from the log or return cached data */ +int64_t replay_read_clock(ReplayClockKind kind, int64_t raw_icount); +/*! Saves or reads the clock depending on the current replay mode. */ +#define REPLAY_CLOCK(clock, value) \ + !icount_enabled() ? (value) : \ + (replay_mode == REPLAY_MODE_PLAY \ + ? 
replay_read_clock((clock), icount_get_raw()) \ + : replay_mode == REPLAY_MODE_RECORD \ + ? replay_save_clock((clock), (value), icount_get_raw()) \ + : (value)) +#define REPLAY_CLOCK_LOCKED(clock, value) \ + !icount_enabled() ? (value) : \ + (replay_mode == REPLAY_MODE_PLAY \ + ? replay_read_clock((clock), icount_get_raw_locked()) \ + : replay_mode == REPLAY_MODE_RECORD \ + ? replay_save_clock((clock), (value), icount_get_raw_locked()) \ + : (value)) + +/* Events */ + +/*! Called when qemu shutdown is requested. */ +void replay_shutdown_request(ShutdownCause cause); +/*! Should be called at check points in the execution. + These check points are skipped, if they were not met. + Saves checkpoint in the SAVE mode and validates in the PLAY mode. + Returns 0 in PLAY mode if checkpoint was not found. + Returns 1 in all other cases. */ +bool replay_checkpoint(ReplayCheckpoint checkpoint); +/*! Used to determine that checkpoint or async event is pending. + Does not proceed to the next event in the log. */ +bool replay_has_event(void); +/* + * Processes the async events added to the queue (while recording) + * or reads the events from the file (while replaying). + */ +void replay_async_events(void); + +/* Asynchronous events queue */ + +/*! Enables storing events in the queue */ +void replay_enable_events(void); +/*! Returns true when saving events is enabled */ +bool replay_events_enabled(void); +/* Flushes events queue */ +void replay_flush_events(void); +/*! Adds bottom half event to the queue */ +void replay_bh_schedule_event(QEMUBH *bh); +/* Adds oneshot bottom half event to the queue */ +void replay_bh_schedule_oneshot_event(AioContext *ctx, + QEMUBHFunc *cb, void *opaque); +/*! Adds input event to the queue */ +void replay_input_event(QemuConsole *src, InputEvent *evt); +/*! Adds input sync event to the queue */ +void replay_input_sync_event(void); +/*! Adds block layer event to the queue */ +void replay_block_event(QEMUBH *bh, uint64_t id); +/*! Returns ID for the next block event */ +uint64_t blkreplay_next_id(void); + +/* Character device */ + +/*! Registers char driver to save it's events */ +void replay_register_char_driver(struct Chardev *chr); +/*! Saves write to char device event to the log */ +void replay_chr_be_write(struct Chardev *s, const uint8_t *buf, int len); +/*! Writes char write return value to the replay log. */ +void replay_char_write_event_save(int res, int offset); +/*! Reads char write return value from the replay log. */ +void replay_char_write_event_load(int *res, int *offset); +/*! Reads information about read_all character event. */ +int replay_char_read_all_load(uint8_t *buf); +/*! Writes character read_all error code into the replay log. */ +void replay_char_read_all_save_error(int res); +/*! Writes character read_all execution result into the replay log. */ +void replay_char_read_all_save_buf(uint8_t *buf, int offset); + +/* Network */ + +/*! Registers replay network filter attached to some backend. */ +ReplayNetState *replay_register_net(NetFilterState *nfs); +/*! Unregisters replay network filter. */ +void replay_unregister_net(ReplayNetState *rns); +/*! Called to write network packet to the replay log. */ +void replay_net_packet_event(ReplayNetState *rns, unsigned flags, + const struct iovec *iov, int iovcnt); + +/* Audio */ + +/*! Saves/restores number of played samples of audio out operation. */ +void replay_audio_out(size_t *played); +/*! Saves/restores recorded samples of audio in operation. 
*/ +void replay_audio_in(size_t *recorded, void *samples, size_t *wpos, size_t size); + +/* VM state operations */ + +/*! Called at the start of execution. + Loads or saves initial vmstate depending on execution mode. */ +void replay_vmstate_init(void); +/*! Called to ensure that replay state is consistent and VM snapshot + can be created */ +bool replay_can_snapshot(void); + +#endif diff --git a/include/system/reset.h b/include/system/reset.h new file mode 100644 index 0000000..97131d9 --- /dev/null +++ b/include/system/reset.h @@ -0,0 +1,127 @@ +/* + * Reset handlers. + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2016 Red Hat, Inc. + * Copyright (c) 2024 Linaro, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_SYSTEM_RESET_H +#define QEMU_SYSTEM_RESET_H + +#include "hw/resettable.h" +#include "qapi/qapi-events-run-state.h" + +typedef void QEMUResetHandler(void *opaque); + +/** + * qemu_register_resettable: Register an object to be reset + * @obj: object to be reset: it must implement the Resettable interface + * + * Register @obj on the list of objects which will be reset when the + * simulation is reset. These objects will be reset in the order + * they were added, using the three-phase Resettable protocol, + * so first all objects go through the enter phase, then all objects + * go through the hold phase, and then finally all go through the + * exit phase. + * + * It is not permitted to register or unregister reset functions or + * resettable objects from within any of the reset phase methods of @obj. + * + * We assume that the caller holds the BQL. + */ +void qemu_register_resettable(Object *obj); + +/** + * qemu_unregister_resettable: Unregister an object to be reset + * @obj: object to unregister + * + * Remove @obj from the list of objects which are reset when the + * simulation is reset. It must have been previously added to + * the list via qemu_register_resettable(). + * + * We assume that the caller holds the BQL. + */ +void qemu_unregister_resettable(Object *obj); + +/** + * qemu_register_reset: Register a callback for system reset + * @func: function to call + * @opaque: opaque data to pass to @func + * + * Register @func on the list of functions which are called when the + * entire system is reset. Functions registered with this API and + * Resettable objects registered with qemu_register_resettable() are + * handled together, in the order in which they were registered. 
+ * Functions registered with this API are called in the 'hold' phase + * of the 3-phase reset. + * + * In general this function should not be used in new code where possible; + * for instance, device model reset is better accomplished using the + * methods on DeviceState. + * + * It is not permitted to register or unregister reset functions or + * resettable objects from within the @func callback. + * + * We assume that the caller holds the BQL. + */ +void qemu_register_reset(QEMUResetHandler *func, void *opaque); + +/** + * qemu_register_reset_nosnapshotload: Register a callback for system reset + * @func: function to call + * @opaque: opaque data to pass to @func + * + * This is the same as qemu_register_reset(), except that @func is + * not called if the reason that the system is being reset is to + * put it into a clean state prior to loading a snapshot (i.e. for + * SHUTDOWN_CAUSE_SNAPSHOT_LOAD). + */ +void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque); + +/** + * qemu_unregister_reset: Unregister a system reset callback + * @func: function registered with qemu_register_reset() + * @opaque: the same opaque data that was passed to qemu_register_reset() + * + * Undo the effects of a qemu_register_reset(). The @func and @opaque + * must both match the arguments originally used with qemu_register_reset(). + * + * We assume that the caller holds the BQL. + */ +void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); + +/** + * qemu_devices_reset: Perform a complete system reset + * @reason: type of the reset + * + * This function performs the low-level work needed to do a complete reset + * of the system (calling all the callbacks registered with + * qemu_register_reset() and resetting all the Resettable objects registered + * with qemu_register_resettable()). It should only be called by the code in a + * MachineClass reset method. + * + * If you want to trigger a system reset from, for instance, a device + * model, don't use this function. Use qemu_system_reset_request(). + */ +void qemu_devices_reset(ResetType type); + +#endif diff --git a/include/system/rng-random.h b/include/system/rng-random.h new file mode 100644 index 0000000..0fdc6c6 --- /dev/null +++ b/include/system/rng-random.h @@ -0,0 +1,21 @@ +/* + * QEMU Random Number Generator Backend + * + * Copyright IBM, Corp. 2012 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_RNG_RANDOM_H +#define QEMU_RNG_RANDOM_H + +#include "qom/object.h" + +#define TYPE_RNG_RANDOM "rng-random" +OBJECT_DECLARE_SIMPLE_TYPE(RngRandom, RNG_RANDOM) + + +#endif diff --git a/include/system/rng.h b/include/system/rng.h new file mode 100644 index 0000000..e383f87 --- /dev/null +++ b/include/system/rng.h @@ -0,0 +1,89 @@ +/* + * QEMU Random Number Generator Backend + * + * Copyright IBM, Corp. 2012 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_RNG_H +#define QEMU_RNG_H + +#include "qemu/queue.h" +#include "qom/object.h" + +#define TYPE_RNG_BACKEND "rng-backend" +OBJECT_DECLARE_TYPE(RngBackend, RngBackendClass, + RNG_BACKEND) + +#define TYPE_RNG_BUILTIN "rng-builtin" + +typedef struct RngRequest RngRequest; + +typedef void (EntropyReceiveFunc)(void *opaque, + const void *data, + size_t size); + +struct RngRequest +{ + EntropyReceiveFunc *receive_entropy; + uint8_t *data; + void *opaque; + size_t offset; + size_t size; + QSIMPLEQ_ENTRY(RngRequest) next; +}; + +struct RngBackendClass +{ + ObjectClass parent_class; + + void (*request_entropy)(RngBackend *s, RngRequest *req); + + void (*opened)(RngBackend *s, Error **errp); +}; + +struct RngBackend +{ + Object parent; + + /*< protected >*/ + bool opened; + QSIMPLEQ_HEAD(, RngRequest) requests; +}; + + +/** + * rng_backend_request_entropy: + * @s: the backend to request entropy from + * @size: the number of bytes of data to request + * @receive_entropy: a function to be invoked when entropy is available + * @opaque: data that should be passed to @receive_entropy + * + * This function is used by the front-end to request entropy from an entropy + * source. This function can be called multiple times before @receive_entropy + * is invoked with different values of @receive_entropy and @opaque. The + * backend will queue each request and handle appropriately. + * + * The backend does not need to pass the full amount of data to @receive_entropy + * but will pass a value greater than 0. + */ +void rng_backend_request_entropy(RngBackend *s, size_t size, + EntropyReceiveFunc *receive_entropy, + void *opaque); + +/** + * rng_backend_free_request: + * @s: the backend that created the request + * @req: the request to finalize + * + * Used by child rng backend classes to finalize requests once they've been + * processed. The request is removed from the list of active requests and + * deleted. + */ +void rng_backend_finalize_request(RngBackend *s, RngRequest *req); +#endif diff --git a/include/system/rtc.h b/include/system/rtc.h new file mode 100644 index 0000000..cde83fa --- /dev/null +++ b/include/system/rtc.h @@ -0,0 +1,58 @@ +/* + * RTC configuration and clock read + * + * Copyright (c) 2003-2021 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef SYSTEM_RTC_H +#define SYSTEM_RTC_H + +/** + * qemu_get_timedate: Get the current RTC time + * @tm: struct tm to fill in with RTC time + * @offset: offset in seconds to adjust the RTC time by before + * converting to struct tm format. + * + * This function fills in @tm with the current RTC time, as adjusted + * by @offset (for example, if @offset is 3600 then the returned time/date + * will be one hour further ahead than the current RTC time). + * + * The usual use is by RTC device models, which should call this function + * to find the time/date value that they should return to the guest + * when it reads the RTC registers. + * + * The behaviour of the clock whose value this function returns will + * depend on the -rtc command line option passed by the user. + */ +void qemu_get_timedate(struct tm *tm, time_t offset); + +/** + * qemu_timedate_diff: Return difference between a struct tm and the RTC + * @tm: struct tm containing the date/time to compare against + * + * Returns the difference in seconds between the RTC clock time + * and the date/time specified in @tm. For example, if @tm specifies + * a timestamp one hour further ahead than the current RTC time + * then this function will return 3600. + */ +time_t qemu_timedate_diff(struct tm *tm); + +#endif diff --git a/include/system/runstate-action.h b/include/system/runstate-action.h new file mode 100644 index 0000000..db4e309 --- /dev/null +++ b/include/system/runstate-action.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2020 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef RUNSTATE_ACTION_H +#define RUNSTATE_ACTION_H + +#include "qapi/qapi-commands-run-state.h" + +/* in system/runstate-action.c */ +extern RebootAction reboot_action; +extern ShutdownAction shutdown_action; +extern PanicAction panic_action; + +#endif /* RUNSTATE_ACTION_H */ diff --git a/include/system/runstate.h b/include/system/runstate.h new file mode 100644 index 0000000..fdd5c4a --- /dev/null +++ b/include/system/runstate.h @@ -0,0 +1,119 @@ +#ifndef SYSTEM_RUNSTATE_H +#define SYSTEM_RUNSTATE_H + +#include "qapi/qapi-types-run-state.h" +#include "qemu/notify.h" + +bool runstate_check(RunState state); +void runstate_set(RunState new_state); +RunState runstate_get(void); +bool runstate_is_running(void); +bool runstate_needs_reset(void); +void runstate_replay_enable(void); + +typedef void VMChangeStateHandler(void *opaque, bool running, RunState state); +typedef int VMChangeStateHandlerWithRet(void *opaque, bool running, RunState state); + +VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, + void *opaque); +VMChangeStateEntry *qemu_add_vm_change_state_handler_prio( + VMChangeStateHandler *cb, void *opaque, int priority); +VMChangeStateEntry * +qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb, + VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, + void *opaque, int priority); +VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev, + VMChangeStateHandler *cb, + VMChangeStateHandlerWithRet *cb_ret, + void *opaque); +VMChangeStateEntry *qdev_add_vm_change_state_handler_full( + DeviceState *dev, VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque); +void qemu_del_vm_change_state_handler(VMChangeStateEntry *e); +/** + * vm_state_notify: Notify the state of the VM + * + * @running: whether the VM is 
running or not. + * @state: the #RunState of the VM. + * + * Return the result of the callback which has return value. + * If no callback has return value, still return 0 and the + * upper layer should not do additional processing. + */ +int vm_state_notify(bool running, RunState state); + +static inline bool shutdown_caused_by_guest(ShutdownCause cause) +{ + return cause >= SHUTDOWN_CAUSE_GUEST_SHUTDOWN; +} + +/* + * In a "live" state, the vcpu clock is ticking, and the runstate notifiers + * think we are running. + */ +static inline bool runstate_is_live(RunState state) +{ + return state == RUN_STATE_RUNNING || state == RUN_STATE_SUSPENDED; +} + +void vm_start(void); + +/** + * vm_prepare_start: Prepare for starting/resuming the VM + * + * @step_pending: whether any of the CPUs is about to be single-stepped by gdb + */ +int vm_prepare_start(bool step_pending); + +/** + * vm_resume: If @state is a live state, start the vm and set the state, + * else just set the state. + * + * @state: the state to restore + */ +void vm_resume(RunState state); + +int vm_stop(RunState state); +int vm_stop_force_state(RunState state); +int vm_shutdown(void); +void vm_set_suspended(bool suspended); +bool vm_get_suspended(void); + +typedef enum WakeupReason { + /* Always keep QEMU_WAKEUP_REASON_NONE = 0 */ + QEMU_WAKEUP_REASON_NONE = 0, + QEMU_WAKEUP_REASON_RTC, + QEMU_WAKEUP_REASON_PMTIMER, + QEMU_WAKEUP_REASON_OTHER, +} WakeupReason; + +void qemu_system_reset_request(ShutdownCause reason); +void qemu_system_suspend_request(void); +void qemu_register_suspend_notifier(Notifier *notifier); +bool qemu_wakeup_suspend_enabled(void); +void qemu_system_wakeup_request(WakeupReason reason, Error **errp); +void qemu_system_wakeup_enable(WakeupReason reason, bool enabled); +void qemu_register_wakeup_notifier(Notifier *notifier); +void qemu_register_wakeup_support(void); +void qemu_system_shutdown_request_with_code(ShutdownCause reason, + int exit_code); +void qemu_system_shutdown_request(ShutdownCause reason); +void qemu_system_powerdown_request(void); +void qemu_register_powerdown_notifier(Notifier *notifier); +void qemu_register_shutdown_notifier(Notifier *notifier); +void qemu_system_debug_request(void); +void qemu_system_vmstop_request(RunState reason); +void qemu_system_vmstop_request_prepare(void); +bool qemu_vmstop_requested(RunState *r); +ShutdownCause qemu_shutdown_requested_get(void); +ShutdownCause qemu_reset_requested_get(void); +void qemu_system_killed(int signal, pid_t pid); +void qemu_system_reset(ShutdownCause reason); +void qemu_system_guest_panicked(GuestPanicInformation *info); +void qemu_system_guest_crashloaded(GuestPanicInformation *info); +void qemu_system_guest_pvshutdown(void); +bool qemu_system_dump_in_progress(void); + +#endif + diff --git a/include/system/seccomp.h b/include/system/seccomp.h new file mode 100644 index 0000000..fe85989 --- /dev/null +++ b/include/system/seccomp.h @@ -0,0 +1,26 @@ +/* + * QEMU seccomp mode 2 support with libseccomp + * + * Copyright IBM, Corp. 2012 + * + * Authors: + * Eduardo Otubo <eotubo@br.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ +#ifndef QEMU_SECCOMP_H +#define QEMU_SECCOMP_H + +#define QEMU_SECCOMP_SET_DEFAULT (1 << 0) +#define QEMU_SECCOMP_SET_OBSOLETE (1 << 1) +#define QEMU_SECCOMP_SET_PRIVILEGED (1 << 2) +#define QEMU_SECCOMP_SET_SPAWN (1 << 3) +#define QEMU_SECCOMP_SET_RESOURCECTL (1 << 4) + +int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp); + +#endif diff --git a/include/system/spdm-socket.h b/include/system/spdm-socket.h new file mode 100644 index 0000000..5d8bd9a --- /dev/null +++ b/include/system/spdm-socket.h @@ -0,0 +1,74 @@ +/* + * QEMU SPDM socket support + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef SPDM_REQUESTER_H +#define SPDM_REQUESTER_H + +/** + * spdm_socket_connect: connect to an external SPDM socket + * @port: port to connect to + * @errp: error object handle + * + * This will connect to an external SPDM socket server. On error + * it will return -1 and errp will be set. On success this function + * will return the socket number. + */ +int spdm_socket_connect(uint16_t port, Error **errp); + +/** + * spdm_socket_rsp: send and receive a message to a SPDM server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * @req: request buffer + * @req_len: request buffer length + * @rsp: response buffer + * @rsp_len: response buffer length + * + * Send platform data to a SPDM server on socket and then receive + * a response. + */ +uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type, + void *req, uint32_t req_len, + void *rsp, uint32_t rsp_len); + +/** + * spdm_socket_close: send a shutdown command to the server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * + * This will issue a shutdown command to the server. 
+ */ +void spdm_socket_close(const int socket, uint32_t transport_type); + +#define SPDM_SOCKET_COMMAND_NORMAL 0x0001 +#define SPDM_SOCKET_COMMAND_OOB_ENCAP_KEY_UPDATE 0x8001 +#define SPDM_SOCKET_COMMAND_CONTINUE 0xFFFD +#define SPDM_SOCKET_COMMAND_SHUTDOWN 0xFFFE +#define SPDM_SOCKET_COMMAND_UNKOWN 0xFFFF +#define SPDM_SOCKET_COMMAND_TEST 0xDEAD + +#define SPDM_SOCKET_TRANSPORT_TYPE_MCTP 0x01 +#define SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE 0x02 + +#define SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE 0x1200 + +#endif diff --git a/include/system/stats.h b/include/system/stats.h new file mode 100644 index 0000000..42c236c --- /dev/null +++ b/include/system/stats.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#ifndef STATS_H +#define STATS_H + +#include "qapi/qapi-types-stats.h" + +typedef void StatRetrieveFunc(StatsResultList **result, StatsTarget target, + strList *names, strList *targets, Error **errp); +typedef void SchemaRetrieveFunc(StatsSchemaList **result, Error **errp); + +/* + * Register callbacks for the QMP query-stats command. + * + * @provider: stats provider checked against QMP command arguments + * @stats_fn: routine to query stats: + * @schema_fn: routine to query stat schemas: + */ +void add_stats_callbacks(StatsProvider provider, + StatRetrieveFunc *stats_fn, + SchemaRetrieveFunc *schemas_fn); + +/* + * Helper routines for adding stats entries to the results lists. + */ +void add_stats_entry(StatsResultList **, StatsProvider, const char *id, + StatsList *stats_list); +void add_stats_schema(StatsSchemaList **, StatsProvider, StatsTarget, + StatsSchemaValueList *); + +/* + * True if a string matches the filter passed to the stats_fn callback, + * false otherwise. + * + * Note that an empty list means no filtering, i.e. all strings will + * return true. + */ +bool apply_str_list_filter(const char *string, strList *list); + +#endif /* STATS_H */ diff --git a/include/system/system.h b/include/system/system.h new file mode 100644 index 0000000..a7effe7 --- /dev/null +++ b/include/system/system.h @@ -0,0 +1,125 @@ +#ifndef SYSTEM_H +#define SYSTEM_H +/* Misc. things related to the system emulator. */ + +#include "qemu/timer.h" +#include "qemu/notify.h" +#include "qemu/uuid.h" + +/* vl.c */ + +extern int only_migratable; +extern const char *qemu_name; +extern QemuUUID qemu_uuid; +extern bool qemu_uuid_set; + +const char *qemu_get_vm_name(void); + +/* Exit notifiers will run with BQL held. 
*/ +void qemu_add_exit_notifier(Notifier *notify); +void qemu_remove_exit_notifier(Notifier *notify); + +void qemu_add_machine_init_done_notifier(Notifier *notify); +void qemu_remove_machine_init_done_notifier(Notifier *notify); + +void configure_rtc(QemuOpts *opts); + +void qemu_init_subsystems(void); + +extern int autostart; + +typedef enum { + VGA_NONE, VGA_STD, VGA_CIRRUS, VGA_VMWARE, VGA_XENFB, VGA_QXL, + VGA_TCX, VGA_CG3, VGA_DEVICE, VGA_VIRTIO, + VGA_TYPE_MAX, +} VGAInterfaceType; + +extern int vga_interface_type; +extern bool vga_interface_created; + +extern int graphic_width; +extern int graphic_height; +extern int graphic_depth; +extern int display_opengl; +extern const char *keyboard_layout; +extern int old_param; +extern uint8_t *boot_splash_filedata; +extern bool enable_cpu_pm; +extern QEMUClockType rtc_clock; + +typedef enum { + MLOCK_OFF = 0, + MLOCK_ON, + MLOCK_ON_FAULT, +} MlockState; + +bool should_mlock(MlockState); +bool is_mlock_on_fault(MlockState); + +extern MlockState mlock_state; + +#define MAX_OPTION_ROMS 16 +typedef struct QEMUOptionRom { + const char *name; + int32_t bootindex; +} QEMUOptionRom; +extern QEMUOptionRom option_rom[MAX_OPTION_ROMS]; +extern int nb_option_roms; + +#define MAX_PROM_ENVS 128 +extern const char *prom_envs[MAX_PROM_ENVS]; +extern unsigned int nb_prom_envs; + +/* serial ports */ + +/* Return the Chardev for serial port i, or NULL if none */ +Chardev *serial_hd(int i); + +/* parallel ports */ + +#define MAX_PARALLEL_PORTS 3 + +extern Chardev *parallel_hds[MAX_PARALLEL_PORTS]; + +void add_boot_device_path(int32_t bootindex, DeviceState *dev, + const char *suffix); +char *get_boot_devices_list(size_t *size); + +DeviceState *get_boot_device(uint32_t position); +void check_boot_index(int32_t bootindex, Error **errp); +void del_boot_device_path(DeviceState *dev, const char *suffix); +void device_add_bootindex_property(Object *obj, int32_t *bootindex, + const char *name, const char *suffix, + DeviceState *dev); +void restore_boot_order(void *opaque); +void validate_bootdevices(const char *devices, Error **errp); +void add_boot_device_lchs(DeviceState *dev, const char *suffix, + uint32_t lcyls, uint32_t lheads, uint32_t lsecs); +void del_boot_device_lchs(DeviceState *dev, const char *suffix); +char *get_boot_devices_lchs_list(size_t *size); + +/* handler to set the boot_device order for a specific type of MachineClass */ +typedef void QEMUBootSetHandler(void *opaque, const char *boot_order, + Error **errp); +void qemu_register_boot_set(QEMUBootSetHandler *func, void *opaque); +void qemu_boot_set(const char *boot_order, Error **errp); + +bool defaults_enabled(void); + +void qemu_init(int argc, char **argv); +int qemu_main_loop(void); +void qemu_cleanup(int); + +extern QemuOptsList qemu_legacy_drive_opts; +extern QemuOptsList qemu_common_drive_opts; +extern QemuOptsList qemu_drive_opts; +extern QemuOptsList bdrv_runtime_opts; +extern QemuOptsList qemu_chardev_opts; +extern QemuOptsList qemu_device_opts; +extern QemuOptsList qemu_netdev_opts; +extern QemuOptsList qemu_nic_opts; +extern QemuOptsList qemu_net_opts; +extern QemuOptsList qemu_global_opts; +extern QemuOptsList qemu_semihosting_config_opts; + +#endif diff --git a/include/system/tcg.h b/include/system/tcg.h new file mode 100644 index 0000000..7622dce --- /dev/null +++ b/include/system/tcg.h @@ -0,0 +1,28 @@ +/* + * QEMU TCG support + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +/* header to be included in non-TCG-specific code */ + +#ifndef SYSTEM_TCG_H +#define SYSTEM_TCG_H + +#ifdef CONFIG_TCG +extern bool tcg_allowed; +#define tcg_enabled() (tcg_allowed) +#else +#define tcg_enabled() 0 +#endif + +/** + * qemu_tcg_mttcg_enabled: + * Check whether we are running MultiThread TCG or not. + * + * Returns: %true if we are in MTTCG mode %false otherwise. + */ +bool qemu_tcg_mttcg_enabled(void); + +#endif diff --git a/include/system/tpm.h b/include/system/tpm.h new file mode 100644 index 0000000..1ee568b --- /dev/null +++ b/include/system/tpm.h @@ -0,0 +1,94 @@ +/* + * Public TPM functions + * + * Copyright (C) 2011-2013 IBM Corporation + * + * Authors: + * Stefan Berger <stefanb@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_TPM_H +#define QEMU_TPM_H + +#include "qapi/qapi-types-tpm.h" +#include "qom/object.h" + +#ifdef CONFIG_TPM + +int tpm_config_parse(QemuOptsList *opts_list, const char *optstr); +int tpm_init(void); +void tpm_cleanup(void); + +typedef enum TPMVersion { + TPM_VERSION_UNSPEC = 0, + TPM_VERSION_1_2 = 1, + TPM_VERSION_2_0 = 2, +} TPMVersion; + +#define TYPE_TPM_IF "tpm-if" +typedef struct TPMIfClass TPMIfClass; +DECLARE_CLASS_CHECKERS(TPMIfClass, TPM_IF, + TYPE_TPM_IF) +#define TPM_IF(obj) \ + INTERFACE_CHECK(TPMIf, (obj), TYPE_TPM_IF) + +typedef struct TPMIf TPMIf; + +struct TPMIfClass { + InterfaceClass parent_class; + + enum TpmModel model; + void (*request_completed)(TPMIf *obj, int ret); + enum TPMVersion (*get_version)(TPMIf *obj); +}; + +#define TYPE_TPM_TIS_ISA "tpm-tis" +#define TYPE_TPM_TIS_SYSBUS "tpm-tis-device" +#define TYPE_TPM_CRB "tpm-crb" +#define TYPE_TPM_SPAPR "tpm-spapr" +#define TYPE_TPM_TIS_I2C "tpm-tis-i2c" + +#define TPM_IS_TIS_ISA(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_ISA) +#define TPM_IS_TIS_SYSBUS(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_SYSBUS) +#define TPM_IS_CRB(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_CRB) +#define TPM_IS_SPAPR(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_SPAPR) +#define TPM_IS_TIS_I2C(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_I2C) + +/* returns NULL unless there is exactly one TPM device */ +static inline TPMIf *tpm_find(void) +{ + Object *obj = object_resolve_path_type("", TYPE_TPM_IF, NULL); + + return TPM_IF(obj); +} + +static inline TPMVersion tpm_get_version(TPMIf *ti) +{ + if (!ti) { + return TPM_VERSION_UNSPEC; + } + + return TPM_IF_GET_CLASS(ti)->get_version(ti); +} + +#else /* CONFIG_TPM */ + +#define tpm_init() (0) +#define tpm_cleanup() + +/* needed for an alignment check in non-tpm code */ +static inline Object *TPM_IS_CRB(Object *obj) +{ + return NULL; +} + +#endif /* CONFIG_TPM */ + +#endif /* QEMU_TPM_H */ diff --git a/include/system/tpm_backend.h b/include/system/tpm_backend.h new file mode 100644 index 0000000..01b11f6 --- /dev/null +++ b/include/system/tpm_backend.h @@ -0,0 +1,216 @@ +/* + * QEMU TPM Backend + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Stefan Berger <stefanb@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef TPM_BACKEND_H +#define TPM_BACKEND_H + +#include "qom/object.h" +#include "qemu/option.h" +#include "system/tpm.h" +#include "qapi/error.h" + +#ifdef CONFIG_TPM + +#define TYPE_TPM_BACKEND "tpm-backend" +OBJECT_DECLARE_TYPE(TPMBackend, TPMBackendClass, + TPM_BACKEND) + + +typedef struct TPMBackendCmd { + uint8_t locty; + const uint8_t *in; + uint32_t in_len; + uint8_t *out; + uint32_t out_len; + bool selftest_done; +} TPMBackendCmd; + +struct TPMBackend { + Object parent; + + /*< protected >*/ + TPMIf *tpmif; + bool opened; + bool had_startup_error; + TPMBackendCmd *cmd; + + /* <public> */ + char *id; + + QLIST_ENTRY(TPMBackend) list; +}; + +struct TPMBackendClass { + ObjectClass parent_class; + + enum TpmType type; + const QemuOptDesc *opts; + /* get a descriptive text of the backend to display to the user */ + const char *desc; + + TPMBackend *(*create)(QemuOpts *opts); + + /* start up the TPM on the backend - optional */ + int (*startup_tpm)(TPMBackend *t, size_t buffersize); + + /* optional */ + void (*reset)(TPMBackend *t); + + void (*cancel_cmd)(TPMBackend *t); + + /* optional */ + bool (*get_tpm_established_flag)(TPMBackend *t); + + /* optional */ + int (*reset_tpm_established_flag)(TPMBackend *t, uint8_t locty); + + TPMVersion (*get_tpm_version)(TPMBackend *t); + + size_t (*get_buffer_size)(TPMBackend *t); + + TpmTypeOptions *(*get_tpm_options)(TPMBackend *t); + + void (*handle_request)(TPMBackend *s, TPMBackendCmd *cmd, Error **errp); +}; + +/** + * tpm_backend_get_type: + * @s: the backend + * + * Returns the TpmType of the backend. + */ +enum TpmType tpm_backend_get_type(TPMBackend *s); + +/** + * tpm_backend_init: + * @s: the backend to initialized + * @tpmif: TPM interface + * @datacb: callback for sending data to frontend + * @errp: a pointer to return the #Error object if an error occurs. + * + * Initialize the backend with the given variables. + * + * Returns 0 on success. + */ +int tpm_backend_init(TPMBackend *s, TPMIf *tpmif, Error **errp); + +/** + * tpm_backend_startup_tpm: + * @s: the backend whose TPM support is to be started + * @buffersize: the buffer size the TPM is supposed to use, + * 0 to leave it as-is + * + * Returns 0 on success. + */ +int tpm_backend_startup_tpm(TPMBackend *s, size_t buffersize); + +/** + * tpm_backend_had_startup_error: + * @s: the backend to query for a startup error + * + * Check whether the backend had an error during startup. Returns + * false if no error occurred and the backend can be used, true + * otherwise. + */ +bool tpm_backend_had_startup_error(TPMBackend *s); + +/** + * tpm_backend_deliver_request: + * @s: the backend to send the request to + * @cmd: the command to deliver + * + * Send a request to the backend. The backend will then send the request + * to the TPM implementation. + */ +void tpm_backend_deliver_request(TPMBackend *s, TPMBackendCmd *cmd); + +/** + * tpm_backend_reset: + * @s: the backend to reset + * + * Reset the backend into a well defined state with all previous errors + * reset. + */ +void tpm_backend_reset(TPMBackend *s); + +/** + * tpm_backend_cancel_cmd: + * @s: the backend + * + * Cancel any ongoing command being processed by the TPM implementation + * on behalf of the QEMU guest. + */ +void tpm_backend_cancel_cmd(TPMBackend *s); + +/** + * tpm_backend_get_tpm_established_flag: + * @s: the backend + * + * Get the TPM establishment flag. This function may be called very + * frequently by the frontend since for example in the TIS implementation + * this flag is part of a register. 
+ */ +bool tpm_backend_get_tpm_established_flag(TPMBackend *s); + +/** + * tpm_backend_reset_tpm_established_flag: + * @s: the backend + * @locty: the locality number + * + * Reset the TPM establishment flag. + */ +int tpm_backend_reset_tpm_established_flag(TPMBackend *s, uint8_t locty); + +/** + * tpm_backend_get_tpm_version: + * @s: the backend to call into + * + * Get the TPM Version that is emulated at the backend. + * + * Returns TPMVersion. + */ +TPMVersion tpm_backend_get_tpm_version(TPMBackend *s); + +/** + * tpm_backend_get_buffer_size: + * @s: the backend to call into + * + * Get the TPM's buffer size. + * + * Returns buffer size. + */ +size_t tpm_backend_get_buffer_size(TPMBackend *s); + +/** + * tpm_backend_finish_sync: + * @s: the backend to call into + * + * Finish the pending command synchronously (this will call aio_poll() + * on qemu main AIOContext until it ends) + */ +void tpm_backend_finish_sync(TPMBackend *s); + +/** + * tpm_backend_query_tpm: + * @s: the backend + * + * Query backend tpm info + * + * Returns newly allocated TPMInfo + */ +TPMInfo *tpm_backend_query_tpm(TPMBackend *s); + +TPMBackend *qemu_find_tpm_be(const char *id); + +#endif /* CONFIG_TPM */ + +#endif /* TPM_BACKEND_H */ diff --git a/include/system/tpm_util.h b/include/system/tpm_util.h new file mode 100644 index 0000000..1858693 --- /dev/null +++ b/include/system/tpm_util.h @@ -0,0 +1,72 @@ +/* + * TPM utility functions + * + * Copyright (c) 2010 - 2015 IBM Corporation + * Authors: + * Stefan Berger <stefanb@us.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#ifndef SYSTEM_TPM_UTIL_H +#define SYSTEM_TPM_UTIL_H + +#include "system/tpm.h" +#include "qemu/bswap.h" + +void tpm_util_write_fatal_error_response(uint8_t *out, uint32_t out_len); + +bool tpm_util_is_selftest(const uint8_t *in, uint32_t in_len); + +int tpm_util_test_tpmdev(int tpm_fd, TPMVersion *tpm_version); + +static inline uint16_t tpm_cmd_get_tag(const void *b) +{ + return lduw_be_p(b); +} + +static inline void tpm_cmd_set_tag(void *b, uint16_t tag) +{ + stw_be_p(b, tag); +} + +static inline uint32_t tpm_cmd_get_size(const void *b) +{ + return ldl_be_p(b + 2); +} + +static inline void tpm_cmd_set_size(void *b, uint32_t size) +{ + stl_be_p(b + 2, size); +} + +static inline uint32_t tpm_cmd_get_ordinal(const void *b) +{ + return ldl_be_p(b + 6); +} + +static inline uint32_t tpm_cmd_get_errcode(const void *b) +{ + return ldl_be_p(b + 6); +} + +static inline void tpm_cmd_set_error(void *b, uint32_t error) +{ + stl_be_p(b + 6, error); +} + +void tpm_util_show_buffer(const unsigned char *buffer, + size_t buffer_size, const char *string); + +#endif /* SYSTEM_TPM_UTIL_H */ diff --git a/include/system/vhost-user-backend.h b/include/system/vhost-user-backend.h new file mode 100644 index 0000000..5634ebd --- /dev/null +++ b/include/system/vhost-user-backend.h @@ -0,0 +1,48 @@ +/* + * QEMU vhost-user backend + * + * Copyright (C) 2018 Red Hat Inc + * + * Authors: + * Marc-AndrĂ© Lureau <marcandre.lureau@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_VHOST_USER_BACKEND_H +#define QEMU_VHOST_USER_BACKEND_H + +#include "qom/object.h" +#include "system/memory.h" +#include "qemu/option.h" +#include "qemu/bitmap.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-user.h" +#include "chardev/char-fe.h" +#include "io/channel.h" + +#define TYPE_VHOST_USER_BACKEND "vhost-user-backend" +OBJECT_DECLARE_SIMPLE_TYPE(VhostUserBackend, + VHOST_USER_BACKEND) + + + +struct VhostUserBackend { + /* private */ + Object parent; + + char *chr_name; + CharBackend chr; + VhostUserState vhost_user; + struct vhost_dev dev; + VirtIODevice *vdev; + bool started; + bool completed; +}; + +int vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice *vdev, + unsigned nvqs, Error **errp); +void vhost_user_backend_start(VhostUserBackend *b); +int vhost_user_backend_stop(VhostUserBackend *b); + +#endif diff --git a/include/system/watchdog.h b/include/system/watchdog.h new file mode 100644 index 0000000..745c89b --- /dev/null +++ b/include/system/watchdog.h @@ -0,0 +1,32 @@ +/* + * Virtual hardware watchdog. + * + * Copyright (C) 2009 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * By Richard W.M. 
Jones (rjones@redhat.com). + */ + +#ifndef QEMU_WATCHDOG_H +#define QEMU_WATCHDOG_H + +#include "qemu/queue.h" +#include "qapi/qapi-types-run-state.h" + +/* in hw/watchdog.c */ +WatchdogAction get_watchdog_action(void); +void watchdog_perform_action(void); + +#endif /* QEMU_WATCHDOG_H */ diff --git a/include/system/whpx.h b/include/system/whpx.h new file mode 100644 index 0000000..00ff409 --- /dev/null +++ b/include/system/whpx.h @@ -0,0 +1,34 @@ +/* + * QEMU Windows Hypervisor Platform accelerator (WHPX) support + * + * Copyright Microsoft, Corp. 2017 + * + * Authors: + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* header to be included in non-WHPX-specific code */ + +#ifndef QEMU_WHPX_H +#define QEMU_WHPX_H + +#ifdef COMPILING_PER_TARGET + +#ifdef CONFIG_WHPX + +int whpx_enabled(void); +bool whpx_apic_in_platform(void); + +#else /* CONFIG_WHPX */ + +#define whpx_enabled() (0) +#define whpx_apic_in_platform() (0) + +#endif /* CONFIG_WHPX */ + +#endif /* COMPILING_PER_TARGET */ + +#endif /* QEMU_WHPX_H */ diff --git a/include/system/xen-mapcache.h b/include/system/xen-mapcache.h new file mode 100644 index 0000000..bb454a7 --- /dev/null +++ b/include/system/xen-mapcache.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2011 Citrix Ltd. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef XEN_MAPCACHE_H +#define XEN_MAPCACHE_H + +#include "exec/cpu-common.h" +#include "system/xen.h" + +typedef hwaddr (*phys_offset_to_gaddr_t)(hwaddr phys_offset, + ram_addr_t size); +void xen_map_cache_init(phys_offset_to_gaddr_t f, + void *opaque); +uint8_t *xen_map_cache(MemoryRegion *mr, hwaddr phys_addr, hwaddr size, + ram_addr_t ram_addr_offset, + uint8_t lock, bool dma, + bool is_write); +ram_addr_t xen_ram_addr_from_mapcache(void *ptr); +void xen_invalidate_map_cache_entry(uint8_t *buffer); +void xen_invalidate_map_cache(void); +uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr, + hwaddr new_phys_addr, + hwaddr size); + +#endif /* XEN_MAPCACHE_H */ diff --git a/include/system/xen.h b/include/system/xen.h new file mode 100644 index 0000000..c2f283d --- /dev/null +++ b/include/system/xen.h @@ -0,0 +1,35 @@ +/* + * QEMU Xen support + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +/* header to be included in non-Xen-specific code */ + +#ifndef SYSTEM_XEN_H +#define SYSTEM_XEN_H + +#include "exec/cpu-common.h" + +#ifdef COMPILING_PER_TARGET +# ifdef CONFIG_XEN +# define CONFIG_XEN_IS_POSSIBLE +# endif +#else +# define CONFIG_XEN_IS_POSSIBLE +#endif /* COMPILING_PER_TARGET */ + +#ifdef CONFIG_XEN_IS_POSSIBLE +extern bool xen_allowed; +#define xen_enabled() (xen_allowed) +#else /* !CONFIG_XEN_IS_POSSIBLE */ +#define xen_enabled() 0 +#endif /* CONFIG_XEN_IS_POSSIBLE */ + +void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length); +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, + struct MemoryRegion *mr, Error **errp); +bool xen_mr_is_memory(MemoryRegion *mr); +bool xen_mr_is_grants(MemoryRegion *mr); +#endif |
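
For readers of the reset.h additions above, a minimal usage sketch may help. It is illustrative only and not taken from the diff: MyDeviceState, my_device_reset and the realize/unrealize helpers are hypothetical names, the snippet assumes an ordinary QEMU build tree providing "qemu/osdep.h", and the header itself recommends DeviceState reset methods for new device code.

#include "qemu/osdep.h"
#include "system/reset.h"

typedef struct MyDeviceState {
    int irq_level;          /* hypothetical guest-visible state */
} MyDeviceState;

/* Runs in the 'hold' phase of the three-phase reset (see reset.h). */
static void my_device_reset(void *opaque)
{
    MyDeviceState *s = opaque;

    s->irq_level = 0;
}

/* Caller is expected to hold the BQL, as documented in reset.h. */
static void my_device_realize(MyDeviceState *s)
{
    qemu_register_reset(my_device_reset, s);
}

static void my_device_unrealize(MyDeviceState *s)
{
    /* func/opaque must match the pair passed to qemu_register_reset(). */
    qemu_unregister_reset(my_device_reset, s);
}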
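
The rng.h backend interface can be exercised roughly as below. This is a sketch under the assumption of a hypothetical frontend (MyRngFrontend, my_rng_receive) that already holds a pointer to an opened RngBackend; only the declarations shown in rng.h above are relied on.

#include "qemu/osdep.h"
#include "system/rng.h"

typedef struct MyRngFrontend {
    RngBackend *backend;    /* hypothetical: set up elsewhere */
} MyRngFrontend;

/*
 * Called by the backend once entropy is available. Per rng.h, fewer bytes
 * than requested may be delivered, but always more than zero.
 */
static void my_rng_receive(void *opaque, const void *data, size_t size)
{
    MyRngFrontend *f = opaque;

    (void)f;
    /* Copy 'size' bytes from 'data' into the guest-visible FIFO here. */
}

static void my_rng_poll(MyRngFrontend *f)
{
    /* Multiple outstanding requests are allowed; the backend queues them. */
    rng_backend_request_entropy(f->backend, 64, my_rng_receive, f);
}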
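
The rtc.h helpers translate to device-model code along these lines; my_rtc_update, my_rtc_set and the register latching are hypothetical, while the two function calls use the signatures declared above.

#include "qemu/osdep.h"
#include "system/rtc.h"

/*
 * Hypothetical RTC model helper: fetch the current guest RTC time,
 * shifted by a per-device offset in seconds, and latch it into registers.
 */
static void my_rtc_update(time_t offset_seconds)
{
    struct tm now;

    qemu_get_timedate(&now, offset_seconds);
    /*
     * now.tm_hour, now.tm_min, now.tm_sec would be written to the
     * device's time registers here.
     */
}

/* Inverse direction: the guest wrote a new time, remember it as an offset. */
static time_t my_rtc_set(struct tm *guest_time)
{
    return qemu_timedate_diff(guest_time);
}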
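
Finally, the runstate.h change-state hooks are typically consumed as sketched here; the handler body and the my_* names are hypothetical, and only qemu_add_vm_change_state_handler()/qemu_del_vm_change_state_handler() as declared above are assumed.

#include "qemu/osdep.h"
#include "system/runstate.h"

static VMChangeStateEntry *my_vmstate_entry;

/* Invoked whenever the VM transitions between running and stopped. */
static void my_vm_state_change(void *opaque, bool running, RunState state)
{
    if (running) {
        /* resume DMA, timers, etc. */
    } else {
        /* quiesce the device; 'state' says why the VM stopped */
    }
}

static void my_register(void)
{
    my_vmstate_entry = qemu_add_vm_change_state_handler(my_vm_state_change,
                                                        NULL);
}

static void my_unregister(void)
{
    qemu_del_vm_change_state_handler(my_vmstate_entry);
}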