178 files changed, 10217 insertions, 1324 deletions
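This series wires a new MSHV (Microsoft Hypervisor) accelerator into QEMU. The accel/accel-irq.c shim added below lets shared device code target either the KVM or the MSHV irqchip at runtime. As a reading aid, here is a minimal sketch of how a caller might drive that API; the accel_irqchip_* names come from this patch, while the surrounding function and the use of kvm_irqchip_begin_route_changes() are illustrative assumptions, not code from this series:

/*
 * Hypothetical MSI-X wiring through the accel_irqchip_* wrappers
 * introduced in accel/accel-irq.c. Sketch only: "dev" and "notifier"
 * are assumed to exist in the caller.
 */
static int example_wire_vector(PCIDevice *dev, EventNotifier *notifier)
{
    /* no-op bookkeeping under MSHV; real route batching under KVM */
    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
    int virq;

    /* allocate a GSI route for MSI vector 0 of this device */
    virq = accel_irqchip_add_msi_route(&c, 0, dev);
    if (virq < 0) {
        return virq;
    }
    accel_irqchip_commit_route_changes(&c);

    /* let the hypervisor inject the interrupt straight from the eventfd */
    return accel_irqchip_add_irqfd_notifier_gsi(notifier, NULL, virq);
}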
diff --git a/.gitlab-ci.d/custom-runners.yml b/.gitlab-ci.d/custom-runners.yml index 2d493f7..3eb8216 100644 --- a/.gitlab-ci.d/custom-runners.yml +++ b/.gitlab-ci.d/custom-runners.yml @@ -29,6 +29,6 @@ junit: build/meson-logs/*.junit.xml include: - - local: '/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml' - - local: '/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml' - - local: '/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml' + - local: '/.gitlab-ci.d/custom-runners/ubuntu-24.04-s390x.yml' + - local: '/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch64.yml' + - local: '/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch32.yml' diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml b/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch32.yml index 8727687..75029c9 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch32.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch32.yml @@ -1,13 +1,13 @@ -# All ubuntu-22.04 jobs should run successfully in an environment +# All ubuntu-24.04 jobs should run successfully in an environment # setup by the scripts/ci/setup/ubuntu/build-environment.yml task -# "Install basic packages to build QEMU on Ubuntu 22.04" +# "Install basic packages to build QEMU on Ubuntu 24.04" -ubuntu-22.04-aarch32-all: +ubuntu-24.04-aarch32-all: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch32 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml b/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch64.yml index ca2f140..d26c782 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-aarch64.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-24.04-aarch64.yml @@ -1,13 +1,13 @@ -# All ubuntu-22.04 jobs should run successfully in an environment +# All ubuntu-24.04 jobs should run successfully in an environment # setup by the scripts/ci/setup/ubuntu/build-environment.yml task -# "Install basic packages to build QEMU on Ubuntu 22.04" +# "Install basic packages to build QEMU on Ubuntu 24.04" -ubuntu-22.04-aarch64-all-linux-static: +ubuntu-24.04-aarch64-all-linux-static: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -23,12 +23,12 @@ ubuntu-22.04-aarch64-all-linux-static: - make check-tcg - make --output-sync -j`nproc --ignore=40` check -ubuntu-22.04-aarch64-all: +ubuntu-24.04-aarch64-all: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -45,12 +45,12 @@ ubuntu-22.04-aarch64-all: - make --output-sync -j`nproc --ignore=40` - make --output-sync -j`nproc --ignore=40` check -ubuntu-22.04-aarch64-without-defaults: +ubuntu-24.04-aarch64-without-defaults: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -67,12 +67,12 @@ ubuntu-22.04-aarch64-without-defaults: - make --output-sync -j`nproc --ignore=40` - make --output-sync -j`nproc --ignore=40` check -ubuntu-22.04-aarch64-alldbg: +ubuntu-24.04-aarch64-alldbg: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && 
$CI_COMMIT_BRANCH =~ /^staging/' @@ -86,12 +86,12 @@ ubuntu-22.04-aarch64-alldbg: - make --output-sync -j`nproc --ignore=40` - make --output-sync -j`nproc --ignore=40` check -ubuntu-22.04-aarch64-clang: +ubuntu-24.04-aarch64-clang: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -108,11 +108,11 @@ ubuntu-22.04-aarch64-clang: - make --output-sync -j`nproc --ignore=40` - make --output-sync -j`nproc --ignore=40` check -ubuntu-22.04-aarch64-tci: +ubuntu-24.04-aarch64-tci: needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -128,12 +128,12 @@ ubuntu-22.04-aarch64-tci: || { cat config.log meson-logs/meson-log.txt; exit 1; } - make --output-sync -j`nproc --ignore=40` -ubuntu-22.04-aarch64-notcg: +ubuntu-24.04-aarch64-notcg: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - aarch64 rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml b/.gitlab-ci.d/custom-runners/ubuntu-24.04-s390x.yml index e62ff17..45dbee1 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-24.04-s390x.yml @@ -1,13 +1,13 @@ -# All ubuntu-22.04 jobs should run successfully in an environment +# All ubuntu-24.04 jobs should run successfully in an environment # setup by the scripts/ci/setup/ubuntu/build-environment.yml task -# "Install basic packages to build QEMU on Ubuntu 22.04" +# "Install basic packages to build QEMU on Ubuntu 24.04" -ubuntu-22.04-s390x-all-linux: +ubuntu-24.04-s390x-all-linux: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -21,12 +21,12 @@ ubuntu-22.04-s390x-all-linux: - make --output-sync check-tcg - make --output-sync -j`nproc` check -ubuntu-22.04-s390x-all-system: +ubuntu-24.04-s390x-all-system: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x timeout: 75m rules: @@ -42,12 +42,12 @@ ubuntu-22.04-s390x-all-system: - make --output-sync -j`nproc` - make --output-sync -j`nproc` check -ubuntu-22.04-s390x-alldbg: +ubuntu-24.04-s390x-alldbg: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -65,12 +65,12 @@ ubuntu-22.04-s390x-alldbg: - make --output-sync -j`nproc` - make --output-sync -j`nproc` check -ubuntu-22.04-s390x-clang: +ubuntu-24.04-s390x-clang: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -87,11 +87,11 @@ ubuntu-22.04-s390x-clang: - make --output-sync -j`nproc` - make --output-sync -j`nproc` check -ubuntu-22.04-s390x-tci: +ubuntu-24.04-s390x-tci: needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' @@ -107,12 +107,12 @@ ubuntu-22.04-s390x-tci: || { cat config.log meson-logs/meson-log.txt; exit 1; } - make --output-sync -j`nproc` 
-ubuntu-22.04-s390x-notcg: +ubuntu-24.04-s390x-notcg: extends: .custom_runner_template needs: [] stage: build tags: - - ubuntu_22.04 + - ubuntu_24.04 - s390x rules: - if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/' diff --git a/.gitmodules b/.gitmodules index e27dfe8..c307216 100644 --- a/.gitmodules +++ b/.gitmodules @@ -15,7 +15,8 @@ url = https://gitlab.com/qemu-project/qemu-palcode.git [submodule "roms/u-boot"] path = roms/u-boot - url = https://gitlab.com/qemu-project-mirrors/u-boot.git + # upstream is https://github.com/u-boot/u-boot + url = https://gitlab.com/qemu-project/u-boot.git [submodule "roms/skiboot"] path = roms/skiboot url = https://gitlab.com/qemu-project/skiboot.git @@ -27,7 +28,8 @@ url = https://gitlab.com/qemu-project/seabios-hppa.git [submodule "roms/u-boot-sam460ex"] path = roms/u-boot-sam460ex - url = https://gitlab.com/qemu-project-mirrors/u-boot-sam460ex.git + # upstream is https://github.com/zbalaton/u-boot-sam460ex + url = https://gitlab.com/qemu-project/u-boot-sam460ex.git [submodule "roms/edk2"] path = roms/edk2 url = https://gitlab.com/qemu-project/edk2.git diff --git a/.gitpublish b/.gitpublish index a13f8c7..a3adb21 100644 --- a/.gitpublish +++ b/.gitpublish @@ -4,48 +4,48 @@ # See https://github.com/stefanha/git-publish for more information # [gitpublishprofile "default"] -base = master +base = origin/master to = qemu-devel@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "rfc"] -base = master +base = origin/master prefix = RFC PATCH to = qemu-devel@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "stable"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-stable@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "trivial"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-trivial@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "block"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-block@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "arm"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-arm@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "s390"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-s390@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null [gitpublishprofile "ppc"] -base = master +base = origin/master to = qemu-devel@nongnu.org cc = qemu-ppc@nongnu.org cccmd = scripts/get_maintainer.pl --noroles --norolestats --nogit --nogit-fallback 2>/dev/null diff --git a/MAINTAINERS b/MAINTAINERS index 128f90b..84cfd85 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -551,6 +551,21 @@ F: target/i386/whpx/ F: accel/stubs/whpx-stub.c F: include/system/whpx.h +MSHV +M: Magnus Kulke <magnus.kulke@linux.microsoft.com> +R: Wei Liu <wei.liu@kernel.org> +S: Supported +F: accel/mshv/ +F: include/system/mshv.h +F: include/hw/hyperv/hvgdk*.h +F: include/hw/hyperv/hvhdk*.h + +X86 MSHV CPUs +M: Magnus Kulke <magnus.kulke@linux.microsoft.com> +R: Wei Liu <wei.liu@kernel.org> +S: Supported +F: target/i386/mshv/ + 
X86 Instruction Emulator M: Cameron Esfahani <dirty@apple.com> M: Roman Bolshakov <rbolshakov@ddn.com> diff --git a/accel/Kconfig b/accel/Kconfig index 4263cab..a60f114 100644 --- a/accel/Kconfig +++ b/accel/Kconfig @@ -13,6 +13,9 @@ config TCG config KVM bool +config MSHV + bool + config XEN bool select FSDEV_9P if VIRTFS diff --git a/accel/accel-irq.c b/accel/accel-irq.c new file mode 100644 index 0000000..7f864e3 --- /dev/null +++ b/accel/accel-irq.c @@ -0,0 +1,106 @@ +/* + * Accelerated irqchip abstraction + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/pci/msi.h" + +#include "system/kvm.h" +#include "system/mshv.h" +#include "system/accel-irq.h" + +int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_add_msi_route(vector, dev); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_add_msi_route(c, vector, dev); + } + return -ENOSYS; +} + +int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_update_msi_route(vector, msg, dev); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_update_msi_route(kvm_state, vector, msg, dev); + } + return -ENOSYS; +} + +void accel_irqchip_commit_route_changes(KVMRouteChange *c) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_commit_routes(); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_commit_route_changes(c); + } +} + +void accel_irqchip_commit_routes(void) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_commit_routes(); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_commit_routes(kvm_state); + } +} + +void accel_irqchip_release_virq(int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_release_virq(virq); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_release_virq(kvm_state, virq); + } +} + +int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn, + int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_add_irqfd_notifier_gsi(n, rn, virq); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, rn, virq); + } + return -ENOSYS; +} + +int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_remove_irqfd_notifier_gsi(n, virq); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, virq); + } + return -ENOSYS; +} diff --git a/accel/meson.build b/accel/meson.build index 25b0f10..983dfd0 100644 --- a/accel/meson.build +++ b/accel/meson.build @@ -1,6 +1,6 @@ common_ss.add(files('accel-common.c')) specific_ss.add(files('accel-target.c')) -system_ss.add(files('accel-system.c', 'accel-blocker.c', 'accel-qmp.c')) +system_ss.add(files('accel-system.c', 'accel-blocker.c', 'accel-qmp.c', 'accel-irq.c')) user_ss.add(files('accel-user.c')) subdir('tcg') @@ -10,6 +10,7 @@ if have_system subdir('kvm') subdir('xen') subdir('stubs') + subdir('mshv') endif # qtest diff --git a/accel/mshv/irq.c b/accel/mshv/irq.c new file mode 100644 index 0000000..adf8f33 --- 
/dev/null +++ b/accel/mshv/irq.c @@ -0,0 +1,399 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Stanislav Kinsburskii <skinsburskii@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "linux/mshv.h" +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "hw/intc/ioapic.h" +#include "hw/pci/msi.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "trace.h" +#include <stdint.h> +#include <sys/ioctl.h> + +#define MSHV_IRQFD_RESAMPLE_FLAG (1 << MSHV_IRQFD_BIT_RESAMPLE) +#define MSHV_IRQFD_BIT_DEASSIGN_FLAG (1 << MSHV_IRQFD_BIT_DEASSIGN) + +static MshvMsiControl *msi_control; +static QemuMutex msi_control_mutex; + +void mshv_init_msicontrol(void) +{ + qemu_mutex_init(&msi_control_mutex); + msi_control = g_new0(MshvMsiControl, 1); + msi_control->gsi_routes = g_hash_table_new(g_direct_hash, g_direct_equal); + msi_control->updated = false; +} + +static int set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + GHashTable *gsi_routes; + + trace_mshv_set_msi_routing(gsi, addr, data); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("gsi >= MSHV_MAX_MSI_ROUTES"); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + + if (entry + && entry->address_hi == high_addr + && entry->address_lo == low_addr + && entry->data == data) + { + /* nothing to update */ + return 0; + } + + /* free old entry */ + g_free(entry); + + /* create new entry */ + entry = g_new0(struct mshv_user_irq_entry, 1); + entry->gsi = gsi; + entry->address_hi = high_addr; + entry->address_lo = low_addr; + entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), entry); + msi_control->updated = true; + } + + return 0; +} + +static int add_msi_routing(uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *route_entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + int gsi; + GHashTable *gsi_routes; + + trace_mshv_add_msi_routing(addr, data); + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + /* find an empty slot */ + gsi = 0; + gsi_routes = msi_control->gsi_routes; + while (gsi < MSHV_MAX_MSI_ROUTES) { + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (!route_entry) { + break; + } + gsi++; + } + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("No empty gsi slot available"); + return -1; + } + + /* create new entry */ + route_entry = g_new0(struct mshv_user_irq_entry, 1); + route_entry->gsi = gsi; + route_entry->address_hi = high_addr; + route_entry->address_lo = low_addr; + route_entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), route_entry); + msi_control->updated = true; + } + + return gsi; +} + +static int commit_msi_routing_table(int vm_fd) +{ + guint len; + int i, ret; + size_t table_size; + struct mshv_user_irq_table *table; + GHashTableIter iter; + gpointer key, value; + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + if (!msi_control->updated) { + /* nothing to update */ + return 0; + } + + /* Calculate the size of the table */ + len = 
g_hash_table_size(msi_control->gsi_routes); + table_size = sizeof(struct mshv_user_irq_table) + + len * sizeof(struct mshv_user_irq_entry); + table = g_malloc0(table_size); + + g_hash_table_iter_init(&iter, msi_control->gsi_routes); + i = 0; + while (g_hash_table_iter_next(&iter, &key, &value)) { + struct mshv_user_irq_entry *entry = value; + table->entries[i] = *entry; + i++; + } + table->nr = i; + + trace_mshv_commit_msi_routing_table(vm_fd, len); + + ret = ioctl(vm_fd, MSHV_SET_MSI_ROUTING, table); + g_free(table); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + return -1; + } + msi_control->updated = false; + } + return 0; +} + +static int remove_msi_routing(uint32_t gsi) +{ + struct mshv_user_irq_entry *route_entry; + GHashTable *gsi_routes; + + trace_mshv_remove_msi_routing(gsi); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("Invalid GSI: %u", gsi); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (route_entry) { + g_hash_table_remove(gsi_routes, GINT_TO_POINTER(gsi)); + g_free(route_entry); + msi_control->updated = true; + } + } + + return 0; +} + +/* Pass an eventfd which is to be used for injecting interrupts from userland */ +static int irqfd(int vm_fd, int fd, int resample_fd, uint32_t gsi, + uint32_t flags) +{ + int ret; + struct mshv_user_irqfd arg = { + .fd = fd, + .resamplefd = resample_fd, + .gsi = gsi, + .flags = flags, + }; + + ret = ioctl(vm_fd, MSHV_IRQFD, &arg); + if (ret < 0) { + error_report("Failed to set irqfd: gsi=%u, fd=%d", gsi, fd); + return -1; + } + return ret; +} + +static int register_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + + trace_mshv_register_irqfd(vm_fd, event_fd, gsi); + + ret = irqfd(vm_fd, event_fd, 0, gsi, 0); + if (ret < 0) { + error_report("Failed to register irqfd: gsi=%u", gsi); + return -1; + } + return 0; +} + +static int register_irqfd_with_resample(int vm_fd, int event_fd, + int resample_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_RESAMPLE_FLAG; + + ret = irqfd(vm_fd, event_fd, resample_fd, gsi, flags); + if (ret < 0) { + error_report("Failed to register irqfd with resample: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int unregister_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_BIT_DEASSIGN_FLAG; + + ret = irqfd(vm_fd, event_fd, 0, gsi, flags); + if (ret < 0) { + error_report("Failed to unregister irqfd: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int irqchip_update_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq, bool add) +{ + int fd = event_notifier_get_fd(event); + int rfd = resample ? 
event_notifier_get_fd(resample) : -1; + int vm_fd = mshv_state->vm; + + trace_mshv_irqchip_update_irqfd_notifier_gsi(fd, rfd, virq, add); + + if (!add) { + return unregister_irqfd(vm_fd, fd, virq); + } + + if (rfd > 0) { + return register_irqfd_with_resample(vm_fd, fd, rfd, virq); + } + + return register_irqfd(vm_fd, fd, virq); +} + + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev) +{ + MSIMessage msg = { 0, 0 }; + int virq = 0; + + if (pci_available && dev) { + msg = pci_get_msi_message(dev, vector); + virq = add_msi_routing(msg.address, le32_to_cpu(msg.data)); + } + + return virq; +} + +void mshv_irqchip_release_virq(int virq) +{ + remove_msi_routing(virq); +} + +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev) +{ + int ret; + + ret = set_msi_routing(virq, msg.address, le32_to_cpu(msg.data)); + if (ret < 0) { + error_report("Failed to set msi routing"); + return -1; + } + + return 0; +} + +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_dest_mode, + bool level_triggered) +{ + int ret; + int vm_fd = mshv_state->vm; + + if (vector == 0) { + warn_report("Ignoring request for interrupt vector 0"); + return 0; + } + + union hv_interrupt_control control = { + .interrupt_type = interrupt_type, + .level_triggered = level_triggered, + .logical_dest_mode = logical_dest_mode, + .rsvd = 0, + }; + + struct hv_input_assert_virtual_interrupt arg = {0}; + arg.control = control; + arg.dest_addr = (uint64_t)vp_index; + arg.vector = vector; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_ASSERT_VIRTUAL_INTERRUPT; + args.in_sz = sizeof(arg); + args.in_ptr = (uint64_t)&arg; + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to request interrupt"); + return -errno; + } + return 0; +} + +void mshv_irqchip_commit_routes(void) +{ + int ret; + int vm_fd = mshv_state->vm; + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + abort(); + } +} + +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, resample, virq, true); +} + +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *event, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, NULL, virq, false); +} + +int mshv_reserve_ioapic_msi_routes(int vm_fd) +{ + int ret, gsi; + + /* + * Reserve GSI 0-23 for IOAPIC pins, to avoid conflicts of legacy + * peripherals with MSI-X devices + */ + for (gsi = 0; gsi < IOAPIC_NUM_PINS; gsi++) { + ret = add_msi_routing(0, 0); + if (ret < 0) { + error_report("Failed to reserve GSI %d", gsi); + return -1; + } + } + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit reserved IOAPIC MSI routes"); + return -1; + } + + return 0; +} diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c new file mode 100644 index 0000000..0e2164a --- /dev/null +++ b/accel/mshv/mem.c @@ -0,0 +1,563 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qemu/lockable.h" +#include "qemu/error-report.h" +#include "qemu/rcu.h" +#include "linux/mshv.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "exec/memattrs.h" +#include <sys/ioctl.h> +#include "trace.h" + +typedef struct SlotsRCUReclaim { + struct rcu_head rcu; + GList *old_head; + MshvMemorySlot *removed_slot; +} SlotsRCUReclaim; + +static void rcu_reclaim_slotlist(struct rcu_head *rcu) +{ + SlotsRCUReclaim *r = container_of(rcu, SlotsRCUReclaim, rcu); + g_list_free(r->old_head); + g_free(r->removed_slot); + g_free(r); +} + +static void publish_slots(GList *new_head, GList *old_head, + MshvMemorySlot *removed_slot) +{ + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + qatomic_store_release(&manager->slots, new_head); + + SlotsRCUReclaim *r = g_new(SlotsRCUReclaim, 1); + r->old_head = old_head; + r->removed_slot = removed_slot; + + call_rcu1(&r->rcu, rcu_reclaim_slotlist); +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static int remove_slot(MshvMemorySlot *slot) +{ + GList *old_head, *new_head; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + old_head = qatomic_load_acquire(&manager->slots); + + if (!g_list_find(old_head, slot)) { + error_report("slot requested for removal not found"); + return -1; + } + + new_head = g_list_copy(old_head); + new_head = g_list_remove(new_head, slot); + manager->n_slots--; + + publish_slots(new_head, old_head, slot); + + return 0; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *append_slot(uint64_t gpa, uint64_t userspace_addr, + uint64_t size, bool readonly) +{ + GList *old_head, *new_head; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + old_head = qatomic_load_acquire(&manager->slots); + + if (manager->n_slots >= MSHV_MAX_MEM_SLOTS) { + error_report("no free memory slots available"); + return NULL; + } + + slot = g_new0(MshvMemorySlot, 1); + slot->guest_phys_addr = gpa; + slot->userspace_addr = userspace_addr; + slot->memory_size = size; + slot->readonly = readonly; + + new_head = g_list_copy(old_head); + new_head = g_list_append(new_head, slot); + manager->n_slots++; + + publish_slots(new_head, old_head, NULL); + + return slot; +} + +static int slot_overlaps(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + uint64_t start_1 = slot1->userspace_addr, + start_2 = slot2->userspace_addr; + size_t len_1 = slot1->memory_size, + len_2 = slot2->memory_size; + + if (slot1 == slot2) { + return -1; + } + + return ranges_overlap(start_1, len_1, start_2, len_2) ? 0 : -1; +} + +static bool is_mapped(MshvMemorySlot *slot) +{ + /* Subsequent reads of mapped field see a fully-initialized slot */ + return qatomic_load_acquire(&slot->mapped); +} + +/* + * Find slot that is: + * - overlapping in userspace + * - currently mapped in the guest + * + * Needs to be called with mshv_state->msm.mutex or RCU read lock held. 
+ */ +static MshvMemorySlot *find_overlap_mem_slot(GList *head, MshvMemorySlot *slot) +{ + GList *found; + MshvMemorySlot *overlap_slot; + + found = g_list_find_custom(head, slot, (GCompareFunc) slot_overlaps); + + if (!found) { + return NULL; + } + + overlap_slot = found->data; + if (!overlap_slot || !is_mapped(overlap_slot)) { + return NULL; + } + + return overlap_slot; +} + +static int set_guest_memory(int vm_fd, + const struct mshv_user_mem_region *region) +{ + int ret; + + ret = ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region); + if (ret < 0) { + error_report("failed to set guest memory: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int map_or_unmap(int vm_fd, const MshvMemorySlot *slot, bool map) +{ + struct mshv_user_mem_region region = {0}; + + region.guest_pfn = slot->guest_phys_addr >> MSHV_PAGE_SHIFT; + region.size = slot->memory_size; + region.userspace_addr = slot->userspace_addr; + + if (!map) { + region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP); + trace_mshv_unmap_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return set_guest_memory(vm_fd, &region); + } + + region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE); + if (!slot->readonly) { + region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE); + } + + trace_mshv_map_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return set_guest_memory(vm_fd, &region); +} + +static int slot_matches_region(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + return (slot1->guest_phys_addr == slot2->guest_phys_addr && + slot1->userspace_addr == slot2->userspace_addr && + slot1->memory_size == slot2->memory_size) ? 0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *find_mem_slot_by_region(uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + MshvMemorySlot ref_slot = { + .guest_phys_addr = gpa, + .userspace_addr = userspace_addr, + .memory_size = size, + }; + GList *found; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + found = g_list_find_custom(manager->slots, &ref_slot, + (GCompareFunc) slot_matches_region); + + return found ? found->data : NULL; +} + +static int slot_covers_gpa(const MshvMemorySlot *slot, uint64_t *gpa_p) +{ + uint64_t gpa_offset, gpa = *gpa_p; + + gpa_offset = gpa - slot->guest_phys_addr; + return (slot->guest_phys_addr <= gpa && gpa_offset < slot->memory_size) + ?
0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex or RCU read lock held */ +static MshvMemorySlot *find_mem_slot_by_gpa(GList *head, uint64_t gpa) +{ + GList *found; + MshvMemorySlot *slot; + + trace_mshv_find_slot_by_gpa(gpa); + + found = g_list_find_custom(head, &gpa, (GCompareFunc) slot_covers_gpa); + if (found) { + slot = found->data; + trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return slot; + } + + return NULL; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static void set_mapped(MshvMemorySlot *slot, bool mapped) +{ + /* prior writes to mapped field becomes visible before readers see slot */ + qatomic_store_release(&slot->mapped, mapped); +} + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa) +{ + MshvMemorySlot *gpa_slot, *overlap_slot; + GList *head; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + /* fast path, called often by unmapped_gpa vm exit */ + WITH_RCU_READ_LOCK_GUARD() { + assert(manager); + head = qatomic_load_acquire(&manager->slots); + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(head, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(head, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + } + + /* + * We'll modify the mapping list, so we need to upgrade to mutex and + * recheck. + */ + assert(manager); + QEMU_LOCK_GUARD(&manager->mutex); + + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(manager->slots, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(manager->slots, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + + /* unmap overlapping slot */ + ret = map_or_unmap(vm_fd, overlap_slot, false); + if (ret < 0) { + error_report("failed to unmap overlap region"); + abort(); + } + set_mapped(overlap_slot, false); + warn_report("mapped out userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + overlap_slot->userspace_addr, + overlap_slot->guest_phys_addr, + overlap_slot->memory_size); + + /* map region for gpa */ + ret = map_or_unmap(vm_fd, gpa_slot, true); + if (ret < 0) { + error_report("failed to map new region"); + abort(); + } + set_mapped(gpa_slot, true); + warn_report("mapped in userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + gpa_slot->userspace_addr, gpa_slot->guest_phys_addr, + gpa_slot->memory_size); + + return MshvRemapOk; +} + +static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size, + uint8_t *data) +{ + warn_report("read from unmapped mmio region gpa=0x%lx size=%lu", gpa, size); + + if (size == 0 || size > 8) { + error_report("invalid size %lu for reading from unmapped mmio region", + size); + return -1; + } + + memset(data, 0xFF, size); + + return 0; +} + +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + if (instruction_fetch) { + trace_mshv_insn_fetch(gpa, size); + } else { + trace_mshv_mem_read(gpa, size); + } + + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, false); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + return handle_unmapped_mmio_region_read(gpa, size, data); + } + + error_report("failed to 
read guest memory at 0x%lx", gpa); + return -1; +} + +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + trace_mshv_mem_write(gpa, size); + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, true); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + warn_report("write to unmapped mmio region gpa=0x%lx size=%lu", gpa, + size); + return 0; + } + + error_report("Failed to write guest memory"); + return -1; +} + +static int tracked_unmap(int vm_fd, uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + int ret; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (!slot) { + trace_mshv_skip_unset_mem(userspace_addr, gpa, size); + /* no work to do */ + return 0; + } + + if (!is_mapped(slot)) { + /* remove slot, no need to unmap */ + return remove_slot(slot); + } + + ret = map_or_unmap(vm_fd, slot, false); + if (ret < 0) { + error_report("failed to unmap memory region"); + return ret; + } + return remove_slot(slot); +} + +static int tracked_map(int vm_fd, uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr) +{ + MshvMemorySlot *slot, *overlap_slot; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (slot) { + error_report("memory region already mapped at gpa=0x%lx, " + "userspace_addr=0x%lx, size=0x%lx", + slot->guest_phys_addr, slot->userspace_addr, + slot->memory_size); + return -1; + } + + slot = append_slot(gpa, userspace_addr, size, readonly); + + overlap_slot = find_overlap_mem_slot(manager->slots, slot); + if (overlap_slot) { + trace_mshv_remap_attempt(slot->userspace_addr, + slot->guest_phys_addr, + slot->memory_size); + warn_report("attempt to map region [0x%lx-0x%lx], while " + "[0x%lx-0x%lx] is already mapped in the guest", + userspace_addr, userspace_addr + size - 1, + overlap_slot->userspace_addr, + overlap_slot->userspace_addr + + overlap_slot->memory_size - 1); + + /* do not register mem slot in hv, but record for later swap-in */ + set_mapped(slot, false); + + return 0; + } + + ret = map_or_unmap(vm_fd, slot, true); + if (ret < 0) { + error_report("failed to map memory region"); + return -1; + } + set_mapped(slot, true); + + return 0; +} + +static int set_memory(uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr, bool add) +{ + int vm_fd = mshv_state->vm; + + if (add) { + return tracked_map(vm_fd, gpa, size, readonly, userspace_addr); + } + + return tracked_unmap(vm_fd, gpa, size, userspace_addr); +} + +/* + * Calculate and align the start address and the size of the section. + * Return the size. If the size is 0, the aligned section is empty. + */ +static hwaddr align_section(MemoryRegionSection *section, hwaddr *start) +{ + hwaddr size = int128_get64(section->size); + hwaddr delta, aligned; + + /* + * works in page size chunks, but the function may be called + * with sub-page size and unaligned start address. Pad the start + * address to next and truncate size to previous page boundary. 
+ */ + aligned = ROUND_UP(section->offset_within_address_space, + qemu_real_host_page_size()); + delta = aligned - section->offset_within_address_space; + *start = aligned; + if (delta > size) { + return 0; + } + + return (size - delta) & qemu_real_host_page_mask(); +} + +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add) +{ + int ret = 0; + MemoryRegion *area = section->mr; + bool writable = !area->readonly && !area->rom_device; + hwaddr start_addr, mr_offset, size; + void *ram; + + size = align_section(section, &start_addr); + trace_mshv_set_phys_mem(add, section->mr->name, start_addr); + + /* + * If the memory device is a writable non-ram area, we do not + * want to map it into the guest memory. If it is not a ROM device, + * we want to remove mshv memory mapping, so accesses will trap. + */ + if (!memory_region_is_ram(area)) { + if (writable) { + return; + } else if (!area->romd_mode) { + add = false; + } + } + + if (!size) { + return; + } + + mr_offset = section->offset_within_region + start_addr - + section->offset_within_address_space; + + ram = memory_region_get_ram_ptr(area) + mr_offset; + + ret = set_memory(start_addr, size, !writable, (uint64_t)ram, add); + if (ret < 0) { + error_report("failed to set memory region"); + abort(); + } +} + +void mshv_init_memory_slot_manager(MshvState *mshv_state) +{ + MshvMemorySlotManager *manager; + + assert(mshv_state); + manager = &mshv_state->msm; + + manager->n_slots = 0; + manager->slots = NULL; + qemu_mutex_init(&manager->mutex); +} diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build new file mode 100644 index 0000000..d3a2b32 --- /dev/null +++ b/accel/mshv/meson.build @@ -0,0 +1,9 @@ +mshv_ss = ss.source_set() +mshv_ss.add(if_true: files( + 'irq.c', + 'mem.c', + 'msr.c', + 'mshv-all.c' +)) + +specific_ss.add_all(when: 'CONFIG_MSHV', if_true: mshv_ss) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c new file mode 100644 index 0000000..45174f7 --- /dev/null +++ b/accel/mshv/mshv-all.c @@ -0,0 +1,727 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp.
2025 + * + * Authors: + * Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * Wei Liu <liuwe@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/event_notifier.h" +#include "qemu/module.h" +#include "qemu/main-loop.h" +#include "hw/boards.h" + +#include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "linux/mshv.h" + +#include "qemu/accel.h" +#include "qemu/guest-random.h" +#include "accel/accel-ops.h" +#include "accel/accel-cpu-ops.h" +#include "system/cpus.h" +#include "system/runstate.h" +#include "system/accel-blocker.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "system/reset.h" +#include "trace.h" +#include <err.h> +#include <stdint.h> +#include <sys/ioctl.h> + +#define TYPE_MSHV_ACCEL ACCEL_CLASS_NAME("mshv") + +DECLARE_INSTANCE_CHECKER(MshvState, MSHV_STATE, TYPE_MSHV_ACCEL) + +bool mshv_allowed; + +MshvState *mshv_state; + +static int init_mshv(int *mshv_fd) +{ + int fd = open("/dev/mshv", O_RDWR | O_CLOEXEC); + if (fd < 0) { + error_report("Failed to open /dev/mshv: %s", strerror(errno)); + return -1; + } + *mshv_fd = fd; + return 0; +} + +/* freeze 1 to pause, 0 to resume */ +static int set_time_freeze(int vm_fd, int freeze) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + in.property_code = HV_PARTITION_PROPERTY_TIME_FREEZE; + in.property_value = freeze; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)&in; + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set time freeze"); + return -1; + } + + return 0; +} + +static int pause_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 1); + if (ret < 0) { + error_report("Failed to pause partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int resume_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 0); + if (ret < 0) { + error_report("Failed to resume partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int create_partition(int mshv_fd, int *vm_fd) +{ + int ret; + struct mshv_create_partition args = {0}; + + /* Initialize pt_flags with the desired features */ + uint64_t pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) | + (1ULL << MSHV_PT_BIT_X2APIC) | + (1ULL << MSHV_PT_BIT_GPA_SUPER_PAGES); + + /* Set default isolation type */ + uint64_t pt_isolation = MSHV_PT_ISOLATION_NONE; + + args.pt_flags = pt_flags; + args.pt_isolation = pt_isolation; + + ret = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args); + if (ret < 0) { + error_report("Failed to create partition: %s", strerror(errno)); + return -1; + } + + *vm_fd = ret; + return 0; +} + +static int set_synthetic_proc_features(int vm_fd) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + union hv_partition_synthetic_processor_features features = {0}; + + /* Access the bitfield and set the desired features */ + features.hypervisor_present = 1; + features.hv1 = 1; + features.access_partition_reference_counter = 1; + features.access_synic_regs = 1; + features.access_synthetic_timer_regs = 1; + features.access_partition_reference_tsc = 1; + features.access_frequency_regs = 1; + features.access_intr_ctrl_regs = 1; + features.access_vp_index = 1; +
features.access_hypercall_regs = 1; + features.tb_flush_hypercalls = 1; + features.synthetic_cluster_ipi = 1; + features.direct_synthetic_timers = 1; + + mshv_arch_amend_proc_features(&features); + + in.property_code = HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES; + in.property_value = features.as_uint64[0]; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)&in; + + trace_mshv_hvcall_args("synthetic_proc_features", args.code, args.in_sz); + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set synthetic proc features"); + return -errno; + } + return 0; +} + +static int initialize_vm(int vm_fd) +{ + int ret = ioctl(vm_fd, MSHV_INITIALIZE_PARTITION); + if (ret < 0) { + error_report("Failed to initialize partition: %s", strerror(errno)); + return -1; + } + return 0; +} + +static int create_vm(int mshv_fd, int *vm_fd) +{ + int ret = create_partition(mshv_fd, vm_fd); + if (ret < 0) { + return -1; + } + + ret = set_synthetic_proc_features(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = initialize_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_reserve_ioapic_msi_routes(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_arch_post_init_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + /* Always create a frozen partition */ + pause_vm(*vm_fd); + + return 0; +} + +static void mem_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + memory_region_ref(section->mr); + mshv_set_phys_mem(mml, section, true); +} + +static void mem_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + mshv_set_phys_mem(mml, section, false); + memory_region_unref(section->mr); +} + +typedef enum { + DATAMATCH_NONE, + DATAMATCH_U32, + DATAMATCH_U64, +} DatamatchTag; + +typedef struct { + DatamatchTag tag; + union { + uint32_t u32; + uint64_t u64; + } value; +} Datamatch; + +/* flags: determine whether to de/assign */ +static int ioeventfd(int vm_fd, int event_fd, uint64_t addr, Datamatch dm, + uint32_t flags) +{ + struct mshv_user_ioeventfd args = {0}; + args.fd = event_fd; + args.addr = addr; + args.flags = flags; + + if (dm.tag == DATAMATCH_NONE) { + args.datamatch = 0; + } else { + flags |= BIT(MSHV_IOEVENTFD_BIT_DATAMATCH); + args.flags = flags; + if (dm.tag == DATAMATCH_U64) { + args.len = sizeof(uint64_t); + args.datamatch = dm.value.u64; + } else { + args.len = sizeof(uint32_t); + args.datamatch = dm.value.u32; + } + } + + return ioctl(vm_fd, MSHV_IOEVENTFD, &args); +} + +static int unregister_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + flags |= BIT(MSHV_IOEVENTFD_BIT_DEASSIGN); + dm.tag = DATAMATCH_NONE; + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static int register_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr, + uint64_t val, bool is_64bit, bool is_datamatch) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + if (!is_datamatch) { + dm.tag = DATAMATCH_NONE; + } else if (is_64bit) { + dm.tag = DATAMATCH_U64; + dm.value.u64 = val; + } else { + dm.tag = DATAMATCH_U32; + dm.value.u32 = val; + } + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static void mem_ioeventfd_add(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data,
+ EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + bool is_64 = int128_get64(section->size) == 8; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_add(addr, int128_get64(section->size), data); + + ret = register_ioevent(mshv_state->vm, fd, addr, data, is_64, match_data); + + if (ret < 0) { + error_report("Failed to register ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + +static void mem_ioeventfd_del(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_del(section->offset_within_address_space, + int128_get64(section->size), data); + + ret = unregister_ioevent(mshv_state->vm, fd, addr); + if (ret < 0) { + error_report("Failed to unregister ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + +static MemoryListener mshv_memory_listener = { + .name = "mshv", + .priority = MEMORY_LISTENER_PRIORITY_ACCEL, + .region_add = mem_region_add, + .region_del = mem_region_del, + .eventfd_add = mem_ioeventfd_add, + .eventfd_del = mem_ioeventfd_del, +}; + +static MemoryListener mshv_io_listener = { + .name = "mshv", .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND, + /* MSHV does not support PIO eventfd */ +}; + +static void register_mshv_memory_listener(MshvState *s, MshvMemoryListener *mml, + AddressSpace *as, int as_id, + const char *name) +{ + int i; + + mml->listener = mshv_memory_listener; + mml->listener.name = name; + memory_listener_register(&mml->listener, as); + for (i = 0; i < s->nr_as; ++i) { + if (!s->as[i].as) { + s->as[i].as = as; + s->as[i].ml = mml; + break; + } + } +} + +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args) +{ + int ret = 0; + + ret = ioctl(fd, MSHV_ROOT_HVCALL, args); + if (ret < 0) { + error_report("Failed to perform hvcall: %s", strerror(errno)); + return -1; + } + return ret; +} + +static int mshv_init_vcpu(CPUState *cpu) +{ + int vm_fd = mshv_state->vm; + uint8_t vp_index = cpu->cpu_index; + int ret; + + cpu->accel = g_new0(AccelCPUState, 1); + mshv_arch_init_vcpu(cpu); + + ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd); + if (ret < 0) { + return -1; + } + + cpu->accel->dirty = true; + + return 0; +} + +static int mshv_init(AccelState *as, MachineState *ms) +{ + MshvState *s; + int mshv_fd, vm_fd, ret; + + if (mshv_state) { + warn_report("MSHV accelerator already initialized"); + return 0; + } + + s = MSHV_STATE(as); + + accel_blocker_init(); + + s->vm = 0; + + ret = init_mshv(&mshv_fd); + if (ret < 0) { + return -1; + } + + mshv_init_mmio_emu(); + + mshv_init_msicontrol(); + + mshv_init_memory_slot_manager(s); + + ret = create_vm(mshv_fd, &vm_fd); + if (ret < 0) { + close(mshv_fd); + return -1; + } + + ret = resume_vm(vm_fd); + if (ret < 0) { + close(mshv_fd); + close(vm_fd); + return -1; + } + + s->vm = vm_fd; + s->fd = mshv_fd; + s->nr_as = 1; + s->as = g_new0(MshvAddressSpace, s->nr_as); + + mshv_state = s; + + register_mshv_memory_listener(s, &s->memory_listener, &address_space_memory, + 0, "mshv-memory"); + memory_listener_register(&mshv_io_listener, &address_space_io); + + return 0; +} + +static int mshv_destroy_vcpu(CPUState *cpu) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vm_fd = mshv_state->vm; + + mshv_remove_vcpu(vm_fd, cpu_fd); + mshv_vcpufd(cpu) = 0; + + mshv_arch_destroy_vcpu(cpu); + g_clear_pointer(&cpu->accel, g_free); + return 0; +} + +static int 
mshv_cpu_exec(CPUState *cpu) +{ + hv_message mshv_msg; + enum MshvVmExit exit_reason; + int ret = 0; + + bql_unlock(); + cpu_exec_start(cpu); + + do { + if (cpu->accel->dirty) { + ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after init: %s", + strerror(-ret)); + ret = -1; + break; + } + cpu->accel->dirty = false; + } + + ret = mshv_run_vcpu(mshv_state->vm, cpu, &mshv_msg, &exit_reason); + if (ret < 0) { + error_report("Failed to run on vcpu %d", cpu->cpu_index); + abort(); + } + + switch (exit_reason) { + case MshvVmExitIgnore: + break; + default: + ret = EXCP_INTERRUPT; + break; + } + } while (ret == 0); + + cpu_exec_end(cpu); + bql_lock(); + + if (ret < 0) { + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + return ret; +} + +/* + * The signal handler is triggered when QEMU's main thread receives a SIG_IPI + * (SIGUSR1). This signal causes the current CPU thread to be kicked, forcing a + * VM exit on the CPU. The VM exit generates an exit reason that breaks the loop + * (see mshv_cpu_exec). If the exit is due to a Ctrl+A+x command, the system + * will shut down. For other cases, the system will continue running. + */ +static void sa_ipi_handler(int sig) +{ + /* TODO: call IOCTL to set_immediate_exit, once implemented. */ + + qemu_cpu_kick_self(); +} + +static void init_signal(CPUState *cpu) +{ + /* init cpu signals */ + struct sigaction sigact; + sigset_t set; + + memset(&sigact, 0, sizeof(sigact)); + sigact.sa_handler = sa_ipi_handler; + sigaction(SIG_IPI, &sigact, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &set); + sigdelset(&set, SIG_IPI); + pthread_sigmask(SIG_SETMASK, &set, NULL); +} + +static void *mshv_vcpu_thread(void *arg) +{ + CPUState *cpu = arg; + int ret; + + rcu_register_thread(); + + bql_lock(); + qemu_thread_get_self(cpu->thread); + cpu->thread_id = qemu_get_thread_id(); + current_cpu = cpu; + ret = mshv_init_vcpu(cpu); + if (ret < 0) { + error_report("Failed to init vcpu %d", cpu->cpu_index); + goto cleanup; + } + init_signal(cpu); + + /* signal CPU creation */ + cpu_thread_signal_created(cpu); + qemu_guest_random_seed_thread_part2(cpu->random_seed); + + do { + qemu_process_cpu_events(cpu); + if (cpu_can_run(cpu)) { + mshv_cpu_exec(cpu); + } + } while (!cpu->unplug || cpu_can_run(cpu)); + + mshv_destroy_vcpu(cpu); +cleanup: + cpu_thread_signal_destroyed(cpu); + bql_unlock(); + rcu_unregister_thread(); + return NULL; +} + +static void mshv_start_vcpu_thread(CPUState *cpu) +{ + char thread_name[VCPU_THREAD_NAME_SIZE]; + + cpu->thread = g_malloc0(sizeof(QemuThread)); + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); + + qemu_cond_init(cpu->halt_cond); + + trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu, + QEMU_THREAD_JOINABLE); +} + +static void do_mshv_cpu_synchronize_post_init(CPUState *cpu, + run_on_cpu_data arg) +{ + int ret = mshv_arch_put_registers(cpu); + if (ret < 0) { + error_report("Failed to put registers after init: %s", strerror(-ret)); + abort(); + } + + cpu->accel->dirty = false; +} + +static void mshv_cpu_synchronize_post_init(CPUState *cpu) +{ + run_on_cpu(cpu, do_mshv_cpu_synchronize_post_init, RUN_ON_CPU_NULL); +} + +static void mshv_cpu_synchronize_post_reset(CPUState *cpu) +{ + int ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after reset: %s", + strerror(-ret)); + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + 
cpu->accel->dirty = false; +} + +static void do_mshv_cpu_synchronize_pre_loadvm(CPUState *cpu, + run_on_cpu_data arg) +{ + cpu->accel->dirty = true; +} + +static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu) +{ + run_on_cpu(cpu, do_mshv_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); +} + +static void do_mshv_cpu_synchronize(CPUState *cpu, run_on_cpu_data arg) +{ + if (!cpu->accel->dirty) { + int ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("Failed to load registers for vcpu %d", + cpu->cpu_index); + + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + cpu->accel->dirty = true; + } +} + +static void mshv_cpu_synchronize(CPUState *cpu) +{ + if (!cpu->accel->dirty) { + run_on_cpu(cpu, do_mshv_cpu_synchronize, RUN_ON_CPU_NULL); + } +} + +static bool mshv_cpus_are_resettable(void) +{ + return false; +} + +static void mshv_accel_class_init(ObjectClass *oc, const void *data) +{ + AccelClass *ac = ACCEL_CLASS(oc); + + ac->name = "MSHV"; + ac->init_machine = mshv_init; + ac->allowed = &mshv_allowed; +} + +static void mshv_accel_instance_init(Object *obj) +{ + MshvState *s = MSHV_STATE(obj); + + s->vm = 0; +} + +static const TypeInfo mshv_accel_type = { + .name = TYPE_MSHV_ACCEL, + .parent = TYPE_ACCEL, + .instance_init = mshv_accel_instance_init, + .class_init = mshv_accel_class_init, + .instance_size = sizeof(MshvState), +}; + +static void mshv_accel_ops_class_init(ObjectClass *oc, const void *data) +{ + AccelOpsClass *ops = ACCEL_OPS_CLASS(oc); + + ops->create_vcpu_thread = mshv_start_vcpu_thread; + ops->synchronize_post_init = mshv_cpu_synchronize_post_init; + ops->synchronize_post_reset = mshv_cpu_synchronize_post_reset; + ops->synchronize_state = mshv_cpu_synchronize; + ops->synchronize_pre_loadvm = mshv_cpu_synchronize_pre_loadvm; + ops->cpus_are_resettable = mshv_cpus_are_resettable; + ops->handle_interrupt = generic_handle_interrupt; +} + +static const TypeInfo mshv_accel_ops_type = { + .name = ACCEL_OPS_NAME("mshv"), + .parent = TYPE_ACCEL_OPS, + .class_init = mshv_accel_ops_class_init, + .abstract = true, +}; + +static void mshv_type_init(void) +{ + type_register_static(&mshv_accel_type); + type_register_static(&mshv_accel_ops_type); +} + +type_init(mshv_type_init); diff --git a/accel/mshv/msr.c b/accel/mshv/msr.c new file mode 100644 index 0000000..e6e5bae --- /dev/null +++ b/accel/mshv/msr.c @@ -0,0 +1,375 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "linux/mshv.h" +#include "qemu/error-report.h" + +static uint32_t supported_msrs[64] = { + IA32_MSR_TSC, + IA32_MSR_EFER, + IA32_MSR_KERNEL_GS_BASE, + IA32_MSR_APIC_BASE, + IA32_MSR_PAT, + IA32_MSR_SYSENTER_CS, + IA32_MSR_SYSENTER_ESP, + IA32_MSR_SYSENTER_EIP, + IA32_MSR_STAR, + IA32_MSR_LSTAR, + IA32_MSR_CSTAR, + IA32_MSR_SFMASK, + IA32_MSR_MTRR_DEF_TYPE, + IA32_MSR_MTRR_PHYSBASE0, + IA32_MSR_MTRR_PHYSMASK0, + IA32_MSR_MTRR_PHYSBASE1, + IA32_MSR_MTRR_PHYSMASK1, + IA32_MSR_MTRR_PHYSBASE2, + IA32_MSR_MTRR_PHYSMASK2, + IA32_MSR_MTRR_PHYSBASE3, + IA32_MSR_MTRR_PHYSMASK3, + IA32_MSR_MTRR_PHYSBASE4, + IA32_MSR_MTRR_PHYSMASK4, + IA32_MSR_MTRR_PHYSBASE5, + IA32_MSR_MTRR_PHYSMASK5, + IA32_MSR_MTRR_PHYSBASE6, + IA32_MSR_MTRR_PHYSMASK6, + IA32_MSR_MTRR_PHYSBASE7, + IA32_MSR_MTRR_PHYSMASK7, + IA32_MSR_MTRR_FIX64K_00000, + IA32_MSR_MTRR_FIX16K_80000, + IA32_MSR_MTRR_FIX16K_A0000, + IA32_MSR_MTRR_FIX4K_C0000, + IA32_MSR_MTRR_FIX4K_C8000, + IA32_MSR_MTRR_FIX4K_D0000, + IA32_MSR_MTRR_FIX4K_D8000, + IA32_MSR_MTRR_FIX4K_E0000, + IA32_MSR_MTRR_FIX4K_E8000, + IA32_MSR_MTRR_FIX4K_F0000, + IA32_MSR_MTRR_FIX4K_F8000, + IA32_MSR_TSC_AUX, + IA32_MSR_DEBUG_CTL, + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_SINT0, + HV_X64_MSR_SINT1, + HV_X64_MSR_SINT2, + HV_X64_MSR_SINT3, + HV_X64_MSR_SINT4, + HV_X64_MSR_SINT5, + HV_X64_MSR_SINT6, + HV_X64_MSR_SINT7, + HV_X64_MSR_SINT8, + HV_X64_MSR_SINT9, + HV_X64_MSR_SINT10, + HV_X64_MSR_SINT11, + HV_X64_MSR_SINT12, + HV_X64_MSR_SINT13, + HV_X64_MSR_SINT14, + HV_X64_MSR_SINT15, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_SIEFP, + HV_X64_MSR_SIMP, + HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_EOM, +}; +static const size_t msr_count = ARRAY_SIZE(supported_msrs); + +static int compare_msr_index(const void *a, const void *b) +{ + return *(uint32_t *)a - *(uint32_t *)b; +} + +__attribute__((constructor)) +static void init_sorted_msr_map(void) +{ + qsort(supported_msrs, msr_count, sizeof(uint32_t), compare_msr_index); +} + +static int mshv_is_supported_msr(uint32_t msr) +{ + return bsearch(&msr, supported_msrs, msr_count, sizeof(uint32_t), + compare_msr_index) != NULL; +} + +static int mshv_msr_to_hv_reg_name(uint32_t msr, uint32_t *hv_reg) +{ + switch (msr) { + case IA32_MSR_TSC: + *hv_reg = HV_X64_REGISTER_TSC; + return 0; + case IA32_MSR_EFER: + *hv_reg = HV_X64_REGISTER_EFER; + return 0; + case IA32_MSR_KERNEL_GS_BASE: + *hv_reg = HV_X64_REGISTER_KERNEL_GS_BASE; + return 0; + case IA32_MSR_APIC_BASE: + *hv_reg = HV_X64_REGISTER_APIC_BASE; + return 0; + case IA32_MSR_PAT: + *hv_reg = HV_X64_REGISTER_PAT; + return 0; + case IA32_MSR_SYSENTER_CS: + *hv_reg = HV_X64_REGISTER_SYSENTER_CS; + return 0; + case IA32_MSR_SYSENTER_ESP: + *hv_reg = HV_X64_REGISTER_SYSENTER_ESP; + return 0; + case IA32_MSR_SYSENTER_EIP: + *hv_reg = HV_X64_REGISTER_SYSENTER_EIP; + return 0; + case IA32_MSR_STAR: + *hv_reg = HV_X64_REGISTER_STAR; + return 0; + case IA32_MSR_LSTAR: + *hv_reg = HV_X64_REGISTER_LSTAR; + return 0; + case IA32_MSR_CSTAR: + *hv_reg = HV_X64_REGISTER_CSTAR; + return 0; + case IA32_MSR_SFMASK: + *hv_reg = HV_X64_REGISTER_SFMASK; + return 0; + case IA32_MSR_MTRR_CAP: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_CAP; + return 0; + case IA32_MSR_MTRR_DEF_TYPE: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_DEF_TYPE; + return 0; + case IA32_MSR_MTRR_PHYSBASE0: + *hv_reg = 
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0; + return 0; + case IA32_MSR_MTRR_PHYSMASK0: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0; + return 0; + case IA32_MSR_MTRR_PHYSBASE1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1; + return 0; + case IA32_MSR_MTRR_PHYSMASK1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1; + return 0; + case IA32_MSR_MTRR_PHYSBASE2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2; + return 0; + case IA32_MSR_MTRR_PHYSMASK2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2; + return 0; + case IA32_MSR_MTRR_PHYSBASE3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3; + return 0; + case IA32_MSR_MTRR_PHYSMASK3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3; + return 0; + case IA32_MSR_MTRR_PHYSBASE4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4; + return 0; + case IA32_MSR_MTRR_PHYSMASK4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4; + return 0; + case IA32_MSR_MTRR_PHYSBASE5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5; + return 0; + case IA32_MSR_MTRR_PHYSMASK5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5; + return 0; + case IA32_MSR_MTRR_PHYSBASE6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6; + return 0; + case IA32_MSR_MTRR_PHYSMASK6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6; + return 0; + case IA32_MSR_MTRR_PHYSBASE7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7; + return 0; + case IA32_MSR_MTRR_PHYSMASK7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7; + return 0; + case IA32_MSR_MTRR_FIX64K_00000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX64K00000; + return 0; + case IA32_MSR_MTRR_FIX16K_80000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16K80000; + return 0; + case IA32_MSR_MTRR_FIX16K_A0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16KA0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC8000; + return 0; + case IA32_MSR_MTRR_FIX4K_D0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD0000; + return 0; + case IA32_MSR_MTRR_FIX4K_D8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD8000; + return 0; + case IA32_MSR_MTRR_FIX4K_E0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE0000; + return 0; + case IA32_MSR_MTRR_FIX4K_E8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE8000; + return 0; + case IA32_MSR_MTRR_FIX4K_F0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF0000; + return 0; + case IA32_MSR_MTRR_FIX4K_F8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF8000; + return 0; + case IA32_MSR_TSC_AUX: + *hv_reg = HV_X64_REGISTER_TSC_AUX; + return 0; + case IA32_MSR_BNDCFGS: + *hv_reg = HV_X64_REGISTER_BNDCFGS; + return 0; + case IA32_MSR_DEBUG_CTL: + *hv_reg = HV_X64_REGISTER_DEBUG_CTL; + return 0; + case IA32_MSR_TSC_ADJUST: + *hv_reg = HV_X64_REGISTER_TSC_ADJUST; + return 0; + case IA32_MSR_SPEC_CTRL: + *hv_reg = HV_X64_REGISTER_SPEC_CTRL; + return 0; + case HV_X64_MSR_GUEST_OS_ID: + *hv_reg = HV_REGISTER_GUEST_OS_ID; + return 0; + case HV_X64_MSR_SINT0: + *hv_reg = HV_REGISTER_SINT0; + return 0; + case HV_X64_MSR_SINT1: + *hv_reg = HV_REGISTER_SINT1; + return 0; + case HV_X64_MSR_SINT2: + *hv_reg = HV_REGISTER_SINT2; + return 0; + case HV_X64_MSR_SINT3: + *hv_reg = HV_REGISTER_SINT3; + return 0; + case HV_X64_MSR_SINT4: + *hv_reg = HV_REGISTER_SINT4; + return 0; + case HV_X64_MSR_SINT5: + *hv_reg = HV_REGISTER_SINT5; + return 0; + case HV_X64_MSR_SINT6: + *hv_reg = HV_REGISTER_SINT6; + return 0; + case HV_X64_MSR_SINT7: + *hv_reg = HV_REGISTER_SINT7; + return 0; + case HV_X64_MSR_SINT8: + *hv_reg = 
HV_REGISTER_SINT8;
+        return 0;
+    case HV_X64_MSR_SINT9:
+        *hv_reg = HV_REGISTER_SINT9;
+        return 0;
+    case HV_X64_MSR_SINT10:
+        *hv_reg = HV_REGISTER_SINT10;
+        return 0;
+    case HV_X64_MSR_SINT11:
+        *hv_reg = HV_REGISTER_SINT11;
+        return 0;
+    case HV_X64_MSR_SINT12:
+        *hv_reg = HV_REGISTER_SINT12;
+        return 0;
+    case HV_X64_MSR_SINT13:
+        *hv_reg = HV_REGISTER_SINT13;
+        return 0;
+    case HV_X64_MSR_SINT14:
+        *hv_reg = HV_REGISTER_SINT14;
+        return 0;
+    case HV_X64_MSR_SINT15:
+        *hv_reg = HV_REGISTER_SINT15;
+        return 0;
+    case IA32_MSR_MISC_ENABLE:
+        *hv_reg = HV_X64_REGISTER_MSR_IA32_MISC_ENABLE;
+        return 0;
+    case HV_X64_MSR_SCONTROL:
+        *hv_reg = HV_REGISTER_SCONTROL;
+        return 0;
+    case HV_X64_MSR_SIEFP:
+        *hv_reg = HV_REGISTER_SIEFP;
+        return 0;
+    case HV_X64_MSR_SIMP:
+        *hv_reg = HV_REGISTER_SIMP;
+        return 0;
+    case HV_X64_MSR_REFERENCE_TSC:
+        *hv_reg = HV_REGISTER_REFERENCE_TSC;
+        return 0;
+    case HV_X64_MSR_EOM:
+        *hv_reg = HV_REGISTER_EOM;
+        return 0;
+    default:
+        error_report("failed to map MSR %u to HV register name", msr);
+        return -1;
+    }
+}
+
+static int set_msrs(const CPUState *cpu, GList *msrs)
+{
+    size_t n_msrs;
+    GList *entries;
+    MshvMsrEntry *entry;
+    uint32_t name; /* enum hv_register_name */
+    struct hv_register_assoc *assoc;
+    int ret;
+    size_t i = 0;
+
+    n_msrs = g_list_length(msrs);
+    hv_register_assoc *assocs = g_new0(hv_register_assoc, n_msrs);
+
+    entries = msrs;
+    for (const GList *elem = entries; elem != NULL; elem = elem->next) {
+        entry = elem->data;
+        ret = mshv_msr_to_hv_reg_name(entry->index, &name);
+        if (ret < 0) {
+            g_free(assocs);
+            return ret;
+        }
+        assoc = &assocs[i];
+        assoc->name = name;
+        /* the union has been initialized to 0 */
+        assoc->value.reg64 = entry->data;
+        i++;
+    }
+    ret = mshv_set_generic_regs(cpu, assocs, n_msrs);
+    g_free(assocs);
+    if (ret < 0) {
+        error_report("failed to set msrs");
+        return -1;
+    }
+    return 0;
+}
+
+int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs,
+                       size_t n_msrs)
+{
+    GList *valid_msrs = NULL;
+    uint32_t msr_index;
+    int ret;
+
+    for (size_t i = 0; i < n_msrs; i++) {
+        msr_index = msrs[i].index;
+        /* only pass on MSRs that are present in supported_msrs */
+        if (mshv_is_supported_msr(msr_index)) {
+            valid_msrs = g_list_append(valid_msrs, (void *) &msrs[i]);
+        }
+    }
+
+    ret = set_msrs(cpu, valid_msrs);
+    g_list_free(valid_msrs);
+
+    return ret;
+}
diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events
new file mode 100644
index 0000000..36f0d59
--- /dev/null
+++ b/accel/mshv/trace-events
@@ -0,0 +1,33 @@
+# Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com>
+#          Magnus Kulke <magnuskulke@microsoft.com>
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+mshv_start_vcpu_thread(const char* thread, uint32_t cpu) "thread=%s cpu_index=%d"
+
+mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "add=%d gpa=0x%" PRIx64 " size=0x%" PRIx64 " user=0x%" PRIx64 " readonly=%d result=%d"
+mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x"
+mshv_mem_ioeventfd_del(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x"
+
+mshv_hvcall_args(const char* hvcall, uint16_t code, uint16_t in_sz) "built args for '%s' code: %d in_sz: %d"
+
+mshv_handle_interrupt(uint32_t cpu, int mask) "cpu_index=%d mask=0x%x"
+mshv_set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) "gsi=%d addr=0x%" PRIx64 " data=0x%x"
+mshv_remove_msi_routing(uint32_t gsi) "gsi=%d"
+mshv_add_msi_routing(uint64_t addr, uint32_t data)
"addr=0x%" PRIx64 " data=0x%x" +mshv_commit_msi_routing_table(int vm_fd, int len) "vm_fd=%d table_size=%d" +mshv_register_irqfd(int vm_fd, int event_fd, uint32_t gsi) "vm_fd=%d event_fd=%d gsi=%d" +mshv_irqchip_update_irqfd_notifier_gsi(int event_fd, int resample_fd, int virq, bool add) "event_fd=%d resample_fd=%d virq=%d add=%d" + +mshv_insn_fetch(uint64_t addr, size_t size) "gpa=0x%" PRIx64 " size=%zu" +mshv_mem_write(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_mem_read(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=0x%010" PRIx64 +mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%" PRIx64 " access_type=%d" + +mshv_found_slot(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_skip_unset_mem(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_remap_attempt(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_find_slot_by_gpa(uint64_t gpa) "\tgpa=0x%010" PRIx64 diff --git a/accel/mshv/trace.h b/accel/mshv/trace.h new file mode 100644 index 0000000..0dca48f --- /dev/null +++ b/accel/mshv/trace.h @@ -0,0 +1,14 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: + * Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "trace/trace-accel_mshv.h" diff --git a/accel/stubs/meson.build b/accel/stubs/meson.build index 9dfc4f9..48eccd1 100644 --- a/accel/stubs/meson.build +++ b/accel/stubs/meson.build @@ -5,5 +5,6 @@ system_stubs_ss.add(when: 'CONFIG_TCG', if_false: files('tcg-stub.c')) system_stubs_ss.add(when: 'CONFIG_HVF', if_false: files('hvf-stub.c')) system_stubs_ss.add(when: 'CONFIG_NVMM', if_false: files('nvmm-stub.c')) system_stubs_ss.add(when: 'CONFIG_WHPX', if_false: files('whpx-stub.c')) +system_stubs_ss.add(when: 'CONFIG_MSHV', if_false: files('mshv-stub.c')) specific_ss.add_all(when: ['CONFIG_SYSTEM_ONLY'], if_true: system_stubs_ss) diff --git a/accel/stubs/mshv-stub.c b/accel/stubs/mshv-stub.c new file mode 100644 index 0000000..e499b19 --- /dev/null +++ b/accel/stubs/mshv-stub.c @@ -0,0 +1,44 @@ +/* + * QEMU MSHV stub + * + * Copyright Red Hat, Inc. 
2025 + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/pci/msi.h" +#include "system/mshv.h" + +bool mshv_allowed; + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev) +{ + return -ENOSYS; +} + +void mshv_irqchip_release_virq(int virq) +{ +} + +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev) +{ + return -ENOSYS; +} + +void mshv_irqchip_commit_routes(void) +{ +} + +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n, + const EventNotifier *rn, int virq) +{ + return -ENOSYS; +} + +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq) +{ + return -ENOSYS; +} diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 7214d41..3010dd4 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -90,9 +90,6 @@ */ QEMU_BUILD_BUG_ON(sizeof(vaddr) > sizeof(run_on_cpu_data)); -/* We currently can't handle more than 16 bits in the MMUIDX bitmask. - */ -QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16); #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1) static inline size_t tlb_n_entries(CPUTLBDescFast *fast) diff --git a/block/curl.c b/block/curl.c index e0f98e0..68cf83c 100644 --- a/block/curl.c +++ b/block/curl.c @@ -471,11 +471,11 @@ static int curl_init_state(BDRVCURLState *s, CURLState *state) (void *)curl_read_cb) || curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state) || curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state) || - curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1) || - curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1) || - curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1) || + curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1L) || + curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1L) || + curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1L) || curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg) || - curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1)) { + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1L)) { goto err; } if (s->username) { @@ -800,7 +800,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, } s->accept_range = false; - if (curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1) || + if (curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1L) || curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION, curl_header_cb) || curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s)) { pstrcpy(state->errmsg, CURL_ERROR_SIZE, @@ -1216,8 +1216,9 @@ fi if test "$rust" != disabled && test -z "$rust_target_triple"; then # arch and os generally matches between meson and rust rust_arch=$host_arch + # default to host vendor + rust_vendor=$(echo "$rust_host_triple" | cut -d'-' -f2) rust_os=$host_os - rust_machine=unknown rust_osvariant= # tweak rust_os if needed; also, machine and variant depend on the OS @@ -1225,7 +1226,7 @@ if test "$rust" != disabled && test -z "$rust_target_triple"; then case "$host_os" in darwin) # e.g. aarch64-apple-darwin - rust_machine=apple + rust_vendor=apple ;; linux) @@ -1273,13 +1274,13 @@ EOF ;; sunos) - rust_machine=pc + rust_vendor=pc rust_os=solaris ;; windows) # e.g. aarch64-pc-windows-gnullvm, x86_64-pc-windows-gnu (MSVC not supported) - rust_machine=pc + rust_vendor=pc if test "$host_arch" = aarch64; then rust_osvariant=gnullvm else @@ -1310,7 +1311,7 @@ EOF sparc64) if test "$rust_os" = solaris; then rust_arch=sparcv9 - rust_machine=sun + rust_vendor=sun fi ;; @@ -1324,7 +1325,7 @@ EOF # e.g. 
aarch64-linux-android rust_target_triple=$rust_arch-$rust_os-$rust_osvariant else - rust_target_triple=$rust_arch-$rust_machine-$rust_os${rust_osvariant:+-$rust_osvariant} + rust_target_triple=$rust_arch-$rust_vendor-$rust_os${rust_osvariant:+-$rust_osvariant} fi fi @@ -2003,6 +2004,8 @@ if test "$skip_meson" = no; then test -n "${LIB_FUZZING_ENGINE+xxx}" && meson_option_add "-Dfuzzing_engine=$LIB_FUZZING_ENGINE" test "$plugins" = yes && meson_option_add "-Dplugins=true" test "$tcg" != enabled && meson_option_add "-Dtcg=$tcg" + test -n "$gdb_bin" && meson_option_add "-Dgdb=$gdb_bin" + run_meson() { NINJA=$ninja $meson setup "$@" "$PWD" "$source_path" } diff --git a/contrib/elf2dmp/download.c b/contrib/elf2dmp/download.c index 21306b3..fa8da0f 100644 --- a/contrib/elf2dmp/download.c +++ b/contrib/elf2dmp/download.c @@ -27,8 +27,8 @@ bool download_url(const char *name, const char *url) if (curl_easy_setopt(curl, CURLOPT_URL, url) != CURLE_OK || curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL) != CURLE_OK || curl_easy_setopt(curl, CURLOPT_WRITEDATA, file) != CURLE_OK - || curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1) != CURLE_OK - || curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0) != CURLE_OK + || curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK + || curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L) != CURLE_OK || curl_easy_perform(curl) != CURLE_OK) { unlink(name); fclose(file); diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst index 798cb46..fc27436 100644 --- a/docs/about/build-platforms.rst +++ b/docs/about/build-platforms.rst @@ -53,7 +53,7 @@ Those hosts are officially supported, with various accelerators: * - SPARC - tcg * - x86 - - hvf (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen + - hvf (64 bit only), mshv (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen Other host architectures are not supported. It is possible to build QEMU system emulation on an unsupported host architecture using the configure diff --git a/docs/devel/codebase.rst b/docs/devel/codebase.rst index 2a31437..69d8827 100644 --- a/docs/devel/codebase.rst +++ b/docs/devel/codebase.rst @@ -48,7 +48,7 @@ yet, so sometimes the source code is all you have. * `accel <https://gitlab.com/qemu-project/qemu/-/tree/master/accel>`_: Infrastructure and architecture agnostic code related to the various `accelerators <Accelerators>` supported by QEMU - (TCG, KVM, hvf, whpx, xen, nvmm). + (TCG, KVM, hvf, whpx, xen, nvmm, mshv). Contains interfaces for operations that will be implemented per `target <https://gitlab.com/qemu-project/qemu/-/tree/master/target>`_. * `audio <https://gitlab.com/qemu-project/qemu/-/tree/master/audio>`_: diff --git a/docs/glossary.rst b/docs/glossary.rst index 4fa044b..2857731 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -12,7 +12,7 @@ Accelerator A specific API used to accelerate execution of guest instructions. It can be hardware-based, through a virtualization API provided by the host OS (kvm, hvf, -whpx, ...), or software-based (tcg). See this description of `supported +whpx, mshv, ...), or software-based (tcg). See this description of `supported accelerators<Accelerators>`. Board @@ -101,9 +101,8 @@ manage a virtual machine. QEMU is a virtualizer, that interacts with various hypervisors. In the context of QEMU, an hypervisor is an API, provided by the Host OS, -allowing to execute virtual machines. Linux implementation is KVM (and supports -Xen as well). For MacOS, it's HVF. Windows defines WHPX. And NetBSD provides -NVMM. 
+allowing to execute virtual machines. Linux provides a choice of KVM, Xen +or MSHV; MacOS provides HVF; Windows provides WHPX; NetBSD provides NVMM. .. _machine: diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst index 1aa0a6e..bf81da1 100644 --- a/docs/system/arm/emulation.rst +++ b/docs/system/arm/emulation.rst @@ -28,6 +28,7 @@ the following architecture extensions: - FEAT_BF16 (AArch64 BFloat16 instructions) - FEAT_BTI (Branch Target Identification) - FEAT_CCIDX (Extended cache index) +- FEAT_CHK (Check Feature Status) - FEAT_CMOW (Control for cache maintenance permission) - FEAT_CRC32 (CRC32 instructions) - FEAT_Crypto (Cryptographic Extension) @@ -72,6 +73,7 @@ the following architecture extensions: - FEAT_FRINTTS (Floating-point to integer instructions) - FEAT_FlagM (Flag manipulation instructions v2) - FEAT_FlagM2 (Enhancements to flag manipulation instructions) +- FEAT_GCS (Guarded Control Stack Extension) - FEAT_GTG (Guest translation granule size) - FEAT_HAFDBS (Hardware management of the access flag and dirty bit state) - FEAT_HBC (Hinted conditional branches) @@ -92,6 +94,9 @@ the following architecture extensions: - FEAT_LSE2 (Large System Extensions v2) - FEAT_LSE128 (128-bit Atomics) - FEAT_LVA (Large Virtual Address space) +- FEAT_MEC (Memory Encryption Contexts) + + * This is a register-only implementation without encryption. - FEAT_MixedEnd (Mixed-endian support) - FEAT_MixedEndEL0 (Mixed-endian support at EL0) - FEAT_MOPS (Standardization of memory operations) @@ -123,6 +128,8 @@ the following architecture extensions: - FEAT_RME_GPC2 (RME Granule Protection Check 2 Extension) - FEAT_RNG (Random number generator) - FEAT_RPRES (Increased precision of FRECPE and FRSQRTE) +- FEAT_S1PIE (Stage 1 permission indirections) +- FEAT_S2PIE (Stage 2 permission indirections) - FEAT_S2FWB (Stage 2 forced Write-Back) - FEAT_SB (Speculation Barrier) - FEAT_SCTLR2 (Extension to SCTLR_ELx) diff --git a/docs/system/introduction.rst b/docs/system/introduction.rst index 4cd46b5..9c57523 100644 --- a/docs/system/introduction.rst +++ b/docs/system/introduction.rst @@ -23,6 +23,9 @@ Tiny Code Generator (TCG) capable of emulating many CPUs. * - Xen - Linux (as dom0) - Arm, x86 + * - MSHV + - Linux (as dom0) + - x86 * - Hypervisor Framework (hvf) - MacOS - x86 (64 bit only), Arm (64 bit only) diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 6142f60..eaaa880 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -308,6 +308,19 @@ SRST ERST { + .name = "mshv", + .args_type = "", + .params = "", + .help = "show MSHV information", + .cmd = hmp_info_mshv, + }, + +SRST + ``info mshv`` + Show MSHV information. +ERST + + { .name = "numa", .args_type = "", .params = "", diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c index 3a612e2..682ed9f 100644 --- a/hw/core/machine-hmp-cmds.c +++ b/hw/core/machine-hmp-cmds.c @@ -163,6 +163,21 @@ void hmp_info_kvm(Monitor *mon, const QDict *qdict) qapi_free_KvmInfo(info); } +void hmp_info_mshv(Monitor *mon, const QDict *qdict) +{ + MshvInfo *info; + + info = qmp_query_mshv(NULL); + monitor_printf(mon, "mshv support: "); + if (info->present) { + monitor_printf(mon, "%s\n", info->enabled ? 
"enabled" : "disabled"); + } else { + monitor_printf(mon, "not compiled\n"); + } + + qapi_free_MshvInfo(info); +} + void hmp_info_uuid(Monitor *mon, const QDict *qdict) { UuidInfo *info; diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c index 6aca1a6..e24bf0d 100644 --- a/hw/core/machine-qmp-cmds.c +++ b/hw/core/machine-qmp-cmds.c @@ -28,6 +28,20 @@ #include "system/runstate.h" #include "system/system.h" #include "hw/s390x/storage-keys.h" +#include <sys/stat.h> + +/* + * QMP query for MSHV + */ +MshvInfo *qmp_query_mshv(Error **errp) +{ + MshvInfo *info = g_malloc0(sizeof(*info)); + + info->enabled = mshv_enabled(); + info->present = accel_find("mshv"); + + return info; +} /* * fast means: we NEVER interrupt vCPU threads to retrieve diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 34b0066..4d6bcbb 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -87,6 +87,8 @@ const size_t pc_compat_10_1_len = G_N_ELEMENTS(pc_compat_10_1); GlobalProperty pc_compat_10_0[] = { { TYPE_X86_CPU, "x-consistent-cache", "false" }, { TYPE_X86_CPU, "x-vendor-cpuid-only-v2", "false" }, + { TYPE_X86_CPU, "x-arch-cap-always-on", "true" }, + { TYPE_X86_CPU, "x-pdcm-on-even-without-pmu", "true" }, }; const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0); diff --git a/hw/intc/apic.c b/hw/intc/apic.c index bcb1035..6d78596 100644 --- a/hw/intc/apic.c +++ b/hw/intc/apic.c @@ -27,6 +27,7 @@ #include "hw/pci/msi.h" #include "qemu/host-utils.h" #include "system/kvm.h" +#include "system/mshv.h" #include "trace.h" #include "hw/i386/apic-msidef.h" #include "qapi/error.h" @@ -932,6 +933,13 @@ static void apic_send_msi(MSIMessage *msi) uint8_t trigger_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; uint8_t delivery = (data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x7; /* XXX: Ignore redirection hint. 
 */
+#ifdef CONFIG_MSHV
+    if (mshv_enabled()) {
+        mshv_request_interrupt(mshv_state, delivery, vector, dest,
+                               dest_mode, trigger_mode);
+        return;
+    }
+#endif
     apic_deliver_irq(dest, dest_mode, delivery, vector, trigger_mode);
 }
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 133bef8..e431d00 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -30,12 +30,18 @@
 #include "hw/intc/ioapic_internal.h"
 #include "hw/pci/msi.h"
 #include "hw/qdev-properties.h"
+#include "system/accel-irq.h"
 #include "system/kvm.h"
 #include "system/system.h"
 #include "hw/i386/apic-msidef.h"
 #include "hw/i386/x86-iommu.h"
 #include "trace.h"
+
+#if defined(CONFIG_KVM) || defined(CONFIG_MSHV)
+#define ACCEL_GSI_IRQFD_POSSIBLE
+#endif
+
 #define APIC_DELIVERY_MODE_SHIFT 8
 #define APIC_POLARITY_SHIFT 14
 #define APIC_TRIG_MODE_SHIFT 15
@@ -191,10 +197,10 @@ static void ioapic_set_irq(void *opaque, int vector, int level)
 static void ioapic_update_kvm_routes(IOAPICCommonState *s)
 {
-#ifdef CONFIG_KVM
+#ifdef ACCEL_GSI_IRQFD_POSSIBLE
     int i;
-    if (kvm_irqchip_is_split()) {
+    if (accel_irqchip_is_split()) {
         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
             MSIMessage msg;
             struct ioapic_entry_info info;
@@ -202,15 +208,15 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s)
             if (!info.masked) {
                 msg.address = info.addr;
                 msg.data = info.data;
-                kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL);
+                accel_irqchip_update_msi_route(i, msg, NULL);
             }
         }
-        kvm_irqchip_commit_routes(kvm_state);
+        accel_irqchip_commit_routes();
     }
 #endif
 }
-#ifdef CONFIG_KVM
+#ifdef ACCEL_GSI_IRQFD_POSSIBLE
 static void ioapic_iec_notifier(void *private, bool global,
                                 uint32_t index, uint32_t mask)
 {
@@ -428,11 +434,11 @@ static const MemoryRegionOps ioapic_io_ops = {
 static void ioapic_machine_done_notify(Notifier *notifier, void *data)
 {
-#ifdef CONFIG_KVM
+#ifdef ACCEL_GSI_IRQFD_POSSIBLE
     IOAPICCommonState *s = container_of(notifier, IOAPICCommonState,
                                         machine_done);
-    if (kvm_irqchip_is_split()) {
+    if (accel_irqchip_is_split()) {
         X86IOMMUState *iommu = x86_iommu_get_default();
         if (iommu) {
             /* Register this IOAPIC with IOMMU IEC notifier, so that
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index f87d274..5282089 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -652,7 +652,16 @@ static const PCIIOMMUOps s390_iommu_ops = {
     .get_address_space = s390_pci_dma_iommu,
 };
-static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set)
+/**
+ * set_ind_bit_atomic - Atomically set a bit in an indicator
+ *
+ * @ind_loc: Address of the indicator
+ * @to_be_set: Bit to set
+ *
+ * Returns true if the bit was set by this function, false if it was
+ * already set or mapping failed.
+ */
+static bool set_ind_bit_atomic(uint64_t ind_loc, uint8_t to_be_set)
 {
     uint8_t expected, actual;
     hwaddr len = 1;
@@ -662,7 +671,7 @@ static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set)
     ind_addr = cpu_physical_memory_map(ind_loc, &len, true);
     if (!ind_addr) {
         s390_pci_generate_error_event(ERR_EVENT_AIRERR, 0, 0, 0, 0);
-        return -1;
+        return false;
     }
     actual = *ind_addr;
     do {
@@ -671,7 +680,7 @@
     } while (actual != expected);
     cpu_physical_memory_unmap((void *)ind_addr, len, 1, len);
-    return actual;
+    return (actual & to_be_set) ?
false : true; } static void s390_msi_ctrl_write(void *opaque, hwaddr addr, uint64_t data, @@ -693,10 +702,10 @@ static void s390_msi_ctrl_write(void *opaque, hwaddr addr, uint64_t data, ind_bit = pbdev->routes.adapter.ind_offset; sum_bit = pbdev->routes.adapter.summary_offset; - set_ind_atomic(pbdev->routes.adapter.ind_addr + (ind_bit + vec) / 8, + set_ind_bit_atomic(pbdev->routes.adapter.ind_addr + (ind_bit + vec) / 8, 0x80 >> ((ind_bit + vec) % 8)); - if (!set_ind_atomic(pbdev->routes.adapter.summary_addr + sum_bit / 8, - 0x80 >> (sum_bit % 8))) { + if (set_ind_bit_atomic(pbdev->routes.adapter.summary_addr + sum_bit / 8, + 0x80 >> (sum_bit % 8))) { css_adapter_interrupt(CSS_IO_ADAPTER_PCI, pbdev->isc); } } @@ -891,6 +900,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) s390_pci_init_default_group(); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); + s390_pcihost_kvm_realize(); } static void s390_pcihost_unrealize(DeviceState *dev) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index ab96ce1..937e22f 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -34,6 +34,7 @@ #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/loader.h" +#include "system/accel-irq.h" #include "system/kvm.h" #include "hw/virtio/virtio-pci.h" #include "qemu/range.h" @@ -858,11 +859,11 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, if (irqfd->users == 0) { KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + ret = accel_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); + accel_irqchip_commit_route_changes(&c); irqfd->virq = ret; } irqfd->users++; @@ -874,7 +875,7 @@ static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy, { VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; if (--irqfd->users == 0) { - kvm_irqchip_release_virq(kvm_state, irqfd->virq); + accel_irqchip_release_virq(irqfd->virq); } } @@ -883,7 +884,7 @@ static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, unsigned int vector) { VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; - return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); + return accel_irqchip_add_irqfd_notifier_gsi(n, NULL, irqfd->virq); } static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, @@ -893,7 +894,7 @@ static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); + ret = accel_irqchip_remove_irqfd_notifier_gsi(n, irqfd->virq); assert(ret == 0); } static int virtio_pci_get_notifier(VirtIOPCIProxy *proxy, int queue_no, @@ -1028,12 +1029,12 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, - &proxy->pci_dev); + ret = accel_irqchip_update_msi_route(irqfd->virq, msg, + &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + accel_irqchip_commit_routes(); } } @@ -1262,7 +1263,7 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); int r, n; bool with_irqfd = msix_enabled(&proxy->pci_dev) && - 
kvm_msi_via_irqfd_enabled();
+        accel_msi_via_irqfd_enabled();
 
     nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX);
@@ -1466,7 +1467,7 @@ static void virtio_pci_set_vector(VirtIODevice *vdev, uint16_t new_vector)
 {
     bool kvm_irqfd = (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) &&
-        msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled();
+        msix_enabled(&proxy->pci_dev) && accel_msi_via_irqfd_enabled();
     if (new_vector == old_vector) {
         return;
diff --git a/include/exec/memopidx.h b/include/exec/memopidx.h
index eb7f159..66d9c58 100644
--- a/include/exec/memopidx.h
+++ b/include/exec/memopidx.h
@@ -25,9 +25,10 @@ typedef uint32_t MemOpIdx;
 static inline MemOpIdx make_memop_idx(MemOp op, unsigned idx)
 {
 #ifdef CONFIG_DEBUG_TCG
-    assert(idx <= 15);
+    assert(idx <= 31);
+    assert(clz32(op) >= 5);
 #endif
-    return (op << 4) | idx;
+    return (op << 5) | idx;
 }
 
 /**
@@ -38,7 +39,7 @@ static inline MemOpIdx make_memop_idx(MemOp op, unsigned idx)
  */
 static inline MemOp get_memop(MemOpIdx oi)
 {
-    return oi >> 4;
+    return oi >> 5;
 }
 
 /**
@@ -49,7 +50,7 @@ static inline MemOp get_memop(MemOpIdx oi)
  */
 static inline unsigned get_mmuidx(MemOpIdx oi)
 {
-    return oi & 15;
+    return oi & 31;
 }
 
 #endif
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 0fcbc92..e79e8e0 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -169,7 +169,7 @@ struct CPUClass {
     vaddr (*gdb_adjust_breakpoint)(CPUState *cpu, vaddr addr);
     const char *gdb_core_xml_file;
-    const gchar * (*gdb_arch_name)(CPUState *cpu);
+    const char * (*gdb_arch_name)(CPUState *cpu);
     const char * (*gdb_get_core_xml_file)(CPUState *cpu);
     void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
@@ -198,10 +198,11 @@ struct CPUClass {
 };
 
 /*
- * Fix the number of mmu modes to 16.
+ * Fix the number of mmu modes across all targets.
+ * The current maximum is required by target/arm/.
  */
-#define NB_MMU_MODES 16
-typedef uint16_t MMUIdxMap;
+#define NB_MMU_MODES 22
+typedef uint32_t MMUIdxMap;
 
 /* Use a fully associative victim tlb of 8 entries. */
 #define CPU_VTLB_SIZE 8
diff --git a/include/hw/hyperv/hvgdk.h b/include/hw/hyperv/hvgdk.h
new file mode 100644
index 0000000..71161f4
--- /dev/null
+++ b/include/hw/hyperv/hvgdk.h
@@ -0,0 +1,20 @@
+/*
+ * Type definitions for the mshv guest interface.
+ *
+ * Copyright Microsoft, Corp.
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVGDK_H +#define HW_HYPERV_HVGDK_H + +#define HVGDK_H_VERSION (25125) + +enum hv_unimplemented_msr_action { + HV_UNIMPLEMENTED_MSR_ACTION_FAULT = 0, + HV_UNIMPLEMENTED_MSR_ACTION_IGNORE_WRITE_READ_ZERO = 1, + HV_UNIMPLEMENTED_MSR_ACTION_COUNT = 2, +}; + +#endif /* HW_HYPERV_HVGDK_H */ diff --git a/include/hw/hyperv/hvgdk_mini.h b/include/hw/hyperv/hvgdk_mini.h new file mode 100644 index 0000000..d89315f --- /dev/null +++ b/include/hw/hyperv/hvgdk_mini.h @@ -0,0 +1,817 @@ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVGDK_MINI_H +#define HW_HYPERV_HVGDK_MINI_H + +#define MSHV_IOCTL 0xB8 + +typedef enum hv_register_name { + /* Pending Interruption Register */ + HV_REGISTER_PENDING_INTERRUPTION = 0x00010002, + + /* X64 User-Mode Registers */ + HV_X64_REGISTER_RAX = 0x00020000, + HV_X64_REGISTER_RCX = 0x00020001, + HV_X64_REGISTER_RDX = 0x00020002, + HV_X64_REGISTER_RBX = 0x00020003, + HV_X64_REGISTER_RSP = 0x00020004, + HV_X64_REGISTER_RBP = 0x00020005, + HV_X64_REGISTER_RSI = 0x00020006, + HV_X64_REGISTER_RDI = 0x00020007, + HV_X64_REGISTER_R8 = 0x00020008, + HV_X64_REGISTER_R9 = 0x00020009, + HV_X64_REGISTER_R10 = 0x0002000A, + HV_X64_REGISTER_R11 = 0x0002000B, + HV_X64_REGISTER_R12 = 0x0002000C, + HV_X64_REGISTER_R13 = 0x0002000D, + HV_X64_REGISTER_R14 = 0x0002000E, + HV_X64_REGISTER_R15 = 0x0002000F, + HV_X64_REGISTER_RIP = 0x00020010, + HV_X64_REGISTER_RFLAGS = 0x00020011, + + /* X64 Floating Point and Vector Registers */ + HV_X64_REGISTER_XMM0 = 0x00030000, + HV_X64_REGISTER_XMM1 = 0x00030001, + HV_X64_REGISTER_XMM2 = 0x00030002, + HV_X64_REGISTER_XMM3 = 0x00030003, + HV_X64_REGISTER_XMM4 = 0x00030004, + HV_X64_REGISTER_XMM5 = 0x00030005, + HV_X64_REGISTER_XMM6 = 0x00030006, + HV_X64_REGISTER_XMM7 = 0x00030007, + HV_X64_REGISTER_XMM8 = 0x00030008, + HV_X64_REGISTER_XMM9 = 0x00030009, + HV_X64_REGISTER_XMM10 = 0x0003000A, + HV_X64_REGISTER_XMM11 = 0x0003000B, + HV_X64_REGISTER_XMM12 = 0x0003000C, + HV_X64_REGISTER_XMM13 = 0x0003000D, + HV_X64_REGISTER_XMM14 = 0x0003000E, + HV_X64_REGISTER_XMM15 = 0x0003000F, + HV_X64_REGISTER_FP_MMX0 = 0x00030010, + HV_X64_REGISTER_FP_MMX1 = 0x00030011, + HV_X64_REGISTER_FP_MMX2 = 0x00030012, + HV_X64_REGISTER_FP_MMX3 = 0x00030013, + HV_X64_REGISTER_FP_MMX4 = 0x00030014, + HV_X64_REGISTER_FP_MMX5 = 0x00030015, + HV_X64_REGISTER_FP_MMX6 = 0x00030016, + HV_X64_REGISTER_FP_MMX7 = 0x00030017, + HV_X64_REGISTER_FP_CONTROL_STATUS = 0x00030018, + HV_X64_REGISTER_XMM_CONTROL_STATUS = 0x00030019, + + /* X64 Control Registers */ + HV_X64_REGISTER_CR0 = 0x00040000, + HV_X64_REGISTER_CR2 = 0x00040001, + HV_X64_REGISTER_CR3 = 0x00040002, + HV_X64_REGISTER_CR4 = 0x00040003, + HV_X64_REGISTER_CR8 = 0x00040004, + HV_X64_REGISTER_XFEM = 0x00040005, + + /* X64 Segment Registers */ + HV_X64_REGISTER_ES = 0x00060000, + HV_X64_REGISTER_CS = 0x00060001, + HV_X64_REGISTER_SS = 0x00060002, + HV_X64_REGISTER_DS = 0x00060003, + HV_X64_REGISTER_FS = 0x00060004, + HV_X64_REGISTER_GS = 0x00060005, + HV_X64_REGISTER_LDTR = 0x00060006, + HV_X64_REGISTER_TR = 0x00060007, + + /* X64 Table Registers */ + HV_X64_REGISTER_IDTR = 0x00070000, + HV_X64_REGISTER_GDTR = 0x00070001, + + /* X64 Virtualized MSRs */ + HV_X64_REGISTER_TSC = 0x00080000, + HV_X64_REGISTER_EFER = 0x00080001, + HV_X64_REGISTER_KERNEL_GS_BASE = 0x00080002, + HV_X64_REGISTER_APIC_BASE = 0x00080003, + HV_X64_REGISTER_PAT = 0x00080004, + 
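+    /*
+     * Note: HV_X64_REGISTER_SYSENTER_EIP (0x00080006) precedes
+     * HV_X64_REGISTER_SYSENTER_ESP (0x00080007), the reverse of the
+     * IA32 MSR numbering, where IA32_MSR_SYSENTER_ESP is 0x175 and
+     * IA32_MSR_SYSENTER_EIP is 0x176.
+     */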
HV_X64_REGISTER_SYSENTER_CS = 0x00080005, + HV_X64_REGISTER_SYSENTER_EIP = 0x00080006, + HV_X64_REGISTER_SYSENTER_ESP = 0x00080007, + HV_X64_REGISTER_STAR = 0x00080008, + HV_X64_REGISTER_LSTAR = 0x00080009, + HV_X64_REGISTER_CSTAR = 0x0008000A, + HV_X64_REGISTER_SFMASK = 0x0008000B, + HV_X64_REGISTER_INITIAL_APIC_ID = 0x0008000C, + + /* X64 Cache control MSRs */ + HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D, + HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F, + HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070, + HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071, + HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072, + HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073, + HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074, + HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075, + HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076, + HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077, + HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078, + HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079, + HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A, + + HV_X64_REGISTER_TSC_AUX = 0x0008007B, + HV_X64_REGISTER_BNDCFGS = 0x0008007C, + HV_X64_REGISTER_DEBUG_CTL = 0x0008007D, + + /* Available */ + + HV_X64_REGISTER_SPEC_CTRL = 0x00080084, + HV_X64_REGISTER_TSC_ADJUST = 0x00080096, + + /* Other MSRs */ + HV_X64_REGISTER_MSR_IA32_MISC_ENABLE = 0x000800A0, + + /* Misc */ + HV_REGISTER_GUEST_OS_ID = 0x00090002, + HV_REGISTER_REFERENCE_TSC = 0x00090017, + + /* Hypervisor-defined Registers (Synic) */ + HV_REGISTER_SINT0 = 0x000A0000, + HV_REGISTER_SINT1 = 0x000A0001, + HV_REGISTER_SINT2 = 0x000A0002, + HV_REGISTER_SINT3 = 0x000A0003, + HV_REGISTER_SINT4 = 0x000A0004, + HV_REGISTER_SINT5 = 0x000A0005, + HV_REGISTER_SINT6 = 0x000A0006, + HV_REGISTER_SINT7 = 0x000A0007, + HV_REGISTER_SINT8 = 0x000A0008, + HV_REGISTER_SINT9 = 0x000A0009, + HV_REGISTER_SINT10 = 0x000A000A, + HV_REGISTER_SINT11 = 
0x000A000B, + HV_REGISTER_SINT12 = 0x000A000C, + HV_REGISTER_SINT13 = 0x000A000D, + HV_REGISTER_SINT14 = 0x000A000E, + HV_REGISTER_SINT15 = 0x000A000F, + HV_REGISTER_SCONTROL = 0x000A0010, + HV_REGISTER_SVERSION = 0x000A0011, + HV_REGISTER_SIEFP = 0x000A0012, + HV_REGISTER_SIMP = 0x000A0013, + HV_REGISTER_EOM = 0x000A0014, + HV_REGISTER_SIRBP = 0x000A0015, +} hv_register_name; + +enum hv_intercept_type { + HV_INTERCEPT_TYPE_X64_IO_PORT = 0X00000000, + HV_INTERCEPT_TYPE_X64_MSR = 0X00000001, + HV_INTERCEPT_TYPE_X64_CPUID = 0X00000002, + HV_INTERCEPT_TYPE_EXCEPTION = 0X00000003, + + /* Used to be HV_INTERCEPT_TYPE_REGISTER */ + HV_INTERCEPT_TYPE_RESERVED0 = 0X00000004, + HV_INTERCEPT_TYPE_MMIO = 0X00000005, + HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0X00000006, + HV_INTERCEPT_TYPE_X64_APIC_SMI = 0X00000007, + HV_INTERCEPT_TYPE_HYPERCALL = 0X00000008, + + HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 0X00000009, + HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0X0000000A, + + HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0X0000000B, + HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0X0000000C, + HV_INTERCEPT_TYPE_MAX, + HV_INTERCEPT_TYPE_INVALID = 0XFFFFFFFF, +}; + +struct hv_u128 { + uint64_t low_part; + uint64_t high_part; +}; + +union hv_x64_xmm_control_status_register { + struct hv_u128 as_uint128; + struct { + union { + /* long mode */ + uint64_t last_fp_rdp; + /* 32 bit mode */ + struct { + uint32_t last_fp_dp; + uint16_t last_fp_ds; + uint16_t padding; + }; + }; + uint32_t xmm_status_control; + uint32_t xmm_status_control_mask; + }; +}; + +union hv_x64_fp_register { + struct hv_u128 as_uint128; + struct { + uint64_t mantissa; + uint64_t biased_exponent:15; + uint64_t sign:1; + uint64_t reserved:48; + }; +}; + +union hv_x64_pending_exception_event { + uint64_t as_uint64[2]; + struct { + uint32_t event_pending:1; + uint32_t event_type:3; + uint32_t reserved0:4; + uint32_t deliver_error_code:1; + uint32_t reserved1:7; + uint32_t vector:16; + uint32_t error_code; + uint64_t exception_parameter; + }; +}; + +union hv_x64_pending_virtualization_fault_event { + uint64_t as_uint64[2]; + struct { + uint32_t event_pending:1; + uint32_t event_type:3; + uint32_t reserved0:4; + uint32_t reserved1:8; + uint32_t parameter0:16; + uint32_t code; + uint64_t parameter1; + }; +}; + +union hv_x64_pending_interruption_register { + uint64_t as_uint64; + struct { + uint32_t interruption_pending:1; + uint32_t interruption_type:3; + uint32_t deliver_error_code:1; + uint32_t instruction_length:4; + uint32_t nested_event:1; + uint32_t reserved:6; + uint32_t interruption_vector:16; + uint32_t error_code; + }; +}; + +union hv_x64_register_sev_control { + uint64_t as_uint64; + struct { + uint64_t enable_encrypted_state:1; + uint64_t reserved_z:11; + uint64_t vmsa_gpa_page_number:52; + }; +}; + +union hv_x64_msr_npiep_config_contents { + uint64_t as_uint64; + struct { + /* + * These bits enable instruction execution prevention for + * specific instructions. + */ + uint64_t prevents_gdt:1; + uint64_t prevents_idt:1; + uint64_t prevents_ldt:1; + uint64_t prevents_tr:1; + + /* The reserved bits must always be 0. 
*/ + uint64_t reserved:60; + }; +}; + +typedef struct hv_x64_segment_register { + uint64_t base; + uint32_t limit; + uint16_t selector; + union { + struct { + uint16_t segment_type:4; + uint16_t non_system_segment:1; + uint16_t descriptor_privilege_level:2; + uint16_t present:1; + uint16_t reserved:4; + uint16_t available:1; + uint16_t _long:1; + uint16_t _default:1; + uint16_t granularity:1; + }; + uint16_t attributes; + }; +} hv_x64_segment_register; + +typedef struct hv_x64_table_register { + uint16_t pad[3]; + uint16_t limit; + uint64_t base; +} hv_x64_table_register; + +union hv_x64_fp_control_status_register { + struct hv_u128 as_uint128; + struct { + uint16_t fp_control; + uint16_t fp_status; + uint8_t fp_tag; + uint8_t reserved; + uint16_t last_fp_op; + union { + /* long mode */ + uint64_t last_fp_rip; + /* 32 bit mode */ + struct { + uint32_t last_fp_eip; + uint16_t last_fp_cs; + uint16_t padding; + }; + }; + }; +}; + +/* General Hypervisor Register Content Definitions */ + +union hv_explicit_suspend_register { + uint64_t as_uint64; + struct { + uint64_t suspended:1; + uint64_t reserved:63; + }; +}; + +union hv_internal_activity_register { + uint64_t as_uint64; + + struct { + uint64_t startup_suspend:1; + uint64_t halt_suspend:1; + uint64_t idle_suspend:1; + uint64_t rsvd_z:61; + }; +}; + +union hv_x64_interrupt_state_register { + uint64_t as_uint64; + struct { + uint64_t interrupt_shadow:1; + uint64_t nmi_masked:1; + uint64_t reserved:62; + }; +}; + +union hv_intercept_suspend_register { + uint64_t as_uint64; + struct { + uint64_t suspended:1; + uint64_t reserved:63; + }; +}; + +typedef union hv_register_value { + struct hv_u128 reg128; + uint64_t reg64; + uint32_t reg32; + uint16_t reg16; + uint8_t reg8; + union hv_x64_fp_register fp; + union hv_x64_fp_control_status_register fp_control_status; + union hv_x64_xmm_control_status_register xmm_control_status; + struct hv_x64_segment_register segment; + struct hv_x64_table_register table; + union hv_explicit_suspend_register explicit_suspend; + union hv_intercept_suspend_register intercept_suspend; + union hv_internal_activity_register internal_activity; + union hv_x64_interrupt_state_register interrupt_state; + union hv_x64_pending_interruption_register pending_interruption; + union hv_x64_msr_npiep_config_contents npiep_config; + union hv_x64_pending_exception_event pending_exception_event; + union hv_x64_pending_virtualization_fault_event + pending_virtualization_fault_event; + union hv_x64_register_sev_control sev_control; +} hv_register_value; + +typedef struct hv_register_assoc { + uint32_t name; /* enum hv_register_name */ + uint32_t reserved1; + uint64_t reserved2; + union hv_register_value value; +} hv_register_assoc; + +union hv_input_vtl { + uint8_t as_uint8; + struct { + uint8_t target_vtl:4; + uint8_t use_target_vtl:1; + uint8_t reserved_z:3; + }; +}; + +typedef struct hv_input_get_vp_registers { + uint64_t partition_id; + uint32_t vp_index; + union hv_input_vtl input_vtl; + uint8_t rsvd_z8; + uint16_t rsvd_z16; + uint32_t names[]; +} hv_input_get_vp_registers; + +typedef struct hv_input_set_vp_registers { + uint64_t partition_id; + uint32_t vp_index; + union hv_input_vtl input_vtl; + uint8_t rsvd_z8; + uint16_t rsvd_z16; + struct hv_register_assoc elements[]; +} hv_input_set_vp_registers; + +#define MSHV_VP_MAX_REGISTERS 128 + +struct mshv_vp_registers { + int count; /* at most MSHV_VP_MAX_REGISTERS */ + struct hv_register_assoc *regs; +}; + +union hv_interrupt_control { + uint64_t as_uint64; + struct { + uint32_t 
interrupt_type; /* enum hv_interrupt type */ + uint32_t level_triggered:1; + uint32_t logical_dest_mode:1; + uint32_t rsvd:30; + }; +}; + +struct hv_input_assert_virtual_interrupt { + uint64_t partition_id; + union hv_interrupt_control control; + uint64_t dest_addr; /* cpu's apic id */ + uint32_t vector; + uint8_t target_vtl; + uint8_t rsvd_z0; + uint16_t rsvd_z1; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +struct hv_local_interrupt_controller_state { + /* HV_X64_INTERRUPT_CONTROLLER_STATE */ + uint32_t apic_id; + uint32_t apic_version; + uint32_t apic_ldr; + uint32_t apic_dfr; + uint32_t apic_spurious; + uint32_t apic_isr[8]; + uint32_t apic_tmr[8]; + uint32_t apic_irr[8]; + uint32_t apic_esr; + uint32_t apic_icr_high; + uint32_t apic_icr_low; + uint32_t apic_lvt_timer; + uint32_t apic_lvt_thermal; + uint32_t apic_lvt_perfmon; + uint32_t apic_lvt_lint0; + uint32_t apic_lvt_lint1; + uint32_t apic_lvt_error; + uint32_t apic_lvt_cmci; + uint32_t apic_error_status; + uint32_t apic_initial_count; + uint32_t apic_counter_value; + uint32_t apic_divide_configuration; + uint32_t apic_remote_read; +}; + +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* From hvgdk_mini.h */ + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_EOM 0x40000084 + +/* Define port identifier type. */ +union hv_port_id { + uint32_t asuint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + }; +}; + +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define hypervisor message types. */ +enum hv_message_type { + HVMSG_NONE = 0x00000000, + + /* Memory access messages. */ + HVMSG_UNMAPPED_GPA = 0x80000000, + HVMSG_GPA_INTERCEPT = 0x80000001, + HVMSG_UNACCEPTED_GPA = 0x80000003, + HVMSG_GPA_ATTRIBUTE_INTERCEPT = 0x80000004, + + /* Timer notification messages. */ + HVMSG_TIMER_EXPIRED = 0x80000010, + + /* Error messages. 
 */
+    HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+    HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
+    HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
+
+    /*
+     * Opaque intercept message. The original intercept message is only
+     * accessible from the mapped intercept message page.
+     */
+    HVMSG_OPAQUE_INTERCEPT = 0x8000003F,
+
+    /* Trace buffer complete messages. */
+    HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
+
+    /* Hypercall intercept */
+    HVMSG_HYPERCALL_INTERCEPT = 0x80000050,
+
+    /* SynIC intercepts */
+    HVMSG_SYNIC_EVENT_INTERCEPT = 0x80000060,
+    HVMSG_SYNIC_SINT_INTERCEPT = 0x80000061,
+    HVMSG_SYNIC_SINT_DELIVERABLE = 0x80000062,
+
+    /* Async call completion intercept */
+    HVMSG_ASYNC_CALL_COMPLETION = 0x80000070,
+
+    /* Root scheduler messages */
+    HVMSG_SCHEDULER_VP_SIGNAL_BITSET = 0x80000100,
+    HVMSG_SCHEDULER_VP_SIGNAL_PAIR = 0x80000101,
+
+    /* Platform-specific processor intercept messages. */
+    HVMSG_X64_IO_PORT_INTERCEPT = 0x80010000,
+    HVMSG_X64_MSR_INTERCEPT = 0x80010001,
+    HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
+    HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
+    HVMSG_X64_APIC_EOI = 0x80010004,
+    HVMSG_X64_LEGACY_FP_ERROR = 0x80010005,
+    HVMSG_X64_IOMMU_PRQ = 0x80010006,
+    HVMSG_X64_HALT = 0x80010007,
+    HVMSG_X64_INTERRUPTION_DELIVERABLE = 0x80010008,
+    HVMSG_X64_SIPI_INTERCEPT = 0x80010009,
+    HVMSG_X64_SEV_VMGEXIT_INTERCEPT = 0x80010013,
+};
+
+union hv_x64_vp_execution_state {
+    uint16_t as_uint16;
+    struct {
+        uint16_t cpl:2;
+        uint16_t cr0_pe:1;
+        uint16_t cr0_am:1;
+        uint16_t efer_lma:1;
+        uint16_t debug_active:1;
+        uint16_t interruption_pending:1;
+        uint16_t vtl:4;
+        uint16_t enclave_mode:1;
+        uint16_t interrupt_shadow:1;
+        uint16_t virtualization_fault_active:1;
+        uint16_t reserved:2;
+    };
+};
+
+/* From openvmm::hvdef */
+enum hv_x64_intercept_access_type {
+    HV_X64_INTERCEPT_ACCESS_TYPE_READ = 0,
+    HV_X64_INTERCEPT_ACCESS_TYPE_WRITE = 1,
+    HV_X64_INTERCEPT_ACCESS_TYPE_EXECUTE = 2,
+};
+
+struct hv_x64_intercept_message_header {
+    uint32_t vp_index;
+    uint8_t instruction_length:4;
+    uint8_t cr8:4; /* Only set for exo partitions */
+    uint8_t intercept_access_type;
+    union hv_x64_vp_execution_state execution_state;
+    struct hv_x64_segment_register cs_segment;
+    uint64_t rip;
+    uint64_t rflags;
+};
+
+union hv_x64_io_port_access_info {
+    uint8_t as_uint8;
+    struct {
+        uint8_t access_size:3;
+        uint8_t string_op:1;
+        uint8_t rep_prefix:1;
+        uint8_t reserved:3;
+    };
+};
+
+typedef struct hv_x64_io_port_intercept_message {
+    struct hv_x64_intercept_message_header header;
+    uint16_t port_number;
+    union hv_x64_io_port_access_info access_info;
+    uint8_t instruction_byte_count;
+    uint32_t reserved;
+    uint64_t rax;
+    uint8_t instruction_bytes[16];
+    struct hv_x64_segment_register ds_segment;
+    struct hv_x64_segment_register es_segment;
+    uint64_t rcx;
+    uint64_t rsi;
+    uint64_t rdi;
+} hv_x64_io_port_intercept_message;
+
+union hv_x64_memory_access_info {
+    uint8_t as_uint8;
+    struct {
+        uint8_t gva_valid:1;
+        uint8_t gva_gpa_valid:1;
+        uint8_t hypercall_output_pending:1;
+        uint8_t tlb_locked_no_overlay:1;
+        uint8_t reserved:4;
+    };
+};
+
+struct hv_x64_memory_intercept_message {
+    struct hv_x64_intercept_message_header header;
+    uint32_t cache_type; /* enum hv_cache_type */
+    uint8_t instruction_byte_count;
+    union hv_x64_memory_access_info memory_access_info;
+    uint8_t tpr_priority;
+    uint8_t reserved1;
+    uint64_t guest_virtual_address;
+    uint64_t guest_physical_address;
+    uint8_t instruction_bytes[16];
+};
+
+union hv_message_flags {
+    uint8_t asu8;
+    struct {
+        uint8_t msg_pending:1;
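+        /*
+         * msg_pending: the hypervisor has more messages queued for
+         * this SINT; after emptying the slot the guest writes to the
+         * EOM MSR to have the next message delivered.
+         */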
uint8_t reserved:7; + }; +}; + +struct hv_message_header { + uint32_t message_type; + uint8_t payload_size; + union hv_message_flags message_flags; + uint8_t reserved[2]; + union { + uint64_t sender; + union hv_port_id port; + }; +}; + +struct hv_message { + struct hv_message_header header; + union { + uint64_t payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +}; + +/* From github.com/rust-vmm/mshv-bindings/src/x86_64/regs.rs */ + +struct hv_cpuid_entry { + uint32_t function; + uint32_t index; + uint32_t flags; + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint32_t padding[3]; +}; + +struct hv_cpuid { + uint32_t nent; + uint32_t padding; + struct hv_cpuid_entry entries[0]; +}; + +#define IA32_MSR_TSC 0x00000010 +#define IA32_MSR_EFER 0xC0000080 +#define IA32_MSR_KERNEL_GS_BASE 0xC0000102 +#define IA32_MSR_APIC_BASE 0x0000001B +#define IA32_MSR_PAT 0x0277 +#define IA32_MSR_SYSENTER_CS 0x00000174 +#define IA32_MSR_SYSENTER_ESP 0x00000175 +#define IA32_MSR_SYSENTER_EIP 0x00000176 +#define IA32_MSR_STAR 0xC0000081 +#define IA32_MSR_LSTAR 0xC0000082 +#define IA32_MSR_CSTAR 0xC0000083 +#define IA32_MSR_SFMASK 0xC0000084 + +#define IA32_MSR_MTRR_CAP 0x00FE +#define IA32_MSR_MTRR_DEF_TYPE 0x02FF +#define IA32_MSR_MTRR_PHYSBASE0 0x0200 +#define IA32_MSR_MTRR_PHYSMASK0 0x0201 +#define IA32_MSR_MTRR_PHYSBASE1 0x0202 +#define IA32_MSR_MTRR_PHYSMASK1 0x0203 +#define IA32_MSR_MTRR_PHYSBASE2 0x0204 +#define IA32_MSR_MTRR_PHYSMASK2 0x0205 +#define IA32_MSR_MTRR_PHYSBASE3 0x0206 +#define IA32_MSR_MTRR_PHYSMASK3 0x0207 +#define IA32_MSR_MTRR_PHYSBASE4 0x0208 +#define IA32_MSR_MTRR_PHYSMASK4 0x0209 +#define IA32_MSR_MTRR_PHYSBASE5 0x020A +#define IA32_MSR_MTRR_PHYSMASK5 0x020B +#define IA32_MSR_MTRR_PHYSBASE6 0x020C +#define IA32_MSR_MTRR_PHYSMASK6 0x020D +#define IA32_MSR_MTRR_PHYSBASE7 0x020E +#define IA32_MSR_MTRR_PHYSMASK7 0x020F + +#define IA32_MSR_MTRR_FIX64K_00000 0x0250 +#define IA32_MSR_MTRR_FIX16K_80000 0x0258 +#define IA32_MSR_MTRR_FIX16K_A0000 0x0259 +#define IA32_MSR_MTRR_FIX4K_C0000 0x0268 +#define IA32_MSR_MTRR_FIX4K_C8000 0x0269 +#define IA32_MSR_MTRR_FIX4K_D0000 0x026A +#define IA32_MSR_MTRR_FIX4K_D8000 0x026B +#define IA32_MSR_MTRR_FIX4K_E0000 0x026C +#define IA32_MSR_MTRR_FIX4K_E8000 0x026D +#define IA32_MSR_MTRR_FIX4K_F0000 0x026E +#define IA32_MSR_MTRR_FIX4K_F8000 0x026F + +#define IA32_MSR_TSC_AUX 0xC0000103 +#define IA32_MSR_BNDCFGS 0x00000d90 +#define IA32_MSR_DEBUG_CTL 0x1D9 +#define IA32_MSR_SPEC_CTRL 0x00000048 +#define IA32_MSR_TSC_ADJUST 0x0000003b + +#define IA32_MSR_MISC_ENABLE 0x000001a0 + +#define HV_TRANSLATE_GVA_VALIDATE_READ (0x0001) +#define HV_TRANSLATE_GVA_VALIDATE_WRITE (0x0002) +#define HV_TRANSLATE_GVA_VALIDATE_EXECUTE (0x0004) + +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +#define HVCALL_GET_PARTITION_PROPERTY 0x0044 +#define HVCALL_SET_PARTITION_PROPERTY 0x0045 +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 +#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 +#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 + +#endif /* HW_HYPERV_HVGDK_MINI_H */ diff --git a/include/hw/hyperv/hvhdk.h b/include/hw/hyperv/hvhdk.h new file mode 100644 index 0000000..866c821 --- /dev/null +++ b/include/hw/hyperv/hvhdk.h @@ -0,0 +1,249 @@ +/* + * Type definitions for the mshv host. + * + * Copyright Microsoft, Corp. 
2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVHDK_H +#define HW_HYPERV_HVHDK_H + +#define HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS 1 + +struct hv_input_set_partition_property { + uint64_t partition_id; + uint32_t property_code; /* enum hv_partition_property_code */ + uint32_t padding; + uint64_t property_value; +}; + +union hv_partition_synthetic_processor_features { + uint64_t as_uint64[HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS]; + + struct { + /* + * Report a hypervisor is present. CPUID leaves + * 0x40000000 and 0x40000001 are supported. + */ + uint64_t hypervisor_present:1; + + /* + * Features associated with HV#1: + */ + + /* Report support for Hv1 (CPUID leaves 0x40000000 - 0x40000006). */ + uint64_t hv1:1; + + /* + * Access to HV_X64_MSR_VP_RUNTIME. + * Corresponds to access_vp_run_time_reg privilege. + */ + uint64_t access_vp_run_time_reg:1; + + /* + * Access to HV_X64_MSR_TIME_REF_COUNT. + * Corresponds to access_partition_reference_counter privilege. + */ + uint64_t access_partition_reference_counter:1; + + /* + * Access to SINT-related registers (HV_X64_MSR_SCONTROL through + * HV_X64_MSR_EOM and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15). + * Corresponds to access_synic_regs privilege. + */ + uint64_t access_synic_regs:1; + + /* + * Access to synthetic timers and associated MSRs + * (HV_X64_MSR_STIMER0_CONFIG through HV_X64_MSR_STIMER3_COUNT). + * Corresponds to access_synthetic_timer_regs privilege. + */ + uint64_t access_synthetic_timer_regs:1; + + /* + * Access to APIC MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and + * HV_X64_MSR_TPR) as well as the VP assist page. + * Corresponds to access_intr_ctrl_regs privilege. + */ + uint64_t access_intr_ctrl_regs:1; + + /* + * Access to registers associated with hypercalls + * (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL). + * Corresponds to access_hypercall_msrs privilege. + */ + uint64_t access_hypercall_regs:1; + + /* VP index can be queried. corresponds to access_vp_index privilege. */ + uint64_t access_vp_index:1; + + /* + * Access to the reference TSC. Corresponds to + * access_partition_reference_tsc privilege. + */ + uint64_t access_partition_reference_tsc:1; + + /* + * Partition has access to the guest idle reg. Corresponds to + * access_guest_idle_reg privilege. + */ + uint64_t access_guest_idle_reg:1; + + /* + * Partition has access to frequency regs. corresponds to + * access_frequency_regs privilege. + */ + uint64_t access_frequency_regs:1; + + uint64_t reserved_z12:1; /* Reserved for access_reenlightenment_controls */ + uint64_t reserved_z13:1; /* Reserved for access_root_scheduler_reg */ + uint64_t reserved_z14:1; /* Reserved for access_tsc_invariant_controls */ + + /* + * Extended GVA ranges for HvCallFlushVirtualAddressList hypercall. + * Corresponds to privilege. + */ + uint64_t enable_extended_gva_ranges_for_flush_virtual_address_list:1; + + uint64_t reserved_z16:1; /* Reserved for access_vsm. */ + uint64_t reserved_z17:1; /* Reserved for access_vp_registers. */ + + /* Use fast hypercall output. Corresponds to privilege. */ + uint64_t fast_hypercall_output:1; + + uint64_t reserved_z19:1; /* Reserved for enable_extended_hypercalls. */ + + /* + * HvStartVirtualProcessor can be used to start virtual processors. + * Corresponds to privilege. + */ + uint64_t start_virtual_processor:1; + + uint64_t reserved_z21:1; /* Reserved for Isolation. */ + + /* Synthetic timers in direct mode. 
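+         * In direct mode a timer expiry asserts the configured
+         * interrupt vector directly, with no SynIC message.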
*/ + uint64_t direct_synthetic_timers:1; + + uint64_t reserved_z23:1; /* Reserved for synthetic time unhalted timer */ + + /* Use extended processor masks. */ + uint64_t extended_processor_masks:1; + + /* + * HvCallFlushVirtualAddressSpace / HvCallFlushVirtualAddressList are + * supported. + */ + uint64_t tb_flush_hypercalls:1; + + /* HvCallSendSyntheticClusterIpi is supported. */ + uint64_t synthetic_cluster_ipi:1; + + /* HvCallNotifyLongSpinWait is supported. */ + uint64_t notify_long_spin_wait:1; + + /* HvCallQueryNumaDistance is supported. */ + uint64_t query_numa_distance:1; + + /* HvCallSignalEvent is supported. Corresponds to privilege. */ + uint64_t signal_events:1; + + /* HvCallRetargetDeviceInterrupt is supported. */ + uint64_t retarget_device_interrupt:1; + + /* HvCallRestorePartitionTime is supported. */ + uint64_t restore_time:1; + + /* EnlightenedVmcs nested enlightenment is supported. */ + uint64_t enlightened_vmcs:1; + + uint64_t reserved:30; + }; +}; + +enum hv_translate_gva_result_code { + HV_TRANSLATE_GVA_SUCCESS = 0, + + /* Translation failures. */ + HV_TRANSLATE_GVA_PAGE_NOT_PRESENT = 1, + HV_TRANSLATE_GVA_PRIVILEGE_VIOLATION = 2, + HV_TRANSLATE_GVA_INVALIDE_PAGE_TABLE_FLAGS = 3, + + /* GPA access failures. */ + HV_TRANSLATE_GVA_GPA_UNMAPPED = 4, + HV_TRANSLATE_GVA_GPA_NO_READ_ACCESS = 5, + HV_TRANSLATE_GVA_GPA_NO_WRITE_ACCESS = 6, + HV_TRANSLATE_GVA_GPA_ILLEGAL_OVERLAY_ACCESS = 7, + + /* + * Intercept for memory access by either + * - a higher VTL + * - a nested hypervisor (due to a violation of the nested page table) + */ + HV_TRANSLATE_GVA_INTERCEPT = 8, + + HV_TRANSLATE_GVA_GPA_UNACCEPTED = 9, +}; + +union hv_translate_gva_result { + uint64_t as_uint64; + struct { + uint32_t result_code; /* enum hv_translate_hva_result_code */ + uint32_t cache_type:8; + uint32_t overlay_page:1; + uint32_t reserved:23; + }; +}; + +typedef struct hv_input_translate_virtual_address { + uint64_t partition_id; + uint32_t vp_index; + uint32_t padding; + uint64_t control_flags; + uint64_t gva_page; +} hv_input_translate_virtual_address; + +typedef struct hv_output_translate_virtual_address { + union hv_translate_gva_result translation_result; + uint64_t gpa_page; +} hv_output_translate_virtual_address; + +typedef struct hv_register_x64_cpuid_result_parameters { + struct { + uint32_t eax; + uint32_t ecx; + uint8_t subleaf_specific; + uint8_t always_override; + uint16_t padding; + } input; + struct { + uint32_t eax; + uint32_t eax_mask; + uint32_t ebx; + uint32_t ebx_mask; + uint32_t ecx; + uint32_t ecx_mask; + uint32_t edx; + uint32_t edx_mask; + } result; +} hv_register_x64_cpuid_result_parameters; + +typedef struct hv_register_x64_msr_result_parameters { + uint32_t msr_index; + uint32_t access_type; + uint32_t action; /* enum hv_unimplemented_msr_action */ +} hv_register_x64_msr_result_parameters; + +union hv_register_intercept_result_parameters { + struct hv_register_x64_cpuid_result_parameters cpuid; + struct hv_register_x64_msr_result_parameters msr; +}; + +typedef struct hv_input_register_intercept_result { + uint64_t partition_id; + uint32_t vp_index; + uint32_t intercept_type; /* enum hv_intercept_type */ + union hv_register_intercept_result_parameters parameters; +} hv_input_register_intercept_result; + +#endif /* HW_HYPERV_HVHDK_H */ diff --git a/include/hw/hyperv/hvhdk_mini.h b/include/hw/hyperv/hvhdk_mini.h new file mode 100644 index 0000000..9c2f3cf --- /dev/null +++ b/include/hw/hyperv/hvhdk_mini.h @@ -0,0 +1,102 @@ +/* + * Type definitions for the mshv host interface. 
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_HYPERV_HVHDK_MINI_H
+#define HW_HYPERV_HVHDK_MINI_H
+
+#define HVHDK_MINI_VERSION (25294)
+
+/* Each generic set contains 64 elements */
+#define HV_GENERIC_SET_SHIFT (6)
+#define HV_GENERIC_SET_MASK (63)
+
+enum hv_generic_set_format {
+    HV_GENERIC_SET_SPARSE_4K,
+    HV_GENERIC_SET_ALL,
+};
+
+enum hv_partition_property_code {
+    /* Privilege properties */
+    HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000,
+    HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES = 0x00010001,
+
+    /* Scheduling properties */
+    HV_PARTITION_PROPERTY_SUSPEND = 0x00020000,
+    HV_PARTITION_PROPERTY_CPU_RESERVE = 0x00020001,
+    HV_PARTITION_PROPERTY_CPU_CAP = 0x00020002,
+    HV_PARTITION_PROPERTY_CPU_WEIGHT = 0x00020003,
+    HV_PARTITION_PROPERTY_CPU_GROUP_ID = 0x00020004,
+
+    /* Time properties */
+    HV_PARTITION_PROPERTY_TIME_FREEZE = 0x00030003,
+    HV_PARTITION_PROPERTY_REFERENCE_TIME = 0x00030005,
+
+    /* Debugging properties */
+    HV_PARTITION_PROPERTY_DEBUG_CHANNEL_ID = 0x00040000,
+
+    /* Resource properties */
+    HV_PARTITION_PROPERTY_VIRTUAL_TLB_PAGE_COUNT = 0x00050000,
+    HV_PARTITION_PROPERTY_VSM_CONFIG = 0x00050001,
+    HV_PARTITION_PROPERTY_ZERO_MEMORY_ON_RESET = 0x00050002,
+    HV_PARTITION_PROPERTY_PROCESSORS_PER_SOCKET = 0x00050003,
+    HV_PARTITION_PROPERTY_NESTED_TLB_SIZE = 0x00050004,
+    HV_PARTITION_PROPERTY_GPA_PAGE_ACCESS_TRACKING = 0x00050005,
+    HV_PARTITION_PROPERTY_VSM_PERMISSIONS_DIRTY_SINCE_LAST_QUERY = 0x00050006,
+    HV_PARTITION_PROPERTY_SGX_LAUNCH_CONTROL_CONFIG = 0x00050007,
+    HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL0 = 0x00050008,
+    HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL1 = 0x00050009,
+    HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL2 = 0x0005000a,
+    HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL3 = 0x0005000b,
+    HV_PARTITION_PROPERTY_ISOLATION_STATE = 0x0005000c,
+    HV_PARTITION_PROPERTY_ISOLATION_CONTROL = 0x0005000d,
+    HV_PARTITION_PROPERTY_ALLOCATION_ID = 0x0005000e,
+    HV_PARTITION_PROPERTY_MONITORING_ID = 0x0005000f,
+    HV_PARTITION_PROPERTY_IMPLEMENTED_PHYSICAL_ADDRESS_BITS = 0x00050010,
+    HV_PARTITION_PROPERTY_NON_ARCHITECTURAL_CORE_SHARING = 0x00050011,
+    HV_PARTITION_PROPERTY_HYPERCALL_DOORBELL_PAGE = 0x00050012,
+    HV_PARTITION_PROPERTY_ISOLATION_POLICY = 0x00050014,
+    HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION = 0x00050017,
+    HV_PARTITION_PROPERTY_SEV_VMGEXIT_OFFLOADS = 0x00050022,
+
+    /* Compatibility properties */
+    HV_PARTITION_PROPERTY_PROCESSOR_VENDOR = 0x00060000,
+    HV_PARTITION_PROPERTY_PROCESSOR_FEATURES_DEPRECATED = 0x00060001,
+    HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002,
+    HV_PARTITION_PROPERTY_PROCESSOR_CL_FLUSH_SIZE = 0x00060003,
+    HV_PARTITION_PROPERTY_ENLIGHTENMENT_MODIFICATIONS = 0x00060004,
+    HV_PARTITION_PROPERTY_COMPATIBILITY_VERSION = 0x00060005,
+    HV_PARTITION_PROPERTY_PHYSICAL_ADDRESS_WIDTH = 0x00060006,
+    HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007,
+    HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008,
+    HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009,
+    HV_PARTITION_PROPERTY_PROCESSOR_FEATURES0 = 0x0006000a,
+    HV_PARTITION_PROPERTY_PROCESSOR_FEATURES1 = 0x0006000b,
+
+    /* Guest software properties */
+    HV_PARTITION_PROPERTY_GUEST_OS_ID = 0x00070000,
+
+    /* Nested virtualization properties */
+    HV_PARTITION_PROPERTY_PROCESSOR_VIRTUALIZATION_FEATURES = 0x00080000,
+};
+
+/* HV Map GPA (Guest Physical Address) Flags */
+#define HV_MAP_GPA_PERMISSIONS_NONE 0x0
+#define HV_MAP_GPA_READABLE 0x1
+#define
HV_MAP_GPA_WRITABLE 0x2 +#define HV_MAP_GPA_KERNEL_EXECUTABLE 0x4 +#define HV_MAP_GPA_USER_EXECUTABLE 0x8 +#define HV_MAP_GPA_EXECUTABLE 0xC +#define HV_MAP_GPA_PERMISSIONS_MASK 0xF +#define HV_MAP_GPA_ADJUSTABLE 0x8000 +#define HV_MAP_GPA_NO_ACCESS 0x10000 +#define HV_MAP_GPA_NOT_CACHED 0x200000 +#define HV_MAP_GPA_LARGE_PAGE 0x80000000 + +#define HV_PFN_RNG_PAGEBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */ + +#endif /* HW_HYPERV_HVHDK_MINI_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index cd97bdf..76fa57c 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -49,9 +49,9 @@ #define VIRT_LOWMEM_SIZE 0x10000000 #define VIRT_HIGHMEM_BASE 0x80000000 #define VIRT_GED_EVT_ADDR 0x100e0000 -#define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) -#define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) -#define VIRT_GED_CPUHP_ADDR (VIRT_GED_REG_ADDR + ACPI_GED_REG_COUNT) +#define VIRT_GED_MEM_ADDR QEMU_ALIGN_UP(VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN, 4) +#define VIRT_GED_REG_ADDR QEMU_ALIGN_UP(VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN, 4) +#define VIRT_GED_CPUHP_ADDR QEMU_ALIGN_UP(VIRT_GED_REG_ADDR + ACPI_GED_REG_COUNT, 4) #define COMMAND_LINE_SIZE 512 diff --git a/include/hw/s390x/s390-pci-kvm.h b/include/hw/s390x/s390-pci-kvm.h index 933814a..c33f283 100644 --- a/include/hw/s390x/s390-pci-kvm.h +++ b/include/hw/s390x/s390-pci-kvm.h @@ -14,12 +14,19 @@ #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/s390-pci-inst.h" +#include "system/kvm.h" #ifdef CONFIG_KVM +static inline void s390_pcihost_kvm_realize(void) +{ + kvm_msi_via_irqfd_allowed = kvm_irqfds_enabled(); +} + bool s390_pci_kvm_interp_allowed(void); int s390_pci_kvm_aif_enable(S390PCIBusDevice *pbdev, ZpciFib *fib, bool assist); int s390_pci_kvm_aif_disable(S390PCIBusDevice *pbdev); #else +static inline void s390_pcihost_kvm_realize(void) {} static inline bool s390_pci_kvm_interp_allowed(void) { return false; diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h index ae116d9..31bd812 100644 --- a/include/monitor/hmp.h +++ b/include/monitor/hmp.h @@ -24,6 +24,7 @@ strList *hmp_split_at_comma(const char *str); void hmp_info_name(Monitor *mon, const QDict *qdict); void hmp_info_version(Monitor *mon, const QDict *qdict); void hmp_info_kvm(Monitor *mon, const QDict *qdict); +void hmp_info_mshv(Monitor *mon, const QDict *qdict); void hmp_info_status(Monitor *mon, const QDict *qdict); void hmp_info_uuid(Monitor *mon, const QDict *qdict); void hmp_info_chardev(Monitor *mon, const QDict *qdict); diff --git a/include/system/accel-irq.h b/include/system/accel-irq.h new file mode 100644 index 0000000..671fb7d --- /dev/null +++ b/include/system/accel-irq.h @@ -0,0 +1,37 @@ +/* + * Accelerated irqchip abstraction + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSTEM_ACCEL_IRQ_H +#define SYSTEM_ACCEL_IRQ_H +#include "hw/pci/msi.h" +#include "qemu/osdep.h" +#include "system/kvm.h" +#include "system/mshv.h" + +static inline bool accel_msi_via_irqfd_enabled(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_msi_via_irqfd_enabled(); +} + +static inline bool accel_irqchip_is_split(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_irqchip_is_split(); +} + +int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); +int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev); +void accel_irqchip_commit_route_changes(KVMRouteChange *c); +void accel_irqchip_commit_routes(void); +void accel_irqchip_release_virq(int virq); +int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn, + int virq); +int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq); +#endif diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h index fa9228d..55497ed 100644 --- a/include/system/hw_accel.h +++ b/include/system/hw_accel.h @@ -14,6 +14,7 @@ #include "hw/core/cpu.h" #include "system/kvm.h" #include "system/hvf.h" +#include "system/mshv.h" #include "system/whpx.h" #include "system/nvmm.h" diff --git a/include/system/mshv.h b/include/system/mshv.h new file mode 100644 index 0000000..8b1fc20 --- /dev/null +++ b/include/system/mshv.h @@ -0,0 +1,64 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_H +#define QEMU_MSHV_H + +#include "qemu/osdep.h" +#include "qemu/accel.h" +#include "hw/hyperv/hyperv-proto.h" +#include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "qapi/qapi-types-common.h" +#include "system/memory.h" +#include "accel/accel-ops.h" + +#ifdef COMPILING_PER_TARGET +#ifdef CONFIG_MSHV +#include <linux/mshv.h> +#define CONFIG_MSHV_IS_POSSIBLE +#endif +#else +#define CONFIG_MSHV_IS_POSSIBLE +#endif + +#define MSHV_MAX_MSI_ROUTES 4096 + +#define MSHV_PAGE_SHIFT 12 + +#ifdef CONFIG_MSHV_IS_POSSIBLE +extern bool mshv_allowed; +#define mshv_enabled() (mshv_allowed) +#define mshv_msi_via_irqfd_enabled() mshv_enabled() +#else /* CONFIG_MSHV_IS_POSSIBLE */ +#define mshv_enabled() false +#define mshv_msi_via_irqfd_enabled() mshv_enabled() +#endif + +typedef struct MshvState MshvState; +extern MshvState *mshv_state; + +/* interrupt */ +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_destination_mode, + bool level_triggered); + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev); +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev); +void mshv_irqchip_commit_routes(void); +void mshv_irqchip_release_virq(int virq); +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n, + const EventNotifier *rn, int virq); +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq); + +#endif diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h new file mode 100644 index 0000000..490563c --- /dev/null +++ b/include/system/mshv_int.h @@ -0,0 +1,155 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_INT_H +#define QEMU_MSHV_INT_H + +#define MSHV_MSR_ENTRIES_COUNT 64 + +#define MSHV_MAX_MEM_SLOTS 32 + +typedef struct hyperv_message hv_message; + +typedef struct MshvHvCallArgs { + void *base; + void *input_page; + void *output_page; +} MshvHvCallArgs; + +struct AccelCPUState { + int cpufd; + bool dirty; + MshvHvCallArgs hvcall_args; +}; + +typedef struct MshvMemoryListener { + MemoryListener listener; + int as_id; +} MshvMemoryListener; + +typedef struct MshvAddressSpace { + MshvMemoryListener *ml; + AddressSpace *as; +} MshvAddressSpace; + +typedef struct MshvMemorySlotManager { + size_t n_slots; + GList *slots; + QemuMutex mutex; +} MshvMemorySlotManager; + +struct MshvState { + AccelState parent_obj; + int vm; + MshvMemoryListener memory_listener; + /* number of listeners */ + int nr_as; + MshvAddressSpace *as; + int fd; + MshvMemorySlotManager msm; +}; + +typedef struct MshvMsiControl { + bool updated; + GHashTable *gsi_routes; +} MshvMsiControl; + +#define mshv_vcpufd(cpu) (cpu->accel->cpufd) + +/* cpu */ +typedef struct MshvFPU { + uint8_t fpr[8][16]; + uint16_t fcw; + uint16_t fsw; + uint8_t ftwx; + uint8_t pad1; + uint16_t last_opcode; + uint64_t last_ip; + uint64_t last_dp; + uint8_t xmm[16][16]; + uint32_t mxcsr; + uint32_t pad2; +} MshvFPU; + +typedef enum MshvVmExit { + MshvVmExitIgnore = 0, + MshvVmExitShutdown = 1, + MshvVmExitSpecial = 2, +} MshvVmExit; + +typedef enum MshvRemapResult { + MshvRemapOk = 0, + MshvRemapNoMapping = 1, + MshvRemapNoOverlap = 2, +} MshvRemapResult; + +void mshv_init_mmio_emu(void); +int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); +void mshv_remove_vcpu(int vm_fd, int cpu_fd); +int mshv_configure_vcpu(const CPUState *cpu, const MshvFPU *fpu, uint64_t xcr0); +int mshv_get_standard_regs(CPUState *cpu); +int mshv_get_special_regs(CPUState *cpu); +int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); +int mshv_load_regs(CPUState *cpu); +int mshv_store_regs(CPUState *cpu); +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs); +int mshv_arch_put_registers(const CPUState *cpu); +void mshv_arch_init_vcpu(CPUState *cpu); +void mshv_arch_destroy_vcpu(CPUState *cpu); +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features); +int mshv_arch_post_init_vm(int vm_fd); + +#if defined COMPILING_PER_TARGET && defined CONFIG_MSHV_IS_POSSIBLE +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args); +#endif + +/* memory */ +typedef struct MshvMemorySlot { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + bool readonly; + bool mapped; +} MshvMemorySlot; + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa); +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch); +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode); +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add); +void mshv_init_memory_slot_manager(MshvState *mshv_state); + +/* msr */ +typedef struct MshvMsrEntry { + uint32_t index; + uint32_t reserved; + uint64_t data; +} MshvMsrEntry; + +typedef struct MshvMsrEntries { + MshvMsrEntry 
entries[MSHV_MSR_ENTRIES_COUNT]; + uint32_t nmsrs; +} MshvMsrEntries; + +int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs, + size_t n_msrs); + +/* interrupt */ +void mshv_init_msicontrol(void); +int mshv_reserve_ioapic_msi_routes(int vm_fd); + +#endif diff --git a/linux-headers/linux/mshv.h b/linux-headers/linux/mshv.h new file mode 100644 index 0000000..5bc83db --- /dev/null +++ b/linux-headers/linux/mshv.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * This file is divided into sections containing data structures and IOCTLs for + * a particular set of related devices or derived file descriptors. + * + * The IOCTL definitions are at the end of each section. They are grouped by + * device/fd, so that new IOCTLs can easily be added with a monotonically + * increasing number. + */ +#ifndef _LINUX_MSHV_H +#define _LINUX_MSHV_H + +#include <linux/types.h> + +#define MSHV_IOCTL 0xB8 + +/* + ******************************************* + * Entry point to main VMM APIs: /dev/mshv * + ******************************************* + */ + +enum { + MSHV_PT_BIT_LAPIC, + MSHV_PT_BIT_X2APIC, + MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_COUNT, +}; + +#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1) + +enum { + MSHV_PT_ISOLATION_NONE, + MSHV_PT_ISOLATION_COUNT, +}; + +/** + * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION + * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* + * @pt_isolation: MSHV_PT_ISOLATION_* + * + * Returns a file descriptor to act as a handle to a guest partition. + * At this point the partition is not yet initialized in the hypervisor. + * Some operations must be done with the partition in this state, e.g. setting + * so-called "early" partition properties. The partition can then be + * initialized with MSHV_INITIALIZE_PARTITION. + */ +struct mshv_create_partition { + __u64 pt_flags; + __u64 pt_isolation; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) + +/* + ************************ + * Child partition APIs * + ************************ + */ + +struct mshv_create_vp { + __u32 vp_index; +}; + +enum { + MSHV_SET_MEM_BIT_WRITABLE, + MSHV_SET_MEM_BIT_EXECUTABLE, + MSHV_SET_MEM_BIT_UNMAP, + MSHV_SET_MEM_BIT_COUNT +}; + +#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1) + +/* The hypervisor's "native" page size */ +#define MSHV_HV_PAGE_SIZE 0x1000 + +/** + * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY + * @size: Size of the memory region (bytes). Must be aligned to + * MSHV_HV_PAGE_SIZE + * @guest_pfn: Base guest page number to map + * @userspace_addr: Base address of userspace memory. Must be aligned to + * MSHV_HV_PAGE_SIZE + * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP) + * is set, ignore other bits. + * @rsvd: MBZ + * + * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). + * Mappings can't overlap in GPA space or userspace. + * To unmap, these fields must match an existing mapping. 
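+ * For example, a region previously mapped with size=0x1000 and guest_pfn=0x100
+ * can only be removed by passing those same values again with
+ * (1 << MSHV_SET_MEM_BIT_UNMAP) set in @flags.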
+ */ +struct mshv_user_mem_region { + __u64 size; + __u64 guest_pfn; + __u64 userspace_addr; + __u8 flags; + __u8 rsvd[7]; +}; + +enum { + MSHV_IRQFD_BIT_DEASSIGN, + MSHV_IRQFD_BIT_RESAMPLE, + MSHV_IRQFD_BIT_COUNT, +}; + +#define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1) + +struct mshv_user_irqfd { + __s32 fd; + __s32 resamplefd; + __u32 gsi; + __u32 flags; +}; + +enum { + MSHV_IOEVENTFD_BIT_DATAMATCH, + MSHV_IOEVENTFD_BIT_PIO, + MSHV_IOEVENTFD_BIT_DEASSIGN, + MSHV_IOEVENTFD_BIT_COUNT, +}; + +#define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1) + +struct mshv_user_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 rsvd[4]; +}; + +struct mshv_user_irq_entry { + __u32 gsi; + __u32 address_lo; + __u32 address_hi; + __u32 data; +}; + +struct mshv_user_irq_table { + __u32 nr; + __u32 rsvd; /* MBZ */ + struct mshv_user_irq_entry entries[]; +}; + +enum { + MSHV_GPAP_ACCESS_TYPE_ACCESSED, + MSHV_GPAP_ACCESS_TYPE_DIRTY, + MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */ +}; + +enum { + MSHV_GPAP_ACCESS_OP_NOOP, + MSHV_GPAP_ACCESS_OP_CLEAR, + MSHV_GPAP_ACCESS_OP_SET, + MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */ +}; + +/** + * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP + * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the + * bitmap + * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all + * the access states in the range, after retrieving the current + * states. + * @rsvd: MBZ + * @page_count: Number of pages + * @gpap_base: Base gpa page number + * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes + * + * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest + * memory, and optionally clear or set the bits. + */ +struct mshv_gpap_access_bitmap { + __u8 access_type; + __u8 access_op; + __u8 rsvd[6]; + __u64 page_count; + __u64 gpap_base; + __u64 bitmap_ptr; +}; + +/** + * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL + * @code: Hypercall code (HVCALL_*) + * @reps: in: Rep count ('repcount') + * out: Reps completed ('repcomp'). MBZ unless rep hvcall + * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE + * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0 + * @status: in: MBZ + * out: HV_STATUS_* from hypercall + * @rsvd: MBZ + * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or + * vp fd, partition id field is populated by kernel. 
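+ *          (callers going through a partition or vp fd may therefore
+ *          leave that field as zero)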
+ * @out_ptr: Output data buffer (optional) + */ +struct mshv_root_hvcall { + __u16 code; + __u16 reps; + __u16 in_sz; + __u16 out_sz; + __u16 status; + __u8 rsvd[6]; + __u64 in_ptr; + __u64 out_ptr; +}; + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) +#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap) +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +#define MSHV_RUN_VP_BUF_SZ 256 + +/* + * VP state pages may be mapped to userspace via mmap(). + * To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by + * the system page size. + * e.g. + * long page_size = sysconf(_SC_PAGE_SIZE); + * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE, + * MAP_SHARED, vp_fd, + * MSHV_VP_MMAP_OFFSET_REGISTERS * page_size); + */ +enum { + MSHV_VP_MMAP_OFFSET_REGISTERS, + MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE, + MSHV_VP_MMAP_OFFSET_GHCB, + MSHV_VP_MMAP_OFFSET_COUNT +}; + +/** + * struct mshv_run_vp - argument for MSHV_RUN_VP + * @msg_buf: On success, the intercept message is copied here. It can be + * interpreted using the relevant hypervisor definitions. + */ +struct mshv_run_vp { + __u8 msg_buf[MSHV_RUN_VP_BUF_SZ]; +}; + +enum { + MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */ + MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */ + MSHV_VP_STATE_SIMP, + MSHV_VP_STATE_SIEFP, + MSHV_VP_STATE_SYNTHETIC_TIMERS, + MSHV_VP_STATE_COUNT, +}; + +/** + * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE + * @type: MSHV_VP_STATE_* + * @rsvd: MBZ + * @buf_sz: in: 4k page-aligned size of buffer + * out: Actual size of data (on EINVAL, check this to see if buffer + * was too small) + * @buf_ptr: 4k page-aligned data buffer + */ +struct mshv_get_set_vp_state { + __u8 type; + __u8 rsvd[3]; + __u32 buf_sz; + __u64 buf_ptr; +}; + +/* VP fds created with MSHV_CREATE_VP */ +#define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp) +#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state) +#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state) +/* + * Generic hypercall + * Defined above in partition IOCTLs, avoid redefining it here + * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + */ + +#endif diff --git a/linux-user/aarch64/cpu_loop.c b/linux-user/aarch64/cpu_loop.c index 50a4c99..7f66a87 100644 --- a/linux-user/aarch64/cpu_loop.c +++ b/linux-user/aarch64/cpu_loop.c @@ -89,6 +89,11 @@ static void signal_for_exception(CPUARMState *env, vaddr addr) si_code = TARGET_ILL_ILLOPN; break; + case EC_GCS: + si_signo = TARGET_SIGSEGV; + si_code = TARGET_SEGV_CPERR; + break; + case EC_MOP: /* * FIXME: The kernel fixes up wrong-option exceptions. 
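For orientation, the ioctl flow implied by the new linux-headers/linux/mshv.h uapi above is roughly: open /dev/mshv, create and initialize a partition, map guest memory, create a vCPU fd, then run it until the first intercept. The sketch below is illustrative only and is not part of the patch: the helper name is invented, all error handling is omitted, and it assumes the fd-returning behaviour that the header's own comments describe for MSHV_CREATE_PARTITION and MSHV_CREATE_VP.

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/mshv.h>

/* Hypothetical helper, not part of the patch: error handling omitted. */
static int run_tiny_partition(void)
{
    /* /dev/mshv is the entry point; this ioctl returns a partition fd. */
    int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
    struct mshv_create_partition part = {
        .pt_flags = (1 << MSHV_PT_BIT_LAPIC) | (1 << MSHV_PT_BIT_X2APIC),
        .pt_isolation = MSHV_PT_ISOLATION_NONE,
    };
    int pt_fd = ioctl(mshv, MSHV_CREATE_PARTITION, &part);

    /* "Early" partition properties would be set here, then: */
    ioctl(pt_fd, MSHV_INITIALIZE_PARTITION);

    /* Back one page of guest memory at GPA 0 with anonymous host memory. */
    void *mem = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    struct mshv_user_mem_region region = {
        .size = MSHV_HV_PAGE_SIZE,
        .guest_pfn = 0,
        .userspace_addr = (uintptr_t)mem,
        .flags = (1 << MSHV_SET_MEM_BIT_WRITABLE) |
                 (1 << MSHV_SET_MEM_BIT_EXECUTABLE),
    };
    ioctl(pt_fd, MSHV_SET_GUEST_MEMORY, &region);

    /* Each virtual processor is a fd of its own. */
    struct mshv_create_vp vp = { .vp_index = 0 };
    int vp_fd = ioctl(pt_fd, MSHV_CREATE_VP, &vp);

    /*
     * Blocks until the first intercept; msg_buf then holds the intercept
     * message (a struct hv_message in hvgdk_mini.h terms).
     */
    struct mshv_run_vp run;
    memset(&run, 0, sizeof(run));
    ioctl(vp_fd, MSHV_RUN_VP, &run);
    return 0;
}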
diff --git a/linux-user/aarch64/elfload.c b/linux-user/aarch64/elfload.c index 77d03b5..3af5a37 100644 --- a/linux-user/aarch64/elfload.c +++ b/linux-user/aarch64/elfload.c @@ -169,6 +169,7 @@ abi_ulong get_elf_hwcap(CPUState *cs) GET_FEATURE_ID(aa64_dcpop, ARM_HWCAP_A64_DCPOP); GET_FEATURE_ID(aa64_rcpc_8_3, ARM_HWCAP_A64_LRCPC); GET_FEATURE_ID(aa64_rcpc_8_4, ARM_HWCAP_A64_ILRCPC); + GET_FEATURE_ID(aa64_gcs, ARM_HWCAP_A64_GCS); return hwcaps; } diff --git a/linux-user/aarch64/gcs-internal.h b/linux-user/aarch64/gcs-internal.h new file mode 100644 index 0000000..e586c7e --- /dev/null +++ b/linux-user/aarch64/gcs-internal.h @@ -0,0 +1,38 @@ +/* + * AArch64 gcs functions for linux-user + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#ifndef AARCH64_GCS_INTERNAL_H +#define AARCH64_GCS_INTERNAL_H + +#ifndef PR_SHADOW_STACK_ENABLE +# define PR_SHADOW_STACK_ENABLE (1U << 0) +# define PR_SHADOW_STACK_WRITE (1U << 1) +# define PR_SHADOW_STACK_PUSH (1U << 2) +#endif + +static inline uint64_t gcs_get_el0_mode(CPUArchState *env) +{ + uint64_t cr = env->cp15.gcscr_el[0]; + abi_ulong flags = 0; + + flags |= cr & GCSCR_PCRSEL ? PR_SHADOW_STACK_ENABLE : 0; + flags |= cr & GCSCR_STREN ? PR_SHADOW_STACK_WRITE : 0; + flags |= cr & GCSCR_PUSHMEN ? PR_SHADOW_STACK_PUSH : 0; + + return flags; +} + +static inline void gcs_set_el0_mode(CPUArchState *env, uint64_t flags) +{ + uint64_t cr = GCSCRE0_NTR; + + cr |= flags & PR_SHADOW_STACK_ENABLE ? GCSCR_RVCHKEN | GCSCR_PCRSEL : 0; + cr |= flags & PR_SHADOW_STACK_WRITE ? GCSCR_STREN : 0; + cr |= flags & PR_SHADOW_STACK_PUSH ? GCSCR_PUSHMEN : 0; + + env->cp15.gcscr_el[0] = cr; +} + +#endif diff --git a/linux-user/aarch64/signal.c b/linux-user/aarch64/signal.c index ef97be3..f7edfa2 100644 --- a/linux-user/aarch64/signal.c +++ b/linux-user/aarch64/signal.c @@ -22,6 +22,7 @@ #include "signal-common.h" #include "linux-user/trace.h" #include "target/arm/cpu-features.h" +#include "gcs-internal.h" struct target_sigcontext { uint64_t fault_address; @@ -152,6 +153,16 @@ struct target_zt_context { QEMU_BUILD_BUG_ON(TARGET_ZT_SIG_REG_BYTES != \ sizeof_field(CPUARMState, za_state.zt0)); +#define TARGET_GCS_MAGIC 0x47435300 +#define GCS_SIGNAL_CAP(X) ((X) & TARGET_PAGE_MASK) + +struct target_gcs_context { + struct target_aarch64_ctx head; + uint64_t gcspr; + uint64_t features_enabled; + uint64_t reserved; +}; + struct target_rt_sigframe { struct target_siginfo info; struct target_ucontext uc; @@ -322,6 +333,35 @@ static void target_setup_zt_record(struct target_zt_context *zt, } } +static bool target_setup_gcs_record(struct target_gcs_context *ctx, + CPUARMState *env, uint64_t return_addr) +{ + uint64_t mode = gcs_get_el0_mode(env); + uint64_t gcspr = env->cp15.gcspr_el[0]; + + if (mode & PR_SHADOW_STACK_ENABLE) { + /* Push a cap for the signal frame. */ + gcspr -= 8; + if (put_user_u64(GCS_SIGNAL_CAP(gcspr), gcspr)) { + return false; + } + + /* Push a gcs entry for the trampoline. 
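+         * The handler's eventual GCS-checked return to the trampoline
+         * is validated against this entry.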
*/ + if (put_user_u64(return_addr, gcspr - 8)) { + return false; + } + env->cp15.gcspr_el[0] = gcspr - 8; + } + + __put_user(TARGET_GCS_MAGIC, &ctx->head.magic); + __put_user(sizeof(*ctx), &ctx->head.size); + __put_user(gcspr, &ctx->gcspr); + __put_user(mode, &ctx->features_enabled); + __put_user(0, &ctx->reserved); + + return true; +} + static void target_restore_general_frame(CPUARMState *env, struct target_rt_sigframe *sf) { @@ -502,6 +542,64 @@ static bool target_restore_zt_record(CPUARMState *env, return true; } +static bool target_restore_gcs_record(CPUARMState *env, + struct target_gcs_context *ctx, + bool *rebuild_hflags) +{ + TaskState *ts = get_task_state(env_cpu(env)); + uint64_t cur_mode = gcs_get_el0_mode(env); + uint64_t new_mode, gcspr; + + __get_user(new_mode, &ctx->features_enabled); + __get_user(gcspr, &ctx->gcspr); + + /* + * The kernel pushes the value through the hw register: + * write_sysreg_s(gcspr, SYS_GCSPR_EL0) in restore_gcs_context, + * then read_sysreg_s(SYS_GCSPR_EL0) in gcs_restore_signal. + * Since the bottom 3 bits are RES0, this can (CONSTRAINED UNPREDICTABLE) + * force align the value. Mirror the choice from gcspr_write(). + */ + gcspr &= ~7; + + if (new_mode & ~(PR_SHADOW_STACK_ENABLE | + PR_SHADOW_STACK_WRITE | + PR_SHADOW_STACK_PUSH)) { + return false; + } + if ((new_mode ^ cur_mode) & ts->gcs_el0_locked) { + return false; + } + if (new_mode & ~cur_mode & PR_SHADOW_STACK_ENABLE) { + return false; + } + + if (new_mode & PR_SHADOW_STACK_ENABLE) { + uint64_t cap; + + /* Pop and clear the signal cap. */ + if (get_user_u64(cap, gcspr)) { + return false; + } + if (cap != GCS_SIGNAL_CAP(gcspr)) { + return false; + } + if (put_user_u64(0, gcspr)) { + return false; + } + gcspr += 8; + } else { + new_mode = 0; + } + + env->cp15.gcspr_el[0] = gcspr; + if (new_mode != cur_mode) { + *rebuild_hflags = true; + gcs_set_el0_mode(env, new_mode); + } + return true; +} + static int target_restore_sigframe(CPUARMState *env, struct target_rt_sigframe *sf) { @@ -511,8 +609,10 @@ static int target_restore_sigframe(CPUARMState *env, struct target_za_context *za = NULL; struct target_tpidr2_context *tpidr2 = NULL; struct target_zt_context *zt = NULL; + struct target_gcs_context *gcs = NULL; uint64_t extra_datap = 0; bool used_extra = false; + bool rebuild_hflags = false; int sve_size = 0; int za_size = 0; int zt_size = 0; @@ -582,6 +682,15 @@ static int target_restore_sigframe(CPUARMState *env, zt_size = size; break; + case TARGET_GCS_MAGIC: + if (gcs + || size != sizeof(struct target_gcs_context) + || !cpu_isar_feature(aa64_gcs, env_archcpu(env))) { + goto err; + } + gcs = (struct target_gcs_context *)ctx; + break; + case TARGET_EXTRA_MAGIC: if (extra || size != sizeof(struct target_extra_context)) { goto err; @@ -612,6 +721,10 @@ static int target_restore_sigframe(CPUARMState *env, goto err; } + if (gcs && !target_restore_gcs_record(env, gcs, &rebuild_hflags)) { + goto err; + } + /* SVE data, if present, overwrites FPSIMD data. 
*/ if (sve && !target_restore_sve_record(env, sve, sve_size, &svcr)) { goto err; @@ -631,6 +744,9 @@ static int target_restore_sigframe(CPUARMState *env, } if (env->svcr != svcr) { env->svcr = svcr; + rebuild_hflags = true; + } + if (rebuild_hflags) { arm_rebuild_hflags(env); } unlock_user(extra, extra_datap, 0); @@ -701,7 +817,7 @@ static void target_setup_frame(int usig, struct target_sigaction *ka, uc.tuc_mcontext.__reserved), }; int fpsimd_ofs, fr_ofs, sve_ofs = 0, za_ofs = 0, tpidr2_ofs = 0; - int zt_ofs = 0, esr_ofs = 0; + int zt_ofs = 0, esr_ofs = 0, gcs_ofs = 0; int sve_size = 0, za_size = 0, tpidr2_size = 0, zt_size = 0; struct target_rt_sigframe *frame; struct target_rt_frame_record *fr; @@ -720,6 +836,11 @@ static void target_setup_frame(int usig, struct target_sigaction *ka, &layout); } + if (env->cp15.gcspr_el[0]) { + gcs_ofs = alloc_sigframe_space(sizeof(struct target_gcs_context), + &layout); + } + /* SVE state needs saving only if it exists. */ if (cpu_isar_feature(aa64_sve, env_archcpu(env)) || cpu_isar_feature(aa64_sme, env_archcpu(env))) { @@ -779,6 +900,12 @@ static void target_setup_frame(int usig, struct target_sigaction *ka, goto give_sigsegv; } + if (ka->sa_flags & TARGET_SA_RESTORER) { + return_addr = ka->sa_restorer; + } else { + return_addr = default_rt_sigreturn; + } + target_setup_general_frame(frame, env, set); target_setup_fpsimd_record((void *)frame + fpsimd_ofs, env); if (esr_ofs) { @@ -786,6 +913,10 @@ static void target_setup_frame(int usig, struct target_sigaction *ka, /* Leave ESR_EL1 clear while it's not relevant. */ env->cp15.esr_el[1] = 0; } + if (gcs_ofs && + !target_setup_gcs_record((void *)frame + gcs_ofs, env, return_addr)) { + goto give_sigsegv; + } target_setup_end_record((void *)frame + layout.std_end_ofs); if (layout.extra_ofs) { target_setup_extra_record((void *)frame + layout.extra_ofs, @@ -811,11 +942,6 @@ static void target_setup_frame(int usig, struct target_sigaction *ka, __put_user(env->xregs[29], &fr->fp); __put_user(env->xregs[30], &fr->lr); - if (ka->sa_flags & TARGET_SA_RESTORER) { - return_addr = ka->sa_restorer; - } else { - return_addr = default_rt_sigreturn; - } env->xregs[0] = usig; env->xregs[29] = frame_addr + fr_ofs; env->xregs[30] = return_addr; diff --git a/linux-user/aarch64/target_prctl.h b/linux-user/aarch64/target_prctl.h index ed75b9e..621be57 100644 --- a/linux-user/aarch64/target_prctl.h +++ b/linux-user/aarch64/target_prctl.h @@ -6,8 +6,10 @@ #ifndef AARCH64_TARGET_PRCTL_H #define AARCH64_TARGET_PRCTL_H +#include "qemu/units.h" #include "target/arm/cpu-features.h" #include "mte_user_helper.h" +#include "gcs-internal.h" static abi_long do_prctl_sve_get_vl(CPUArchState *env) { @@ -206,4 +208,98 @@ static abi_long do_prctl_get_tagged_addr_ctrl(CPUArchState *env) } #define do_prctl_get_tagged_addr_ctrl do_prctl_get_tagged_addr_ctrl +static abi_long do_prctl_get_shadow_stack_status(CPUArchState *env, + abi_long arg2) +{ + ARMCPU *cpu = env_archcpu(env); + + if (!cpu_isar_feature(aa64_gcs, cpu)) { + return -TARGET_EINVAL; + } + return put_user_ual(gcs_get_el0_mode(env), arg2); +} +#define do_prctl_get_shadow_stack_status do_prctl_get_shadow_stack_status + +static abi_long gcs_alloc(abi_ulong hint, abi_ulong size) +{ + /* + * Without softmmu, we cannot protect GCS memory properly. + * Make do with normal read/write permissions. This at least allows + * emulation of correct programs which don't access the gcs stack + * with normal instructions. 
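+     * (Proper protection would need a GCS-specific page permission, which
+     * host mmap cannot express: normal stores to a GCS page are supposed
+     * to fault while GCS instructions succeed.)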
+ */ + return target_mmap(hint, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | + (hint ? MAP_FIXED_NOREPLACE : 0), -1, 0); +} + +static abi_ulong gcs_new_stack(TaskState *ts) +{ + /* Use guest_stack_size as a proxy for RLIMIT_STACK. */ + abi_ulong size = MIN(MAX(guest_stack_size / 2, TARGET_PAGE_SIZE), 2 * GiB); + abi_ulong base = gcs_alloc(0, size); + + if (base == -1) { + return -1; + } + + ts->gcs_base = base; + ts->gcs_size = size; + return base + size - 8; +} + +static abi_long do_prctl_set_shadow_stack_status(CPUArchState *env, + abi_long new_mode) +{ + ARMCPU *cpu = env_archcpu(env); + TaskState *ts = get_task_state(env_cpu(env)); + abi_long cur_mode; + + if (!cpu_isar_feature(aa64_gcs, cpu)) { + return -TARGET_EINVAL; + } + if (new_mode & ~(PR_SHADOW_STACK_ENABLE | + PR_SHADOW_STACK_WRITE | + PR_SHADOW_STACK_PUSH)) { + return -TARGET_EINVAL; + } + + cur_mode = gcs_get_el0_mode(env); + if ((new_mode ^ cur_mode) & ts->gcs_el0_locked) { + return -TARGET_EBUSY; + } + + if (new_mode & ~cur_mode & PR_SHADOW_STACK_ENABLE) { + abi_long gcspr; + + if (ts->gcs_base || env->cp15.gcspr_el[0]) { + return -EINVAL; + } + gcspr = gcs_new_stack(ts); + if (gcspr == -1) { + return -TARGET_ENOMEM; + } + env->cp15.gcspr_el[0] = gcspr; + } + + gcs_set_el0_mode(env, new_mode); + arm_rebuild_hflags(env); + return 0; +} +#define do_prctl_set_shadow_stack_status do_prctl_set_shadow_stack_status + +static abi_long do_prctl_lock_shadow_stack_status(CPUArchState *env, + abi_long arg2) +{ + ARMCPU *cpu = env_archcpu(env); + TaskState *ts = get_task_state(env_cpu(env)); + + if (!cpu_isar_feature(aa64_gcs, cpu)) { + return -EINVAL; + } + ts->gcs_el0_locked |= arg2; + return 0; +} +#define do_prctl_lock_shadow_stack_status do_prctl_lock_shadow_stack_status + #endif /* AARCH64_TARGET_PRCTL_H */ diff --git a/linux-user/aarch64/target_signal.h b/linux-user/aarch64/target_signal.h index 6f66a50..e509ac1 100644 --- a/linux-user/aarch64/target_signal.h +++ b/linux-user/aarch64/target_signal.h @@ -7,6 +7,7 @@ #define TARGET_SEGV_MTEAERR 8 /* Asynchronous ARM MTE error */ #define TARGET_SEGV_MTESERR 9 /* Synchronous ARM MTE exception */ +#define TARGET_SEGV_CPERR 10 /* Control protection fault */ #define TARGET_ARCH_HAS_SETUP_FRAME #define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1 diff --git a/linux-user/qemu.h b/linux-user/qemu.h index cabb7bd..85e68ef 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -122,6 +122,11 @@ struct TaskState { #ifdef TARGET_M68K abi_ulong tp_value; #endif +#if defined(TARGET_AARCH64) + vaddr gcs_base; + abi_ulong gcs_size; + abi_ulong gcs_el0_locked; +#endif int used; /* non zero if used */ struct image_info *info; struct linux_binprm *bprm; diff --git a/linux-user/syscall.c b/linux-user/syscall.c index d78b202..8546f48 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -6353,6 +6353,17 @@ abi_long do_arch_prctl(CPUX86State *env, int code, abi_ulong addr) # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) #endif +#ifndef PR_GET_SHADOW_STACK_STATUS +# define PR_GET_SHADOW_STACK_STATUS 74 +# define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_LOCK_SHADOW_STACK_STATUS 76 +#endif +#ifndef SHADOW_STACK_SET_TOKEN +# define SHADOW_STACK_SET_TOKEN (1u << 0) +#endif +#ifndef SHADOW_STACK_SET_MARKER +# define SHADOW_STACK_SET_MARKER (1u << 1) +#endif #include "target_prctl.h" @@ -6399,6 +6410,15 @@ static abi_long do_prctl_inval1(CPUArchState *env, abi_long arg2) #ifndef do_prctl_sme_set_vl #define do_prctl_sme_set_vl do_prctl_inval1 #endif +#ifndef 
do_prctl_get_shadow_stack_status +#define do_prctl_get_shadow_stack_status do_prctl_inval1 +#endif +#ifndef do_prctl_set_shadow_stack_status +#define do_prctl_set_shadow_stack_status do_prctl_inval1 +#endif +#ifndef do_prctl_lock_shadow_stack_status +#define do_prctl_lock_shadow_stack_status do_prctl_inval1 +#endif static abi_long do_prctl_syscall_user_dispatch(CPUArchState *env, abi_ulong arg2, abi_ulong arg3, @@ -6499,6 +6519,21 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2, return -TARGET_EINVAL; } return do_prctl_get_tagged_addr_ctrl(env); + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) { + return -TARGET_EINVAL; + } + return do_prctl_get_shadow_stack_status(env, arg2); + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) { + return -TARGET_EINVAL; + } + return do_prctl_set_shadow_stack_status(env, arg2); + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) { + return -TARGET_EINVAL; + } + return do_prctl_lock_shadow_stack_status(env, arg2); case PR_GET_UNALIGN: return do_prctl_get_unalign(env, arg2); @@ -6576,6 +6611,54 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2, } } +#ifdef TARGET_AARCH64 +static abi_long do_map_shadow_stack(CPUArchState *env, abi_ulong addr, + abi_ulong size, abi_int flags) +{ + ARMCPU *cpu = env_archcpu(env); + abi_ulong alloc_size; + + if (!cpu_isar_feature(aa64_gcs, cpu)) { + return -TARGET_EOPNOTSUPP; + } + if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER)) { + return -TARGET_EINVAL; + } + if (addr & ~TARGET_PAGE_MASK) { + return -TARGET_EINVAL; + } + if (size == 8 || !QEMU_IS_ALIGNED(size, 8)) { + return -TARGET_EINVAL; + } + + alloc_size = TARGET_PAGE_ALIGN(size); + if (alloc_size < size) { + return -TARGET_EOVERFLOW; + } + + mmap_lock(); + addr = gcs_alloc(addr, alloc_size); + if (addr != -1) { + if (flags & SHADOW_STACK_SET_TOKEN) { + abi_ptr cap_ptr = addr + size - 8; + uint64_t cap_val; + + if (flags & SHADOW_STACK_SET_MARKER) { + /* Leave an extra empty frame at top-of-stack. */ + cap_ptr -= 8; + } + cap_val = (cap_ptr & TARGET_PAGE_MASK) | 1; + if (put_user_u64(cap_val, cap_ptr)) { + /* Allocation succeeded above. */ + g_assert_not_reached(); + } + } + } + mmap_unlock(); + return get_errno(addr); +} +#endif + #define NEW_STACK_SIZE 0x40000 @@ -6657,6 +6740,21 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp, ts = g_new0(TaskState, 1); init_task_state(ts); +#ifdef TARGET_AARCH64 + /* + * If GCS is enabled in the parent thread, it is also enabled + * in the child thread, but with a newly allocated stack. + */ + abi_long new_gcspr = 0; + if (env->cp15.gcscr_el[0] & GCSCR_PCRSEL) { + new_gcspr = gcs_new_stack(ts); + if (new_gcspr == -1) { + g_free(ts); + return -TARGET_ENOMEM; + } + } +#endif + /* Grab a mutex so that thread setup appears atomic. 
*/
     pthread_mutex_lock(&clone_lock);
@@ -6678,6 +6776,11 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,
         ts->info = parent_ts->info;
         ts->signal_mask = parent_ts->signal_mask;
+#ifdef TARGET_AARCH64
+        ts->gcs_el0_locked = parent_ts->gcs_el0_locked;
+        new_env->cp15.gcspr_el[0] = new_gcspr;
+#endif
+
         if (flags & CLONE_CHILD_CLEARTID) {
             ts->child_tidptr = child_tidptr;
         }
@@ -9380,6 +9483,12 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
                           FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
         }
+#ifdef TARGET_AARCH64
+        if (ts->gcs_base) {
+            target_munmap(ts->gcs_base, ts->gcs_size);
+        }
+#endif
+
         object_unparent(OBJECT(cpu));
         object_unref(OBJECT(cpu));
         /*
@@ -14010,6 +14119,11 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
         return do_riscv_hwprobe(cpu_env, arg1, arg2, arg3, arg4, arg5);
 #endif
+#ifdef TARGET_AARCH64
+    case TARGET_NR_map_shadow_stack:
+        return do_map_shadow_stack(cpu_env, arg1, arg2, arg3);
+#endif
+
     default:
         qemu_log_mask(LOG_UNIMP, "Unsupported syscall: %d\n", num);
         return -TARGET_ENOSYS;
diff --git a/meson.build b/meson.build
index 55c8202..afaefa0 100644
--- a/meson.build
+++ b/meson.build
@@ -334,6 +334,7 @@ elif cpu == 'x86_64'
     'CONFIG_HVF': ['x86_64-softmmu'],
     'CONFIG_NVMM': ['i386-softmmu', 'x86_64-softmmu'],
     'CONFIG_WHPX': ['i386-softmmu', 'x86_64-softmmu'],
+    'CONFIG_MSHV': ['x86_64-softmmu'],
   }
 endif
@@ -883,6 +884,14 @@ accelerators = []
 if get_option('kvm').allowed() and host_os == 'linux'
   accelerators += 'CONFIG_KVM'
 endif
+
+if get_option('mshv').allowed() and host_os == 'linux'
+  if get_option('mshv').enabled() and host_machine.cpu() != 'x86_64'
+    error('mshv accelerator requires x86_64 host')
+  endif
+  accelerators += 'CONFIG_MSHV'
+endif
+
 if get_option('whpx').allowed() and host_os == 'windows'
   if get_option('whpx').enabled() and host_machine.cpu() != 'x86_64'
     error('WHPX requires 64-bit host')
@@ -952,6 +961,9 @@ endif
 if 'CONFIG_WHPX' not in accelerators and get_option('whpx').enabled()
   error('WHPX not available on this platform')
 endif
+if 'CONFIG_MSHV' not in accelerators and get_option('mshv').enabled()
+  error('mshv not available on this platform')
+endif
 
 xen = not_found
 if get_option('xen').enabled() or (get_option('xen').auto() and have_system)
@@ -3656,6 +3668,7 @@ if have_system
   trace_events_subdirs += [
     'accel/hvf',
     'accel/kvm',
+    'accel/mshv',
     'audio',
     'backends',
     'backends/tpm',
@@ -4239,6 +4252,7 @@ if have_rust
     '--no-prepend-enum-name',
     '--allowlist-file', meson.project_source_root() + '/include/.*',
     '--allowlist-file', meson.project_build_root() + '/.*',
+    '--blocklist-file', glib_pc.get_variable('includedir') + '/glib-2.0/.*',
   ]
   if not rustfmt.found()
     if bindgen.version().version_compare('<0.65.0')
@@ -4826,6 +4840,7 @@ if have_system
   summary_info += {'HVF support': config_all_accel.has_key('CONFIG_HVF')}
   summary_info += {'WHPX support': config_all_accel.has_key('CONFIG_WHPX')}
   summary_info += {'NVMM support': config_all_accel.has_key('CONFIG_NVMM')}
+  summary_info += {'MSHV support': config_all_accel.has_key('CONFIG_MSHV')}
   summary_info += {'Xen support': xen.found()}
   if xen.found()
     summary_info += {'xen ctrl version': xen.version()}
diff --git a/meson_options.txt b/meson_options.txt
index fff1521..2836156 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -36,6 +36,8 @@ option('trace_file', type: 'string', value: 'trace',
 option('coroutine_backend', type: 'combo',
        choices: ['ucontext', 'sigaltstack', 'windows', 'wasm', 'auto'],
        value: 'auto', description:
'coroutine backend to use')
+option('gdb', type: 'string', value: '',
+       description: 'Path to GDB')
 
 # Everything else can be set via --enable/--disable-* option
 # on the configure script command line. After adding an option
@@ -71,6 +73,8 @@ option('malloc', type : 'combo', choices : ['system', 'tcmalloc', 'jemalloc'],
 option('kvm', type: 'feature', value: 'auto',
        description: 'KVM acceleration support')
+option('mshv', type: 'feature', value: 'auto',
+       description: 'MSHV acceleration support')
 option('whpx', type: 'feature', value: 'auto',
        description: 'WHPX acceleration support')
 option('hvf', type: 'feature', value: 'auto',
diff --git a/pythondeps.toml b/pythondeps.toml
index 16fb2a9..98e99e7 100644
--- a/pythondeps.toml
+++ b/pythondeps.toml
@@ -33,3 +33,4 @@ sphinx_rtd_theme = { accepted = ">=0.5", installed = "1.2.2" }
 
 [testdeps]
 qemu.qmp = { accepted = ">=0.0.3", installed = "0.0.3" }
+pygdbmi = { accepted = ">=0.11.0.0", installed = "0.11.0.0" }
diff --git a/qapi/accelerator.json b/qapi/accelerator.json
index fb28c8d..664e027 100644
--- a/qapi/accelerator.json
+++ b/qapi/accelerator.json
@@ -54,3 +54,32 @@
 { 'command': 'x-accel-stats',
   'returns': 'HumanReadableText',
   'features': [ 'unstable' ] }
+
+##
+# @MshvInfo:
+#
+# Information about support for MSHV acceleration
+#
+# @enabled: true if MSHV acceleration is active
+#
+# @present: true if MSHV acceleration is built into this executable
+#
+# Since: 10.2.0
+##
+{ 'struct': 'MshvInfo', 'data': {'enabled': 'bool', 'present': 'bool'} }
+
+##
+# @query-mshv:
+#
+# Return information about MSHV acceleration
+#
+# Returns: @MshvInfo
+#
+# Since: 10.2.0
+#
+# .. qmp-example::
+#
+#     -> { "execute": "query-mshv" }
+#     <- { "return": { "enabled": true, "present": true } }
+##
+{ 'command': 'query-mshv', 'returns': 'MshvInfo' }
diff --git a/qemu-options.hx b/qemu-options.hx
index cc2ef44..0223cef 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -28,7 +28,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
     "-machine [type=]name[,prop[=value][,...]]\n"
     "                selects emulated machine ('-machine help' for list)\n"
     "                property accel=accel1[:accel2[:...]] selects accelerator\n"
-    "                supported accelerators are kvm, xen, hvf, nvmm, whpx or tcg (default: tcg)\n"
+    "                supported accelerators are kvm, xen, hvf, nvmm, whpx, mshv or tcg (default: tcg)\n"
     "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
     "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
     "                mem-merge=on|off controls memory merge support (default: on)\n"
@@ -66,10 +66,10 @@ SRST
 
     ``accel=accels1[:accels2[:...]]``
         This is used to enable an accelerator. Depending on the target
-        architecture, kvm, xen, hvf, nvmm, whpx or tcg can be available.
-        By default, tcg is used. If there is more than one accelerator
-        specified, the next one is used if the previous one fails to
-        initialize.
+        architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be
+        available. By default, tcg is used. If there is more than one
+        accelerator specified, the next one is used if the previous one
+        fails to initialize.
 
     ``vmport=on|off|auto``
        Enables emulation of VMWare IO port, for vmmouse etc.
auto says @@ -226,7 +226,7 @@ ERST DEF("accel", HAS_ARG, QEMU_OPTION_accel, "-accel [accel=]accelerator[,prop[=value][,...]]\n" - " select accelerator (kvm, xen, hvf, nvmm, whpx or tcg; use 'help' for a list)\n" + " select accelerator (kvm, xen, hvf, nvmm, whpx, mshv or tcg; use 'help' for a list)\n" " igd-passthru=on|off (enable Xen integrated Intel graphics passthrough, default=off)\n" " kernel-irqchip=on|off|split controls accelerated irqchip support (default=on)\n" " kvm-shadow-mem=size of KVM shadow MMU in bytes\n" @@ -241,8 +241,8 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel, SRST ``-accel name[,prop=value[,...]]`` This is used to enable an accelerator. Depending on the target - architecture, kvm, xen, hvf, nvmm, whpx or tcg can be available. By - default, tcg is used. If there is more than one accelerator + architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be available. + By default, tcg is used. If there is more than one accelerator specified, the next one is used if the previous one fails to initialize. diff --git a/replay/replay.c b/replay/replay.c index a3e24c9..b212178 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -263,6 +263,8 @@ bool replay_has_interrupt(void) void replay_shutdown_request(ShutdownCause cause) { + replay_save_instructions(); + if (replay_mode == REPLAY_MODE_RECORD) { g_assert(replay_mutex_locked()); replay_put_event(EVENT_SHUTDOWN + cause); diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 444ef51..1108513 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -58,15 +58,27 @@ dependencies = [ name = "bql" version = "0.1.0" dependencies = [ + "glib-sys", "migration", ] [[package]] +name = "cfg-expr" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a2c5f3bf25ec225351aa1c8e230d04d880d3bd89dea133537dafad4ae291e5c" +dependencies = [ + "smallvec", + "target-lexicon", +] + +[[package]] name = "chardev" version = "0.1.0" dependencies = [ "bql", "common", + "glib-sys", "migration", "qom", "util", @@ -87,6 +99,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] name = "foreign" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -96,6 +114,28 @@ dependencies = [ ] [[package]] +name = "glib-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d09d3d0fddf7239521674e57b0465dfbd844632fec54f059f7f56112e3f927e1" +dependencies = [ + "libc", + "system-deps", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] name = "hpet" version = "0.1.0" dependencies = [ @@ -115,6 +155,7 @@ dependencies = [ "bql", "chardev", "common", + "glib-sys", "migration", "qemu_macros", "qom", @@ -123,6 +164,16 @@ dependencies = [ ] [[package]] +name = "indexmap" +version = "2.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] name = "itertools" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -138,14 +189,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" [[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] name = "migration" version = "0.1.0" dependencies = [ "common", + "glib-sys", "util", ] [[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] name = "pl011" version = "0.1.0" dependencies = [ @@ -155,6 +219,7 @@ dependencies = [ "bql", "chardev", "common", + "glib-sys", "hwcore", "migration", "qom", @@ -211,6 +276,7 @@ version = "0.1.0" dependencies = [ "bql", "common", + "glib-sys", "migration", "qemu_macros", "util", @@ -226,6 +292,50 @@ dependencies = [ ] [[package]] +name = "serde" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dca6411025b24b60bfa7ec1fe1f8e710ac09782dca409ee8237ba74b51295fd" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba2ba63999edb9dac981fb34b3e5c0d111a69b0924e253ed29d83f7c99e966a4" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8db53ae22f34573731bafa1db20f04027b2d25e02d8205921b569171699cdb33" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] name = "syn" version = "2.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -241,11 +351,31 @@ name = "system" version = "0.1.0" dependencies = [ "common", + "glib-sys", "qom", "util", ] [[package]] +name = "system-deps" +version = "7.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4be53aa0cba896d2dc615bd42bbc130acdcffa239e0a2d965ea5b3b2a86ffdb" +dependencies = [ + "cfg-expr", + "heck", + "pkg-config", + "toml", + "version-compare", +] + +[[package]] +name = "target-lexicon" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" + +[[package]] name = "tests" version = "0.1.0" dependencies = [ @@ -260,6 +390,40 @@ dependencies = [ ] [[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = 
"0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + +[[package]] name = "trace" version = "0.1.0" dependencies = [ @@ -279,11 +443,27 @@ dependencies = [ "anyhow", "common", "foreign", + "glib-sys", "libc", ] [[package]] +name = "version-compare" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" + +[[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "winnow" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +dependencies = [ + "memchr", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index f372d7d..783e626 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -29,6 +29,7 @@ authors = ["The QEMU Project Developers <qemu-devel@nongnu.org>"] anyhow = "~1.0" foreign = "~0.3.1" libc = "0.2.162" +glib-sys = { version = "0.21.2", features = ["v2_66"] } [workspace.lints.rust] unexpected_cfgs = { level = "deny", check-cfg = ['cfg(MESON)'] } diff --git a/rust/bql/Cargo.toml b/rust/bql/Cargo.toml index 1041bd4..d5177e5 100644 --- a/rust/bql/Cargo.toml +++ b/rust/bql/Cargo.toml @@ -14,6 +14,7 @@ rust-version.workspace = true [dependencies] migration = { path = "../migration" } +glib-sys.workspace = true [features] default = ["debug_cell"] diff --git a/rust/bql/meson.build b/rust/bql/meson.build index bc51c7f..22d7c9b 100644 --- a/rust/bql/meson.build +++ b/rust/bql/meson.build @@ -38,6 +38,7 @@ _bql_rs = static_library( rust_abi: 'rust', rust_args: _bql_cfg, link_with: [_migration_rs], + dependencies: [glib_sys_rs], ) bql_rs = declare_dependency(link_with: [_bql_rs], diff --git a/rust/bql/src/bindings.rs b/rust/bql/src/bindings.rs index 9ffff12..8c70f3a 100644 --- a/rust/bql/src/bindings.rs +++ b/rust/bql/src/bindings.rs @@ -18,6 +18,10 @@ clippy::too_many_arguments )] +use glib_sys::{ + guint, GArray, GHashTable, GHashTableIter, GList, GPollFD, GPtrArray, GQueue, GSList, GSource, +}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/chardev/Cargo.toml b/rust/chardev/Cargo.toml index 3e77972..f105189 100644 --- a/rust/chardev/Cargo.toml +++ b/rust/chardev/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys = { workspace = true } common = { path = "../common" } bql = { path = "../bql" } migration = { path = "../migration" } diff --git a/rust/chardev/meson.build b/rust/chardev/meson.build index e7ce02b..d365d8d 100644 --- a/rust/chardev/meson.build +++ b/rust/chardev/meson.build @@ -36,7 +36,7 @@ _chardev_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs, _qom_rs, _util_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [glib_sys_rs, common_rs, qemu_macros], ) chardev_rs = declare_dependency(link_with: [_chardev_rs], 
dependencies: [chardev, qemuutil]) diff --git a/rust/chardev/src/bindings.rs b/rust/chardev/src/bindings.rs index 2d98026..c95dc89 100644 --- a/rust/chardev/src/bindings.rs +++ b/rust/chardev/src/bindings.rs @@ -19,6 +19,10 @@ )] use common::Zeroable; +use glib_sys::{ + gboolean, guint, GArray, GHashTable, GHashTableIter, GIOCondition, GList, GMainContext, + GPollFD, GPtrArray, GQueue, GSList, GSource, GSourceFunc, +}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/hw/char/pl011/Cargo.toml b/rust/hw/char/pl011/Cargo.toml index dc41d0e..5b31945 100644 --- a/rust/hw/char/pl011/Cargo.toml +++ b/rust/hw/char/pl011/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys.workspace = true bilge = { version = "0.2.0" } bilge-impl = { version = "0.2.0" } bits = { path = "../../../bits" } diff --git a/rust/hw/char/pl011/meson.build b/rust/hw/char/pl011/meson.build index 07b3da1..33b91f2 100644 --- a/rust/hw/char/pl011/meson.build +++ b/rust/hw/char/pl011/meson.build @@ -33,6 +33,7 @@ _libpl011_rs = static_library( bilge_impl_rs, bits_rs, common_rs, + glib_sys_rs, util_rs, migration_rs, bql_rs, diff --git a/rust/hw/char/pl011/src/bindings.rs b/rust/hw/char/pl011/src/bindings.rs index bd5ea84..52a76d0 100644 --- a/rust/hw/char/pl011/src/bindings.rs +++ b/rust/hw/char/pl011/src/bindings.rs @@ -20,6 +20,11 @@ //! `bindgen`-generated declarations. +use glib_sys::{ + gboolean, guint, GArray, GByteArray, GHashTable, GHashTableIter, GIOCondition, GList, + GMainContext, GPollFD, GPtrArray, GQueue, GSList, GSource, GSourceFunc, GString, +}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/hw/core/Cargo.toml b/rust/hw/core/Cargo.toml index 9a9aa51..ecfb564 100644 --- a/rust/hw/core/Cargo.toml +++ b/rust/hw/core/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys.workspace = true qemu_macros = { path = "../../qemu-macros" } common = { path = "../../common" } bql = { path = "../../bql" } diff --git a/rust/hw/core/meson.build b/rust/hw/core/meson.build index e1ae95e..1560dd2 100644 --- a/rust/hw/core/meson.build +++ b/rust/hw/core/meson.build @@ -59,7 +59,7 @@ _hwcore_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _chardev_rs, _migration_rs, _qom_rs, _system_rs, _util_rs], - dependencies: [qemu_macros, common_rs], + dependencies: [glib_sys_rs, qemu_macros, common_rs], ) hwcore_rs = declare_dependency(link_with: [_hwcore_rs], diff --git a/rust/hw/core/src/bindings.rs b/rust/hw/core/src/bindings.rs index 919c02b..65b9aae 100644 --- a/rust/hw/core/src/bindings.rs +++ b/rust/hw/core/src/bindings.rs @@ -20,6 +20,9 @@ use chardev::bindings::Chardev; use common::Zeroable; +use glib_sys::{ + GArray, GByteArray, GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList, GString, +}; use migration::bindings::VMStateDescription; use qom::bindings::ObjectClass; use system::bindings::MemoryRegion; diff --git a/rust/meson.build b/rust/meson.build index 695d5a6..6ba075c 100644 --- a/rust/meson.build +++ b/rust/meson.build @@ -2,12 +2,14 @@ subproject('anyhow-1-rs', required: true) subproject('bilge-0.2-rs', required: true) subproject('bilge-impl-0.2-rs', required: true) subproject('foreign-0.3-rs', required: true) +subproject('glib-sys-0.21-rs', required: true) subproject('libc-0.2-rs', required: true) anyhow_rs = dependency('anyhow-1-rs') bilge_rs = dependency('bilge-0.2-rs') bilge_impl_rs = 
dependency('bilge-impl-0.2-rs') foreign_rs = dependency('foreign-0.3-rs') +glib_sys_rs = dependency('glib-sys-0.21-rs') libc_rs = dependency('libc-0.2-rs') subproject('proc-macro2-1-rs', required: true) diff --git a/rust/migration/Cargo.toml b/rust/migration/Cargo.toml index 708bfaa..94504f3 100644 --- a/rust/migration/Cargo.toml +++ b/rust/migration/Cargo.toml @@ -15,6 +15,7 @@ rust-version.workspace = true [dependencies] common = { path = "../common" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/migration/meson.build b/rust/migration/meson.build index ddf5c2f..18be65c 100644 --- a/rust/migration/meson.build +++ b/rust/migration/meson.build @@ -38,7 +38,7 @@ _migration_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_util_rs], - dependencies: [common_rs], + dependencies: [common_rs, glib_sys_rs], ) migration_rs = declare_dependency(link_with: [_migration_rs], diff --git a/rust/migration/src/bindings.rs b/rust/migration/src/bindings.rs index 8ce13a9..24503eb 100644 --- a/rust/migration/src/bindings.rs +++ b/rust/migration/src/bindings.rs @@ -19,6 +19,7 @@ )] use common::Zeroable; +use glib_sys::{GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/qom/Cargo.toml b/rust/qom/Cargo.toml index 060ad2e..4be3c25 100644 --- a/rust/qom/Cargo.toml +++ b/rust/qom/Cargo.toml @@ -18,6 +18,7 @@ bql = { path = "../bql" } migration = { path = "../migration" } qemu_macros = { path = "../qemu-macros" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/qom/meson.build b/rust/qom/meson.build index 71fdac6..e50f418 100644 --- a/rust/qom/meson.build +++ b/rust/qom/meson.build @@ -29,7 +29,7 @@ _qom_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [common_rs, glib_sys_rs, qemu_macros], ) qom_rs = declare_dependency(link_with: [_qom_rs], dependencies: [qemu_macros, qom]) diff --git a/rust/qom/src/bindings.rs b/rust/qom/src/bindings.rs index 9ffff12..91de42f 100644 --- a/rust/qom/src/bindings.rs +++ b/rust/qom/src/bindings.rs @@ -18,6 +18,8 @@ clippy::too_many_arguments )] +use glib_sys::{GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/system/Cargo.toml b/rust/system/Cargo.toml index 7fd369b..186ea00 100644 --- a/rust/system/Cargo.toml +++ b/rust/system/Cargo.toml @@ -16,6 +16,7 @@ rust-version.workspace = true common = { path = "../common" } qom = { path = "../qom" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/system/meson.build b/rust/system/meson.build index 0859f39..73d6199 100644 --- a/rust/system/meson.build +++ b/rust/system/meson.build @@ -36,7 +36,7 @@ _system_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs, _qom_rs, _util_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [glib_sys_rs, common_rs, qemu_macros], ) system_rs = declare_dependency(link_with: [_system_rs], diff --git a/rust/system/src/bindings.rs b/rust/system/src/bindings.rs index 43edd98..6cbb588 100644 --- a/rust/system/src/bindings.rs +++ b/rust/system/src/bindings.rs @@ -19,6 +19,10 @@ )] use common::Zeroable; +use glib_sys::{ + guint, 
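+ /* These imports exist so that the unqualified glib type names used by the bindgen-generated bindings.inc.rs included below resolve to the canonical glib-sys definitions instead of locally generated duplicates. */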
GArray, GByteArray, GHashTable, GHashTableIter, GList, GPollFD, GPtrArray, GQueue, + GSList, GString, +}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/util/Cargo.toml b/rust/util/Cargo.toml index 1f6767e..85f9143 100644 --- a/rust/util/Cargo.toml +++ b/rust/util/Cargo.toml @@ -15,6 +15,7 @@ rust-version.workspace = true [dependencies] anyhow = { workspace = true } foreign = { workspace = true } +glib-sys = { workspace = true } libc = { workspace = true } common = { path = "../common" } diff --git a/rust/util/meson.build b/rust/util/meson.build index 094b433..b0b75e9 100644 --- a/rust/util/meson.build +++ b/rust/util/meson.build @@ -40,7 +40,7 @@ _util_rs = static_library( ), override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', - dependencies: [anyhow_rs, libc_rs, foreign_rs, common_rs, qom, qemuutil], + dependencies: [anyhow_rs, libc_rs, foreign_rs, glib_sys_rs, common_rs, qom, qemuutil], ) util_rs = declare_dependency(link_with: [_util_rs], dependencies: [qemuutil, qom]) diff --git a/rust/util/src/bindings.rs b/rust/util/src/bindings.rs index 9ffff12..c277a29 100644 --- a/rust/util/src/bindings.rs +++ b/rust/util/src/bindings.rs @@ -18,6 +18,8 @@ clippy::too_many_arguments )] +use glib_sys::{guint, GList, GPollFD, GQueue, GSList, GString}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/scripts/archive-source.sh b/scripts/archive-source.sh index a725dd9..8f97b19 100755 --- a/scripts/archive-source.sh +++ b/scripts/archive-source.sh @@ -36,6 +36,7 @@ subprojects=( bilge-impl-0.2-rs either-1-rs foreign-0.3-rs + glib-sys-0.21-rs itertools-0.11-rs keycodemapdb libc-0.2-rs diff --git a/scripts/ci/setup/gitlab-runner.yml b/scripts/ci/setup/gitlab-runner.yml index 57e7fae..7025935 100644 --- a/scripts/ci/setup/gitlab-runner.yml +++ b/scripts/ci/setup/gitlab-runner.yml @@ -16,7 +16,7 @@ tasks: - debug: msg: 'Checking for a valid GitLab registration token' - failed_when: "gitlab_runner_registration_token == 'PLEASE_PROVIDE_A_VALID_TOKEN'" + failed_when: "gitlab_runner_authentication_token == 'PLEASE_PROVIDE_A_VALID_TOKEN'" - name: Create a group for the gitlab-runner service group: @@ -95,15 +95,7 @@ # Register Runners - name: Register the gitlab-runner - command: "/usr/bin/gitlab-runner register --non-interactive --url {{ gitlab_runner_server_url }} --registration-token {{ gitlab_runner_registration_token }} --executor shell --tag-list {{ ansible_facts[\"architecture\"] }},{{ ansible_facts[\"distribution\"]|lower }}_{{ ansible_facts[\"distribution_version\"] }} --description '{{ ansible_facts[\"distribution\"] }} {{ ansible_facts[\"distribution_version\"] }} {{ ansible_facts[\"architecture\"] }} ({{ ansible_facts[\"os_family\"] }})'" - - # The secondary runner will still run under the single gitlab-runner service - - name: Register secondary gitlab-runner - command: "/usr/bin/gitlab-runner register --non-interactive --url {{ gitlab_runner_server_url }} --registration-token {{ gitlab_runner_registration_token }} --executor shell --tag-list aarch32,{{ ansible_facts[\"distribution\"]|lower }}_{{ ansible_facts[\"distribution_version\"] }} --description '{{ ansible_facts[\"distribution\"] }} {{ ansible_facts[\"distribution_version\"] }} {{ ansible_facts[\"architecture\"] }} ({{ ansible_facts[\"os_family\"] }})'" - when: - - ansible_facts['distribution'] == 'Ubuntu' - - ansible_facts['architecture'] == 'aarch64' - - ansible_facts['distribution_version'] == '22.04' + command: "/usr/bin/gitlab-runner register --non-interactive --url {{ 
gitlab_runner_server_url }} --token {{ gitlab_runner_authentication_token }} --executor shell" - name: Install the gitlab-runner service using its own functionality command: "/usr/bin/gitlab-runner install --user gitlab-runner --working-directory /home/gitlab-runner" diff --git a/scripts/ci/setup/ubuntu/build-environment.yml b/scripts/ci/setup/ubuntu/build-environment.yml index 56b5160..6042750 100644 --- a/scripts/ci/setup/ubuntu/build-environment.yml +++ b/scripts/ci/setup/ubuntu/build-environment.yml @@ -35,19 +35,19 @@ # the package lists are updated by "make lcitool-refresh" - name: Include package lists based on OS and architecture include_vars: - file: "ubuntu-2204-{{ ansible_facts['architecture'] }}.yaml" + file: "ubuntu-2404-{{ ansible_facts['architecture'] }}.yaml" when: - ansible_facts['distribution'] == 'Ubuntu' - - ansible_facts['distribution_version'] == '22.04' + - ansible_facts['distribution_version'] == '24.04' - - name: Install packages for QEMU on Ubuntu 22.04 + - name: Install packages for QEMU on Ubuntu 24.04 package: name: "{{ packages }}" when: - ansible_facts['distribution'] == 'Ubuntu' - - ansible_facts['distribution_version'] == '22.04' + - ansible_facts['distribution_version'] == '24.04' - - name: Install armhf cross-compile packages to build QEMU on AArch64 Ubuntu 22.04 + - name: Install armhf cross-compile packages to build QEMU on AArch64 Ubuntu 24.04 package: name: - binutils-arm-linux-gnueabihf @@ -62,6 +62,6 @@ - zlib1g-dev:armhf when: - ansible_facts['distribution'] == 'Ubuntu' - - ansible_facts['distribution_version'] == '22.04' + - ansible_facts['distribution_version'] == '24.04' - ansible_facts['architecture'] == 'aarch64' diff --git a/scripts/ci/setup/ubuntu/ubuntu-2204-aarch64.yaml b/scripts/ci/setup/ubuntu/ubuntu-2404-aarch64.yaml index 2ca4a53..ce632d9 100644 --- a/scripts/ci/setup/ubuntu/ubuntu-2204-aarch64.yaml +++ b/scripts/ci/setup/ubuntu/ubuntu-2404-aarch64.yaml @@ -1,12 +1,13 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool variables --host-arch aarch64 ubuntu-2204 qemu +# $ lcitool variables --host-arch aarch64 ubuntu-2404 qemu # # https://gitlab.com/libvirt/libvirt-ci packages: - bash - bc + - bindgen - bison - bsdextrautils - bzip2 @@ -92,6 +93,7 @@ packages: - libvdeplug-dev - libvirglrenderer-dev - libvte-2.91-dev + - libxdp-dev - libxen-dev - libzstd-dev - llvm diff --git a/scripts/ci/setup/ubuntu/ubuntu-2204-s390x.yaml b/scripts/ci/setup/ubuntu/ubuntu-2404-s390x.yaml index 7198fbb..f45f75c 100644 --- a/scripts/ci/setup/ubuntu/ubuntu-2204-s390x.yaml +++ b/scripts/ci/setup/ubuntu/ubuntu-2404-s390x.yaml @@ -1,12 +1,13 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool variables --host-arch s390x ubuntu-2204 qemu +# $ lcitool variables --host-arch s390x ubuntu-2404 qemu # # https://gitlab.com/libvirt/libvirt-ci packages: - bash - bc + - bindgen - bison - bsdextrautils - bzip2 @@ -91,6 +92,7 @@ packages: - libvdeplug-dev - libvirglrenderer-dev - libvte-2.91-dev + - libxdp-dev - libzstd-dev - llvm - locales diff --git a/scripts/ci/setup/vars.yml.template b/scripts/ci/setup/vars.yml.template index 4b355fb..e9ddc05 100644 --- a/scripts/ci/setup/vars.yml.template +++ b/scripts/ci/setup/vars.yml.template @@ -6,5 +6,6 @@ ansible_to_gitlab_arch: x86_64: amd64 aarch64: arm64 s390x: s390x -# A unique token made available by GitLab to your project for registering runners -gitlab_runner_registration_token: PLEASE_PROVIDE_A_VALID_TOKEN +# A unique token obtained from GitLab for each runner +# see: https://gitlab.com/PROJECT/REPO/-/runners/new
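+# For illustration only: a freshly created runner yields a token of the form "glrt-...", which the gitlab-runner.yml playbook above passes to "gitlab-runner register --token".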
+gitlab_runner_authentication_token: PLEASE_PROVIDE_A_VALID_TOKEN diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 0ebe6bc..3d0d132 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -58,6 +58,7 @@ meson_options_help() { printf "%s\n" ' --enable-ubsan enable undefined behaviour sanitizer' printf "%s\n" ' --firmwarepath=VALUES search PATH for firmware files [share/qemu-' printf "%s\n" ' firmware]' + printf "%s\n" ' --gdb=VALUE Path to GDB' printf "%s\n" ' --iasl=VALUE Path to ACPI disassembler' printf "%s\n" ' --includedir=VALUE Header file directory [include]' printf "%s\n" ' --interp-prefix=VALUE where to find shared libraries etc., use %M for' @@ -154,6 +155,7 @@ meson_options_help() { printf "%s\n" ' membarrier membarrier system call (for Linux 4.14+ or Windows' printf "%s\n" ' modules modules support (non Windows)' printf "%s\n" ' mpath Multipath persistent reservation passthrough' + printf "%s\n" ' mshv MSHV acceleration support' printf "%s\n" ' multiprocess Out of process device emulation support' printf "%s\n" ' netmap netmap network backend support' printf "%s\n" ' nettle nettle cryptography support' @@ -323,6 +325,7 @@ _meson_option_parse() { --disable-fuzzing) printf "%s" -Dfuzzing=false ;; --enable-gcrypt) printf "%s" -Dgcrypt=enabled ;; --disable-gcrypt) printf "%s" -Dgcrypt=disabled ;; + --gdb=*) quote_sh "-Dgdb=$2" ;; --enable-gettext) printf "%s" -Dgettext=enabled ;; --disable-gettext) printf "%s" -Dgettext=disabled ;; --enable-gio) printf "%s" -Dgio=enabled ;; @@ -408,6 +411,8 @@ _meson_option_parse() { --disable-modules) printf "%s" -Dmodules=disabled ;; --enable-mpath) printf "%s" -Dmpath=enabled ;; --disable-mpath) printf "%s" -Dmpath=disabled ;; + --enable-mshv) printf "%s" -Dmshv=enabled ;; + --disable-mshv) printf "%s" -Dmshv=disabled ;; --enable-multiprocess) printf "%s" -Dmultiprocess=enabled ;; --disable-multiprocess) printf "%s" -Dmultiprocess=disabled ;; --enable-netmap) printf "%s" -Dnetmap=enabled ;; diff --git a/scripts/rust/rust_root_crate.sh b/scripts/rust/rust_root_crate.sh index 975bddf..f05b8d0 100755 --- a/scripts/rust/rust_root_crate.sh +++ b/scripts/rust/rust_root_crate.sh @@ -4,7 +4,7 @@ set -eu cat <<EOF // @generated -// This file is autogenerated by scripts/rust_root_crate.sh +// This file is autogenerated by scripts/rust/rust_root_crate.sh EOF diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 64c0d7c..844d9cb 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -196,7 +196,7 @@ rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" for header in const.h stddef.h kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \ psci.h psp-sev.h userfaultfd.h memfd.h mman.h nvme_ioctl.h \ - vduse.h iommufd.h bits.h; do + vduse.h iommufd.h bits.h mshv.h; do cp "$hdrdir/include/linux/$header" "$output/linux-headers/linux" done diff --git a/subprojects/.gitignore b/subprojects/.gitignore index 58a29f0..c00c847 100644 --- a/subprojects/.gitignore +++ b/subprojects/.gitignore @@ -6,21 +6,22 @@ /keycodemapdb /libvfio-user /slirp -/anyhow-1.0.98 -/arbitrary-int-1.2.7 -/attrs-0.2.9 -/bilge-0.2.0 -/bilge-impl-0.2.0 -/either-1.12.0 -/foreign-0.3.1 -/itertools-0.11.0 -/libc-0.2.162 -/proc-macro-error-1.0.4 -/proc-macro-error-attr-1.0.4 -/proc-macro2-1.0.95 -/quote-1.0.36 -/syn-2.0.66 -/unicode-ident-1.0.12 +/anyhow-* +/arbitrary-int-* +/attrs-* +/bilge-* +/bilge-impl-* +/either-* +/foreign-* +/glib-sys-* +/itertools-* +/libc-* 
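+# (wildcarded so that routine crate version bumps no longer require editing this file)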
+/proc-macro-error-* +/proc-macro-error-attr-* +/proc-macro2-* +/quote-* +/syn-* +/unicode-ident-* # Workaround for Meson v1.9.0 https://github.com/mesonbuild/meson/issues/14948 /.wraplock diff --git a/subprojects/glib-sys-0.21-rs.wrap b/subprojects/glib-sys-0.21-rs.wrap new file mode 100644 index 0000000..313ced7 --- /dev/null +++ b/subprojects/glib-sys-0.21-rs.wrap @@ -0,0 +1,7 @@ +[wrap-file] +directory = glib-sys-0.21.2 +source_url = https://crates.io/api/v1/crates/glib-sys/0.21.2/download +source_filename = glib-sys-0.21.2.tar.gz +source_hash = d09d3d0fddf7239521674e57b0465dfbd844632fec54f059f7f56112e3f927e1 +#method = cargo +patch_directory = glib-sys-0.21-rs diff --git a/subprojects/packagefiles/glib-sys-0.21-rs/meson.build b/subprojects/packagefiles/glib-sys-0.21-rs/meson.build new file mode 100644 index 0000000..8c54833 --- /dev/null +++ b/subprojects/packagefiles/glib-sys-0.21-rs/meson.build @@ -0,0 +1,33 @@ +project('glib-sys-0.21-rs', 'rust', + meson_version: '>=1.5.0', + version: '0.21.2', + license: 'MIT', + default_options: []) + +subproject('libc-0.2-rs', required: true) +libc_rs = dependency('libc-0.2-rs') + +_glib_sys_rs = static_library( + 'glib_sys', + files('src/lib.rs'), + gnu_symbol_visibility: 'hidden', + override_options: ['rust_std=2021', 'build.rust_std=2021'], + rust_abi: 'rust', + rust_args: [ + '--cap-lints', 'allow', + '--cfg', 'feature="v2_66"', + '--cfg', 'feature="v2_64"', + '--cfg', 'feature="v2_62"', + '--cfg', 'feature="v2_60"', + '--cfg', 'feature="v2_58"', + ], + # should also link with glib; don't bother doing it here since all + # QEMU targets have it + dependencies: [libc_rs], +) + +glib_sys_dep = declare_dependency( + link_with: _glib_sys_rs, +) + +meson.override_dependency('glib-sys-0.21-rs', glib_sys_dep) diff --git a/target/arm/cpregs-gcs.c b/target/arm/cpregs-gcs.c new file mode 100644 index 0000000..1ed52a2 --- /dev/null +++ b/target/arm/cpregs-gcs.c @@ -0,0 +1,156 @@ +/* + * QEMU ARM GCS registers and instructions + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/timer.h" +#include "exec/icount.h" +#include "hw/irq.h" +#include "cpu.h" +#include "cpu-features.h" +#include "cpregs.h" +#include "internals.h" + + +static CPAccessResult access_gcs(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_current_el(env) < 3 + && arm_feature(env, ARM_FEATURE_EL3) + && !(env->cp15.scr_el3 & SCR_GCSEN)) { + return CP_ACCESS_TRAP_EL3; + } + return CP_ACCESS_OK; +} + +static CPAccessResult access_gcs_el0(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_current_el(env) == 0 && !(env->cp15.gcscr_el[0] & GCSCRE0_NTR)) { + return CP_ACCESS_TRAP_EL1; + } + return access_gcs(env, ri, isread); +} + +static void gcspr_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + /* + * Bits [2:0] are RES0, so we might as well clear them now, + * rather than upon each usage a-la GetCurrentGCSPointer. + */ + raw_write(env, ri, value & ~7); +} + +static CPAccessResult access_gcspushm(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + int el = arm_current_el(env); + if (!(env->cp15.gcscr_el[el] & GCSCR_PUSHMEN)) { + return CP_ACCESS_TRAP_BIT | (el ? el : 1); + } + return CP_ACCESS_OK; +} + +static CPAccessResult access_gcspushx(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + /* Trap if lock not taken, and enabled. 
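A failed check is reported via CP_ACCESS_EXLOCK, a GCS exception taken to the current EL; see the CPAccessResult definition in cpregs.h.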
*/ + if (!(env->pstate & PSTATE_EXLOCK)) { + int el = arm_current_el(env); + if (env->cp15.gcscr_el[el] & GCSCR_EXLOCKEN) { + return CP_ACCESS_EXLOCK; + } + } + return CP_ACCESS_OK; +} + +static CPAccessResult access_gcspopcx(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + /* Trap if lock taken, and enabled. */ + if (env->pstate & PSTATE_EXLOCK) { + int el = arm_current_el(env); + if (env->cp15.gcscr_el[el] & GCSCR_EXLOCKEN) { + return CP_ACCESS_EXLOCK; + } + } + return CP_ACCESS_OK; +} + +static const ARMCPRegInfo gcs_reginfo[] = { + { .name = "GCSCRE0_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 2, .crm = 5, .opc2 = 2, + .access = PL1_RW, .accessfn = access_gcs, .fgt = FGT_NGCS_EL0, + .fieldoffset = offsetof(CPUARMState, cp15.gcscr_el[0]) }, + { .name = "GCSCR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 2, .crm = 5, .opc2 = 0, + .access = PL1_RW, .accessfn = access_gcs, .fgt = FGT_NGCS_EL1, + .nv2_redirect_offset = 0x8d0 | NV2_REDIR_NV1, + .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 2, 5, 0), + .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 2, 5, 0), + .fieldoffset = offsetof(CPUARMState, cp15.gcscr_el[1]) }, + { .name = "GCSCR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 5, .opc2 = 0, + .access = PL2_RW, .accessfn = access_gcs, + .fieldoffset = offsetof(CPUARMState, cp15.gcscr_el[2]) }, + { .name = "GCSCR_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .crn = 2, .crm = 5, .opc2 = 0, + .access = PL3_RW, + .fieldoffset = offsetof(CPUARMState, cp15.gcscr_el[3]) }, + + { .name = "GCSPR_EL0", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 2, .crm = 5, .opc2 = 1, + .access = PL0_R | PL1_W, .accessfn = access_gcs_el0, + .fgt = FGT_NGCS_EL0, .writefn = gcspr_write, + .fieldoffset = offsetof(CPUARMState, cp15.gcspr_el[0]) }, + { .name = "GCSPR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 2, .crm = 5, .opc2 = 1, + .access = PL1_RW, .accessfn = access_gcs, + .fgt = FGT_NGCS_EL1, .writefn = gcspr_write, + .nv2_redirect_offset = 0x8c0 | NV2_REDIR_NV1, + .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 2, 5, 1), + .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 2, 5, 1), + .fieldoffset = offsetof(CPUARMState, cp15.gcspr_el[1]) }, + { .name = "GCSPR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 5, .opc2 = 1, + .access = PL2_RW, .accessfn = access_gcs, .writefn = gcspr_write, + .fieldoffset = offsetof(CPUARMState, cp15.gcspr_el[2]) }, + { .name = "GCSPR_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .crn = 2, .crm = 5, .opc2 = 1, + .access = PL3_RW, .writefn = gcspr_write, + .fieldoffset = offsetof(CPUARMState, cp15.gcspr_el[3]) }, + + { .name = "GCSPUSHM", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 7, .opc2 = 0, + .access = PL0_W, .accessfn = access_gcspushm, + .fgt = FGT_NGCSPUSHM_EL1, .type = ARM_CP_GCSPUSHM }, + { .name = "GCSPOPM", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 7, .opc2 = 1, + .access = PL0_R, .type = ARM_CP_GCSPOPM }, + { .name = "GCSSS1", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 7, .opc2 = 2, + .access = PL0_W, .type = ARM_CP_GCSSS1 }, + { .name = "GCSSS2", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 7, .opc2 = 3, + .access = PL0_R, .type = ARM_CP_GCSSS2 }, + { .name = "GCSPUSHX", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 7, .opc2 = 4, + .access = PL1_W, .accessfn = access_gcspushx, .fgt = 
FGT_NGCSEPP, + .type = ARM_CP_GCSPUSHX }, + { .name = "GCSPOPCX", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 7, .opc2 = 5, + .access = PL1_W, .accessfn = access_gcspopcx, .fgt = FGT_NGCSEPP, + .type = ARM_CP_GCSPOPCX }, + { .name = "GCSPOPX", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 7, .opc2 = 6, + .access = PL1_W, .type = ARM_CP_GCSPOPX }, +}; + +void define_gcs_cpregs(ARMCPU *cpu) +{ + if (cpu_isar_feature(aa64_gcs, cpu)) { + define_arm_cp_regs(cpu, gcs_reginfo); + } +} diff --git a/target/arm/cpregs.h b/target/arm/cpregs.h index 57fde5f..763de5e 100644 --- a/target/arm/cpregs.h +++ b/target/arm/cpregs.h @@ -47,6 +47,14 @@ enum { ARM_CP_DC_ZVA = 0x0005, ARM_CP_DC_GVA = 0x0006, ARM_CP_DC_GZVA = 0x0007, + /* Special: gcs instructions */ + ARM_CP_GCSPUSHM = 0x0008, + ARM_CP_GCSPOPM = 0x0009, + ARM_CP_GCSPUSHX = 0x000a, + ARM_CP_GCSPOPX = 0x000b, + ARM_CP_GCSPOPCX = 0x000c, + ARM_CP_GCSSS1 = 0x000d, + ARM_CP_GCSSS2 = 0x000e, /* Flag: reads produce resetvalue; writes ignored. */ ARM_CP_CONST = 1 << 4, @@ -136,6 +144,11 @@ enum { * identically to the normal one, other than FGT trapping handling.) */ ARM_CP_ADD_TLBI_NXS = 1 << 21, + /* + * Flag: even though this sysreg has opc1 == 4 or 5, it + * should not trap to EL2 when HCR_EL2.NV is set. + */ + ARM_CP_NV_NO_TRAP = 1 << 22, }; /* @@ -351,6 +364,14 @@ typedef enum CPAccessResult { * specified target EL. */ CP_ACCESS_UNDEFINED = (2 << 2), + + /* + * Access fails with EXLOCK, a GCS exception syndrome. + * These traps are always to the current execution EL, + * which is the same as the usual target EL because + * they cannot occur from EL0. + */ + CP_ACCESS_EXLOCK = (3 << 2), } CPAccessResult; /* Indexes into fgt_read[] */ @@ -779,8 +800,12 @@ typedef enum FGTBit { DO_BIT(HFGRTR, VBAR_EL1), DO_BIT(HFGRTR, ICC_IGRPENN_EL1), DO_BIT(HFGRTR, ERRIDR_EL1), + DO_REV_BIT(HFGRTR, NGCS_EL0), + DO_REV_BIT(HFGRTR, NGCS_EL1), DO_REV_BIT(HFGRTR, NSMPRI_EL1), DO_REV_BIT(HFGRTR, NTPIDR2_EL0), + DO_REV_BIT(HFGRTR, NPIRE0_EL1), + DO_REV_BIT(HFGRTR, NPIR_EL1), /* Trap bits in HDFGRTR_EL2 / HDFGWTR_EL2, starting from bit 0. */ DO_BIT(HDFGRTR, DBGBCRN_EL1), @@ -859,6 +884,8 @@ typedef enum FGTBit { DO_BIT(HFGITR, DVPRCTX), DO_BIT(HFGITR, CPPRCTX), DO_BIT(HFGITR, DCCVAC), + DO_REV_BIT(HFGITR, NGCSPUSHM_EL1), + DO_REV_BIT(HFGITR, NGCSEPP), DO_BIT(HFGITR, ATS1E1A), } FGTBit; @@ -1156,12 +1183,17 @@ static inline bool arm_cpreg_traps_in_nv(const ARMCPRegInfo *ri) * fragile to future new sysregs, but this seems the least likely * to break. * - * In particular, note that the released sysreg XML defines that - * the FEAT_MEC sysregs and instructions do not follow this FEAT_NV - * trapping rule, so we will need to add an ARM_CP_* flag to indicate - * "register does not trap on NV" to handle those if/when we implement - * FEAT_MEC. + * In particular, note that the FEAT_MEC sysregs and instructions + * are exceptions to this trapping rule, so they are marked as + * ARM_CP_NV_NO_TRAP to indicate that they should not be trapped + * to EL2. (They are an exception because the FEAT_MEC sysregs UNDEF + * unless in Realm, and Realm is not expected to be virtualized.) 
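The check below therefore tests ARM_CP_NV_NO_TRAP first, and only then falls back to the opc1 == 4 || opc1 == 5 heuristic.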
*/ + + if (ri->type & ARM_CP_NV_NO_TRAP) { + return false; + } + return ri->opc1 == 4 || ri->opc1 == 5; } diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h index f59c18b..37f1eca 100644 --- a/target/arm/cpu-features.h +++ b/target/arm/cpu-features.h @@ -1149,6 +1149,11 @@ static inline bool isar_feature_aa64_nmi(const ARMISARegisters *id) return FIELD_EX64_IDREG(id, ID_AA64PFR1, NMI) != 0; } +static inline bool isar_feature_aa64_gcs(const ARMISARegisters *id) +{ + return FIELD_EX64_IDREG(id, ID_AA64PFR1, GCS) != 0; +} + static inline bool isar_feature_aa64_tgran4_lpa2(const ARMISARegisters *id) { return FIELD_SEX64_IDREG(id, ID_AA64MMFR0, TGRAN4) >= 1; @@ -1349,6 +1354,21 @@ static inline bool isar_feature_aa64_sctlr2(const ARMISARegisters *id) return FIELD_EX64_IDREG(id, ID_AA64MMFR3, SCTLRX) != 0; } +static inline bool isar_feature_aa64_s1pie(const ARMISARegisters *id) +{ + return FIELD_EX64_IDREG(id, ID_AA64MMFR3, S1PIE) != 0; +} + +static inline bool isar_feature_aa64_s2pie(const ARMISARegisters *id) +{ + return FIELD_EX64_IDREG(id, ID_AA64MMFR3, S2PIE) != 0; +} + +static inline bool isar_feature_aa64_mec(const ARMISARegisters *id) +{ + return FIELD_EX64_IDREG(id, ID_AA64MMFR3, MEC) != 0; +} + static inline bool isar_feature_aa64_pmuv3p1(const ARMISARegisters *id) { return FIELD_EX64_IDREG(id, ID_AA64DFR0, PMUVER) >= 4 && diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 30e29fd..3b556f1 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -311,6 +311,10 @@ static void arm_cpu_reset_hold(Object *obj, ResetType type) env->cp15.mdscr_el1 |= 1 << 12; /* Enable FEAT_MOPS */ env->cp15.sctlr_el[1] |= SCTLR_MSCEN; + /* For Linux, GCSPR_EL0 is always readable. */ + if (cpu_isar_feature(aa64_gcs, cpu)) { + env->cp15.gcscr_el[0] = GCSCRE0_NTR; + } #else /* Reset into the highest available EL */ if (arm_feature(env, ARM_FEATURE_EL3)) { @@ -635,12 +639,22 @@ void arm_emulate_firmware_reset(CPUState *cpustate, int target_el) if (cpu_isar_feature(aa64_fgt, cpu)) { env->cp15.scr_el3 |= SCR_FGTEN; } + if (cpu_isar_feature(aa64_gcs, cpu)) { + env->cp15.scr_el3 |= SCR_GCSEN; + } if (cpu_isar_feature(aa64_tcr2, cpu)) { env->cp15.scr_el3 |= SCR_TCR2EN; } if (cpu_isar_feature(aa64_sctlr2, cpu)) { env->cp15.scr_el3 |= SCR_SCTLR2EN; } + if (cpu_isar_feature(aa64_s1pie, cpu) || + cpu_isar_feature(aa64_s2pie, cpu)) { + env->cp15.scr_el3 |= SCR_PIEN; + } + if (cpu_isar_feature(aa64_mec, cpu)) { + env->cp15.scr_el3 |= SCR_MECEN; + } } if (target_el == 2) { @@ -819,7 +833,7 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, int flags) { ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; - uint32_t psr = pstate_read(env); + uint64_t psr = pstate_read(env); int i, j; int el = arm_current_el(env); uint64_t hcr = arm_hcr_el2_eff(env); @@ -841,7 +855,7 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, int flags) } else { ns_status = ""; } - qemu_fprintf(f, "PSTATE=%08x %c%c%c%c %sEL%d%c", + qemu_fprintf(f, "PSTATE=%016" PRIx64 " %c%c%c%c %sEL%d%c", psr, psr & PSTATE_N ? 'N' : '-', psr & PSTATE_Z ? 'Z' : '-', @@ -858,7 +872,7 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, int flags) (FIELD_EX64(env->svcr, SVCR, SM) ? 'S' : '-')); } if (cpu_isar_feature(aa64_bti, cpu)) { - qemu_fprintf(f, " BTYPE=%d", (psr & PSTATE_BTYPE) >> 10); + qemu_fprintf(f, " BTYPE=%d", (int)(psr & PSTATE_BTYPE) >> 10); } qemu_fprintf(f, "%s%s%s", (hcr & HCR_NV) ? 
" NV" : "", diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 41414ac..1d4e133 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -33,6 +33,7 @@ #include "target/arm/multiprocessing.h" #include "target/arm/gtimer.h" #include "target/arm/cpu-sysregs.h" +#include "target/arm/mmuidx.h" #define EXCP_UDEF 1 /* undefined instruction */ #define EXCP_SWI 2 /* software interrupt */ @@ -267,7 +268,7 @@ typedef struct CPUArchState { uint64_t xregs[32]; uint64_t pc; /* PSTATE isn't an architectural register for ARMv8. However, it is - * convenient for us to assemble the underlying state into a 32 bit format + * convenient for us to assemble the underlying state into a 64 bit format * identical to the architectural format used for the SPSR. (This is also * what the Linux kernel's 'pstate' field in signal handlers and KVM's * 'pstate' register are.) Of the PSTATE bits: @@ -279,7 +280,7 @@ typedef struct CPUArchState { * SM and ZA are kept in env->svcr * all other bits are stored in their correct places in env->pstate */ - uint32_t pstate; + uint64_t pstate; bool aarch64; /* True if CPU is in aarch64 state; inverse of PSTATE.nRW */ bool thumb; /* True if CPU is in thumb mode; cpsr[5] */ @@ -368,6 +369,9 @@ typedef struct CPUArchState { uint64_t tcr2_el[3]; uint64_t vtcr_el2; /* Virtualization Translation Control. */ uint64_t vstcr_el2; /* Secure Virtualization Translation Control. */ + uint64_t pir_el[4]; /* PIRE0_EL1, PIR_EL1, PIR_EL2, PIR_EL3 */ + uint64_t pire0_el2; + uint64_t s2pir_el2; uint32_t c2_data; /* MPU data cacheable bits. */ uint32_t c2_insn; /* MPU instruction cacheable bits. */ union { /* MMU domain access control register @@ -576,6 +580,18 @@ typedef struct CPUArchState { /* NV2 register */ uint64_t vncr_el2; + + uint64_t gcscr_el[4]; /* GCSCRE0_EL1, GCSCR_EL[123] */ + uint64_t gcspr_el[4]; /* GCSPR_EL[0123] */ + + /* MEC registers */ + uint64_t mecid_p0_el2; + uint64_t mecid_a0_el2; + uint64_t mecid_p1_el2; + uint64_t mecid_a1_el2; + uint64_t mecid_rl_a_el3; + uint64_t vmecid_p_el2; + uint64_t vmecid_a_el2; } cp15; struct { @@ -630,13 +646,10 @@ typedef struct CPUArchState { * entry process. */ struct { - uint32_t syndrome; /* AArch64 format syndrome register */ - uint32_t fsr; /* AArch32 format fault status register info */ + uint64_t syndrome; /* AArch64 format syndrome register */ uint64_t vaddress; /* virtual addr associated with exception, if any */ + uint32_t fsr; /* AArch32 format fault status register info */ uint32_t target_el; /* EL the exception should be targeted for */ - /* If we implement EL2 we will also need to store information - * about the intermediate physical address for stage 2 faults. - */ } exception; /* Information associated with an SError */ @@ -1498,6 +1511,7 @@ void pmu_init(ARMCPU *cpu); #define PSTATE_C (1U << 29) #define PSTATE_Z (1U << 30) #define PSTATE_N (1U << 31) +#define PSTATE_EXLOCK (1ULL << 34) #define PSTATE_NZCV (PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V) #define PSTATE_DAIF (PSTATE_D | PSTATE_A | PSTATE_I | PSTATE_F) #define CACHED_PSTATE_BITS (PSTATE_NZCV | PSTATE_DAIF | PSTATE_BTYPE) @@ -1534,7 +1548,7 @@ static inline unsigned int aarch64_pstate_mode(unsigned int el, bool handler) * interprocessing, so we don't attempt to sync with the cpsr state used by * the 32 bit decoder. 
*/ -static inline uint32_t pstate_read(CPUARMState *env) +static inline uint64_t pstate_read(CPUARMState *env) { int ZF; @@ -1544,7 +1558,7 @@ static inline uint32_t pstate_read(CPUARMState *env) | env->pstate | env->daif | (env->btype << 10); } -static inline void pstate_write(CPUARMState *env, uint32_t val) +static inline void pstate_write(CPUARMState *env, uint64_t val) { env->ZF = (~val) & PSTATE_Z; env->NF = val; @@ -1716,13 +1730,24 @@ static inline void xpsr_write(CPUARMState *env, uint32_t val, uint32_t mask) #define SCR_ENAS0 (1ULL << 36) #define SCR_ADEN (1ULL << 37) #define SCR_HXEN (1ULL << 38) +#define SCR_GCSEN (1ULL << 39) #define SCR_TRNDR (1ULL << 40) #define SCR_ENTP2 (1ULL << 41) #define SCR_TCR2EN (1ULL << 43) #define SCR_SCTLR2EN (1ULL << 44) +#define SCR_PIEN (1ULL << 45) #define SCR_GPF (1ULL << 48) +#define SCR_MECEN (1ULL << 49) #define SCR_NSE (1ULL << 62) +/* GCSCR_ELx fields */ +#define GCSCR_PCRSEL (1ULL << 0) +#define GCSCR_RVCHKEN (1ULL << 5) +#define GCSCR_EXLOCKEN (1ULL << 6) +#define GCSCR_PUSHMEN (1ULL << 8) +#define GCSCR_STREN (1ULL << 9) +#define GCSCRE0_NTR (1ULL << 10) + /* Return the current FPSCR value. */ uint32_t vfp_get_fpscr(CPUARMState *env); void vfp_set_fpscr(CPUARMState *env, uint32_t val); @@ -2221,6 +2246,7 @@ static inline bool arm_is_el2_enabled(CPUARMState *env) */ uint64_t arm_hcr_el2_eff_secstate(CPUARMState *env, ARMSecuritySpace space); uint64_t arm_hcr_el2_eff(CPUARMState *env); +uint64_t arm_hcr_el2_nvx_eff(CPUARMState *env); uint64_t arm_hcrx_el2_eff(CPUARMState *env); /* @@ -2300,212 +2326,6 @@ bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync); #define TYPE_ARM_HOST_CPU "host-" TYPE_ARM_CPU -/* ARM has the following "translation regimes" (as the ARM ARM calls them): - * - * If EL3 is 64-bit: - * + NonSecure EL1 & 0 stage 1 - * + NonSecure EL1 & 0 stage 2 - * + NonSecure EL2 - * + NonSecure EL2 & 0 (ARMv8.1-VHE) - * + Secure EL1 & 0 stage 1 - * + Secure EL1 & 0 stage 2 (FEAT_SEL2) - * + Secure EL2 (FEAT_SEL2) - * + Secure EL2 & 0 (FEAT_SEL2) - * + Realm EL1 & 0 stage 1 (FEAT_RME) - * + Realm EL1 & 0 stage 2 (FEAT_RME) - * + Realm EL2 (FEAT_RME) - * + EL3 - * If EL3 is 32-bit: - * + NonSecure PL1 & 0 stage 1 - * + NonSecure PL1 & 0 stage 2 - * + NonSecure PL2 - * + Secure PL1 & 0 - * (reminder: for 32 bit EL3, Secure PL1 is *EL3*, not EL1.) - * - * For QEMU, an mmu_idx is not quite the same as a translation regime because: - * 1. we need to split the "EL1 & 0" and "EL2 & 0" regimes into two mmu_idxes, - * because they may differ in access permissions even if the VA->PA map is - * the same - * 2. we want to cache in our TLB the full VA->IPA->PA lookup for a stage 1+2 - * translation, which means that we have one mmu_idx that deals with two - * concatenated translation regimes [this sort of combined s1+2 TLB is - * architecturally permitted] - * 3. we don't need to allocate an mmu_idx to translations that we won't be - * handling via the TLB. The only way to do a stage 1 translation without - * the immediate stage 2 translation is via the ATS or AT system insns, - * which can be slow-pathed and always do a page table walk. - * The only use of stage 2 translations is either as part of an s1+2 - * lookup or when loading the descriptors during a stage 1 page table walk, - * and in both those cases we don't use the TLB. - * 4. we can also safely fold together the "32 bit EL3" and "64 bit EL3" - * translation regimes, because they map reasonably well to each other - * and they can't both be active at the same time. - * 5. 
we want to be able to use the TLB for accesses done as part of a - * stage1 page table walk, rather than having to walk the stage2 page - * table over and over. - * 6. we need separate EL1/EL2 mmu_idx for handling the Privileged Access - * Never (PAN) bit within PSTATE. - * 7. we fold together most secure and non-secure regimes for A-profile, - * because there are no banked system registers for aarch64, so the - * process of switching between secure and non-secure is - * already heavyweight. - * 8. we cannot fold together Stage 2 Secure and Stage 2 NonSecure, - * because both are in use simultaneously for Secure EL2. - * - * This gives us the following list of cases: - * - * EL0 EL1&0 stage 1+2 (aka NS PL0 PL1&0 stage 1+2) - * EL1 EL1&0 stage 1+2 (aka NS PL1 PL1&0 stage 1+2) - * EL1 EL1&0 stage 1+2 +PAN (aka NS PL1 P1&0 stage 1+2 +PAN) - * EL0 EL2&0 - * EL2 EL2&0 - * EL2 EL2&0 +PAN - * EL2 (aka NS PL2) - * EL3 (aka AArch32 S PL1 PL1&0) - * AArch32 S PL0 PL1&0 (we call this EL30_0) - * AArch32 S PL1 PL1&0 +PAN (we call this EL30_3_PAN) - * Stage2 Secure - * Stage2 NonSecure - * plus one TLB per Physical address space: S, NS, Realm, Root - * - * for a total of 16 different mmu_idx. - * - * R profile CPUs have an MPU, but can use the same set of MMU indexes - * as A profile. They only need to distinguish EL0 and EL1 (and - * EL2 for cores like the Cortex-R52). - * - * M profile CPUs are rather different as they do not have a true MMU. - * They have the following different MMU indexes: - * User - * Privileged - * User, execution priority negative (ie the MPU HFNMIENA bit may apply) - * Privileged, execution priority negative (ditto) - * If the CPU supports the v8M Security Extension then there are also: - * Secure User - * Secure Privileged - * Secure User, execution priority negative - * Secure Privileged, execution priority negative - * - * The ARMMMUIdx and the mmu index value used by the core QEMU TLB code - * are not quite the same -- different CPU types (most notably M profile - * vs A/R profile) would like to use MMU indexes with different semantics, - * but since we don't ever need to use all of those in a single CPU we - * can avoid having to set NB_MMU_MODES to "total number of A profile MMU - * modes + total number of M profile MMU modes". The lower bits of - * ARMMMUIdx are the core TLB mmu index, and the higher bits are always - * the same for any particular CPU. - * Variables of type ARMMUIdx are always full values, and the core - * index values are in variables of type 'int'. - * - * Our enumeration includes at the end some entries which are not "true" - * mmu_idx values in that they don't have corresponding TLBs and are only - * valid for doing slow path page table walks. - * - * The constant names here are patterned after the general style of the names - * of the AT/ATS operations. - * The values used are carefully arranged to make mmu_idx => EL lookup easy. - * For M profile we arrange them to have a bit for priv, a bit for negpri - * and a bit for secure. - */ -#define ARM_MMU_IDX_A 0x10 /* A profile */ -#define ARM_MMU_IDX_NOTLB 0x20 /* does not have a TLB */ -#define ARM_MMU_IDX_M 0x40 /* M profile */ - -/* Meanings of the bits for M profile mmu idx values */ -#define ARM_MMU_IDX_M_PRIV 0x1 -#define ARM_MMU_IDX_M_NEGPRI 0x2 -#define ARM_MMU_IDX_M_S 0x4 /* Secure */ - -#define ARM_MMU_IDX_TYPE_MASK \ - (ARM_MMU_IDX_A | ARM_MMU_IDX_M | ARM_MMU_IDX_NOTLB) -#define ARM_MMU_IDX_COREIDX_MASK 0xf - -typedef enum ARMMMUIdx { - /* - * A-profile. 
- */ - ARMMMUIdx_E10_0 = 0 | ARM_MMU_IDX_A, - ARMMMUIdx_E20_0 = 1 | ARM_MMU_IDX_A, - ARMMMUIdx_E10_1 = 2 | ARM_MMU_IDX_A, - ARMMMUIdx_E20_2 = 3 | ARM_MMU_IDX_A, - ARMMMUIdx_E10_1_PAN = 4 | ARM_MMU_IDX_A, - ARMMMUIdx_E20_2_PAN = 5 | ARM_MMU_IDX_A, - ARMMMUIdx_E2 = 6 | ARM_MMU_IDX_A, - ARMMMUIdx_E3 = 7 | ARM_MMU_IDX_A, - ARMMMUIdx_E30_0 = 8 | ARM_MMU_IDX_A, - ARMMMUIdx_E30_3_PAN = 9 | ARM_MMU_IDX_A, - - /* - * Used for second stage of an S12 page table walk, or for descriptor - * loads during first stage of an S1 page table walk. Note that both - * are in use simultaneously for SecureEL2: the security state for - * the S2 ptw is selected by the NS bit from the S1 ptw. - */ - ARMMMUIdx_Stage2_S = 10 | ARM_MMU_IDX_A, - ARMMMUIdx_Stage2 = 11 | ARM_MMU_IDX_A, - - /* TLBs with 1-1 mapping to the physical address spaces. */ - ARMMMUIdx_Phys_S = 12 | ARM_MMU_IDX_A, - ARMMMUIdx_Phys_NS = 13 | ARM_MMU_IDX_A, - ARMMMUIdx_Phys_Root = 14 | ARM_MMU_IDX_A, - ARMMMUIdx_Phys_Realm = 15 | ARM_MMU_IDX_A, - - /* - * These are not allocated TLBs and are used only for AT system - * instructions or for the first stage of an S12 page table walk. - */ - ARMMMUIdx_Stage1_E0 = 0 | ARM_MMU_IDX_NOTLB, - ARMMMUIdx_Stage1_E1 = 1 | ARM_MMU_IDX_NOTLB, - ARMMMUIdx_Stage1_E1_PAN = 2 | ARM_MMU_IDX_NOTLB, - - /* - * M-profile. - */ - ARMMMUIdx_MUser = ARM_MMU_IDX_M, - ARMMMUIdx_MPriv = ARM_MMU_IDX_M | ARM_MMU_IDX_M_PRIV, - ARMMMUIdx_MUserNegPri = ARMMMUIdx_MUser | ARM_MMU_IDX_M_NEGPRI, - ARMMMUIdx_MPrivNegPri = ARMMMUIdx_MPriv | ARM_MMU_IDX_M_NEGPRI, - ARMMMUIdx_MSUser = ARMMMUIdx_MUser | ARM_MMU_IDX_M_S, - ARMMMUIdx_MSPriv = ARMMMUIdx_MPriv | ARM_MMU_IDX_M_S, - ARMMMUIdx_MSUserNegPri = ARMMMUIdx_MUserNegPri | ARM_MMU_IDX_M_S, - ARMMMUIdx_MSPrivNegPri = ARMMMUIdx_MPrivNegPri | ARM_MMU_IDX_M_S, -} ARMMMUIdx; - -/* - * Bit macros for the core-mmu-index values for each index, - * for use when calling tlb_flush_by_mmuidx() and friends. - */ -#define TO_CORE_BIT(NAME) \ - ARMMMUIdxBit_##NAME = 1 << (ARMMMUIdx_##NAME & ARM_MMU_IDX_COREIDX_MASK) - -typedef enum ARMMMUIdxBit { - TO_CORE_BIT(E10_0), - TO_CORE_BIT(E20_0), - TO_CORE_BIT(E10_1), - TO_CORE_BIT(E10_1_PAN), - TO_CORE_BIT(E2), - TO_CORE_BIT(E20_2), - TO_CORE_BIT(E20_2_PAN), - TO_CORE_BIT(E3), - TO_CORE_BIT(E30_0), - TO_CORE_BIT(E30_3_PAN), - TO_CORE_BIT(Stage2), - TO_CORE_BIT(Stage2_S), - - TO_CORE_BIT(MUser), - TO_CORE_BIT(MPriv), - TO_CORE_BIT(MUserNegPri), - TO_CORE_BIT(MPrivNegPri), - TO_CORE_BIT(MSUser), - TO_CORE_BIT(MSPriv), - TO_CORE_BIT(MSUserNegPri), - TO_CORE_BIT(MSPrivNegPri), -} ARMMMUIdxBit; - -#undef TO_CORE_BIT - -#define MMU_USER_IDX 0 - /* Indexes used when registering address spaces with cpu_address_space_init */ typedef enum ARMASIdx { ARMASIdx_NS = 0, @@ -2667,6 +2487,9 @@ FIELD(TBFLAG_A64, NV2_MEM_BE, 36, 1) FIELD(TBFLAG_A64, AH, 37, 1) /* FPCR.AH */ FIELD(TBFLAG_A64, NEP, 38, 1) /* FPCR.NEP */ FIELD(TBFLAG_A64, ZT0EXC_EL, 39, 2) +FIELD(TBFLAG_A64, GCS_EN, 41, 1) +FIELD(TBFLAG_A64, GCS_RVCEN, 42, 1) +FIELD(TBFLAG_A64, GCSSTR_EL, 43, 2) /* * Helpers for using the above. Note that only the A64 accessors use diff --git a/target/arm/gdbstub64.c b/target/arm/gdbstub64.c index 3bccde2..65d6bbe 100644 --- a/target/arm/gdbstub64.c +++ b/target/arm/gdbstub64.c @@ -47,6 +47,7 @@ int aarch64_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n) case 32: return gdb_get_reg64(mem_buf, env->pc); case 33: + /* pstate is now a 64-bit value; can we simply adjust the xml? */ return gdb_get_reg32(mem_buf, pstate_read(env)); } /* Unknown register. 
*/ @@ -75,6 +76,7 @@ int aarch64_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) return 8; case 33: /* CPSR */ + /* pstate is now a 64-bit value; can we simply adjust the xml? */ pstate_write(env, tmp); return 4; } diff --git a/target/arm/helper.c b/target/arm/helper.c index b7bf45a..167f290 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -420,7 +420,9 @@ int alle1_tlbmask(CPUARMState *env) */ return (ARMMMUIdxBit_E10_1 | ARMMMUIdxBit_E10_1_PAN | + ARMMMUIdxBit_E10_1_GCS | ARMMMUIdxBit_E10_0 | + ARMMMUIdxBit_E10_0_GCS | ARMMMUIdxBit_Stage2 | ARMMMUIdxBit_Stage2_S); } @@ -764,12 +766,22 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) if (cpu_isar_feature(aa64_ecv, cpu)) { valid_mask |= SCR_ECVEN; } + if (cpu_isar_feature(aa64_gcs, cpu)) { + valid_mask |= SCR_GCSEN; + } if (cpu_isar_feature(aa64_tcr2, cpu)) { valid_mask |= SCR_TCR2EN; } if (cpu_isar_feature(aa64_sctlr2, cpu)) { valid_mask |= SCR_SCTLR2EN; } + if (cpu_isar_feature(aa64_s1pie, cpu) || + cpu_isar_feature(aa64_s2pie, cpu)) { + valid_mask |= SCR_PIEN; + } + if (cpu_isar_feature(aa64_mec, cpu)) { + valid_mask |= SCR_MECEN; + } } else { valid_mask &= ~(SCR_RW | SCR_ST); if (cpu_isar_feature(aa32_ras, cpu)) { @@ -804,12 +816,17 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) */ if (changed & (SCR_NS | SCR_NSE)) { tlb_flush_by_mmuidx(env_cpu(env), (ARMMMUIdxBit_E10_0 | + ARMMMUIdxBit_E10_0_GCS | ARMMMUIdxBit_E20_0 | + ARMMMUIdxBit_E20_0_GCS | ARMMMUIdxBit_E10_1 | - ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E10_1_PAN | + ARMMMUIdxBit_E10_1_GCS | + ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E20_2_PAN | - ARMMMUIdxBit_E2)); + ARMMMUIdxBit_E20_2_GCS | + ARMMMUIdxBit_E2 | + ARMMMUIdxBit_E2_GCS)); } } @@ -2783,7 +2800,9 @@ static void vmsa_tcr_ttbr_el2_write(CPUARMState *env, const ARMCPRegInfo *ri, (arm_hcr_el2_eff(env) & HCR_E2H)) { uint16_t mask = ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E20_2_PAN | - ARMMMUIdxBit_E20_0; + ARMMMUIdxBit_E20_2_GCS | + ARMMMUIdxBit_E20_0 | + ARMMMUIdxBit_E20_0_GCS; tlb_flush_by_mmuidx(env_cpu(env), mask); } raw_write(env, ri, value); @@ -3407,15 +3426,71 @@ static void mdcr_el2_write(CPUARMState *env, const ARMCPRegInfo *ri, } } +static CPAccessResult access_nv1_with_nvx(uint64_t hcr_nv) +{ + return hcr_nv == (HCR_NV | HCR_NV1) ? CP_ACCESS_TRAP_EL2 : CP_ACCESS_OK; +} + static CPAccessResult access_nv1(CPUARMState *env, const ARMCPRegInfo *ri, bool isread) { if (arm_current_el(env) == 1) { - uint64_t hcr_nv = arm_hcr_el2_eff(env) & (HCR_NV | HCR_NV1 | HCR_NV2); + return access_nv1_with_nvx(arm_hcr_el2_nvx_eff(env)); + } + return CP_ACCESS_OK; +} + +static CPAccessResult access_nv1_or_exlock_el1(CPUARMState *env, + const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_current_el(env) == 1) { + uint64_t nvx = arm_hcr_el2_nvx_eff(env); - if (hcr_nv == (HCR_NV | HCR_NV1)) { - return CP_ACCESS_TRAP_EL2; + if (!isread && + (env->pstate & PSTATE_EXLOCK) && + (env->cp15.gcscr_el[1] & GCSCR_EXLOCKEN) && + !(nvx & HCR_NV1)) { + return CP_ACCESS_EXLOCK; } + return access_nv1_with_nvx(nvx); + } + + /* + * At EL2, since VHE redirection is done at translation time, + * el_is_in_host is always false here, so EXLOCK does not apply. 
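(With HCR_EL2.E2H set, an EL2 access to SPSR_EL1/ELR_EL1 has already been redirected to the corresponding _EL2 register by the vhe_redir_to_el2 machinery, and that register's own accessfn, access_exlock_el2, performs the EXLOCK check.)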
+ */ + return CP_ACCESS_OK; +} + +static CPAccessResult access_exlock_el2(CPUARMState *env, + const ARMCPRegInfo *ri, bool isread) +{ + int el = arm_current_el(env); + + if (el == 3) { + return CP_ACCESS_OK; + } + + /* + * Access to the EL2 register from EL1 means NV is set, and + * EXLOCK has priority over an NV1 trap to EL2. + */ + if (!isread && + (env->pstate & PSTATE_EXLOCK) && + (env->cp15.gcscr_el[el] & GCSCR_EXLOCKEN)) { + return CP_ACCESS_EXLOCK; + } + return CP_ACCESS_OK; +} + +static CPAccessResult access_exlock_el3(CPUARMState *env, + const ARMCPRegInfo *ri, bool isread) +{ + if (!isread && + (env->pstate & PSTATE_EXLOCK) && + (env->cp15.gcscr_el[3] & GCSCR_EXLOCKEN)) { + return CP_ACCESS_EXLOCK; } return CP_ACCESS_OK; } @@ -3591,7 +3666,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = { { .name = "ELR_EL1", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS, .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 0, .opc2 = 1, - .access = PL1_RW, .accessfn = access_nv1, + .access = PL1_RW, .accessfn = access_nv1_or_exlock_el1, .nv2_redirect_offset = 0x230 | NV2_REDIR_NV1, .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 4, 0, 1), .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 4, 0, 1), @@ -3599,7 +3674,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = { { .name = "SPSR_EL1", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS, .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 0, .opc2 = 0, - .access = PL1_RW, .accessfn = access_nv1, + .access = PL1_RW, .accessfn = access_nv1_or_exlock_el1, .nv2_redirect_offset = 0x160 | NV2_REDIR_NV1, .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 4, 0, 0), .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 4, 0, 0), @@ -3888,6 +3963,16 @@ uint64_t arm_hcr_el2_eff(CPUARMState *env) return arm_hcr_el2_eff_secstate(env, arm_security_space_below_el3(env)); } +uint64_t arm_hcr_el2_nvx_eff(CPUARMState *env) +{ + uint64_t hcr = arm_hcr_el2_eff(env); + + if (!(hcr & HCR_NV)) { + return 0; /* CONSTRAINED UNPREDICTABLE wrt NV1 */ + } + return hcr & (HCR_NV2 | HCR_NV1 | HCR_NV); +} + /* * Corresponds to ARM pseudocode function ELIsInHost(). */ @@ -3940,6 +4025,9 @@ static void hcrx_write(CPUARMState *env, const ARMCPRegInfo *ri, if (cpu_isar_feature(aa64_sctlr2, cpu)) { valid_mask |= HCRX_SCTLR2EN; } + if (cpu_isar_feature(aa64_gcs, cpu)) { + valid_mask |= HCRX_GCSEN; + } /* Clear RES0 bits. 
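Only bits whose controlling feature is implemented survive valid_mask, so writes to unimplemented HCRX_EL2 controls are discarded and read back as zero.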
*/ env->cp15.hcrx_el2 = value & valid_mask; @@ -4010,6 +4098,9 @@ uint64_t arm_hcrx_el2_eff(CPUARMState *env) if (cpu_isar_feature(aa64_sctlr2, cpu)) { hcrx |= HCRX_SCTLR2EN; } + if (cpu_isar_feature(aa64_gcs, cpu)) { + hcrx |= HCRX_GCSEN; + } return hcrx; } if (arm_feature(env, ARM_FEATURE_EL3) && !(env->cp15.scr_el3 & SCR_HXEN)) { @@ -4067,7 +4158,7 @@ static const ARMCPRegInfo el2_cp_reginfo[] = { { .name = "ELR_EL2", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS | ARM_CP_NV2_REDIRECT, .opc0 = 3, .opc1 = 4, .crn = 4, .crm = 0, .opc2 = 1, - .access = PL2_RW, + .access = PL2_RW, .accessfn = access_exlock_el2, .fieldoffset = offsetof(CPUARMState, elr_el[2]) }, { .name = "ESR_EL2", .state = ARM_CP_STATE_BOTH, .type = ARM_CP_NV2_REDIRECT, @@ -4085,7 +4176,7 @@ static const ARMCPRegInfo el2_cp_reginfo[] = { { .name = "SPSR_EL2", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS | ARM_CP_NV2_REDIRECT, .opc0 = 3, .opc1 = 4, .crn = 4, .crm = 0, .opc2 = 0, - .access = PL2_RW, + .access = PL2_RW, .accessfn = access_exlock_el2, .fieldoffset = offsetof(CPUARMState, banked_spsr[BANK_HYP]) }, { .name = "VBAR_EL2", .state = ARM_CP_STATE_BOTH, .opc0 = 3, .opc1 = 4, .crn = 12, .crm = 0, .opc2 = 0, @@ -4367,7 +4458,7 @@ static const ARMCPRegInfo el3_cp_reginfo[] = { { .name = "ELR_EL3", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS, .opc0 = 3, .opc1 = 6, .crn = 4, .crm = 0, .opc2 = 1, - .access = PL3_RW, + .access = PL3_RW, .accessfn = access_exlock_el3, .fieldoffset = offsetof(CPUARMState, elr_el[3]) }, { .name = "ESR_EL3", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 6, .crn = 5, .crm = 2, .opc2 = 0, @@ -4378,7 +4469,7 @@ static const ARMCPRegInfo el3_cp_reginfo[] = { { .name = "SPSR_EL3", .state = ARM_CP_STATE_AA64, .type = ARM_CP_ALIAS, .opc0 = 3, .opc1 = 6, .crn = 4, .crm = 0, .opc2 = 0, - .access = PL3_RW, + .access = PL3_RW, .accessfn = access_exlock_el3, .fieldoffset = offsetof(CPUARMState, banked_spsr[BANK_MON]) }, { .name = "VBAR_EL3", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 6, .crn = 12, .crm = 0, .opc2 = 0, @@ -5000,6 +5091,96 @@ static const ARMCPRegInfo nmi_reginfo[] = { .resetfn = arm_cp_reset_ignore }, }; +static CPAccessResult mecid_access(CPUARMState *env, + const ARMCPRegInfo *ri, bool isread) +{ + int el = arm_current_el(env); + + if (el == 2) { + if (arm_security_space(env) != ARMSS_Realm) { + return CP_ACCESS_UNDEFINED; + } + + if (!(env->cp15.scr_el3 & SCR_MECEN)) { + return CP_ACCESS_TRAP_EL3; + } + } + + return CP_ACCESS_OK; +} + +static void mecid_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + value = extract64(value, 0, MECID_WIDTH); + raw_write(env, ri, value); +} + +static CPAccessResult cipae_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + switch (arm_security_space(env)) { + case ARMSS_Root: /* EL3 */ + case ARMSS_Realm: /* Realm EL2 */ + return CP_ACCESS_OK; + default: + return CP_ACCESS_UNDEFINED; + } +} + +static const ARMCPRegInfo mec_reginfo[] = { + { .name = "MECIDR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 7, .crn = 10, .crm = 8, + .access = PL2_R, .type = ARM_CP_CONST | ARM_CP_NV_NO_TRAP, + .resetvalue = MECID_WIDTH - 1 }, + { .name = "MECID_P0_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 0, .crn = 10, .crm = 8, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.mecid_p0_el2) }, + { .name = "MECID_A0_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 1, .crn = 
10, .crm = 8, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.mecid_a0_el2) }, + { .name = "MECID_P1_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 2, .crn = 10, .crm = 8, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.mecid_p1_el2) }, + { .name = "MECID_A1_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 3, .crn = 10, .crm = 8, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.mecid_a1_el2) }, + { .name = "MECID_RL_A_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .opc2 = 1, .crn = 10, .crm = 10, + .access = PL3_RW, .accessfn = mecid_access, + .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.mecid_rl_a_el3) }, + { .name = "VMECID_P_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 0, .crn = 10, .crm = 9, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.vmecid_p_el2) }, + { .name = "VMECID_A_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 1, .crn = 10, .crm = 9, + .access = PL2_RW, .type = ARM_CP_NV_NO_TRAP, + .accessfn = mecid_access, .writefn = mecid_write, + .fieldoffset = offsetof(CPUARMState, cp15.vmecid_a_el2) }, + { .name = "DC_CIPAE", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 14, .opc2 = 0, + .access = PL2_W, .type = ARM_CP_NOP | ARM_CP_NV_NO_TRAP, + .accessfn = cipae_access }, +}; + +static const ARMCPRegInfo mec_mte_reginfo[] = { + { .name = "DC_CIGDPAE", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 14, .opc2 = 7, + .access = PL2_W, .type = ARM_CP_NOP | ARM_CP_NV_NO_TRAP, + .accessfn = cipae_access }, +}; + #ifndef CONFIG_USER_ONLY /* * We don't know until after realize whether there's a GICv3 @@ -5842,6 +6023,9 @@ static void sctlr2_el2_write(CPUARMState *env, const ARMCPRegInfo *ri, { uint64_t valid_mask = 0; + if (cpu_isar_feature(aa64_mec, env_archcpu(env))) { + valid_mask |= SCTLR2_EMEC; + } value &= valid_mask; raw_write(env, ri, value); } @@ -5851,6 +6035,9 @@ static void sctlr2_el3_write(CPUARMState *env, const ARMCPRegInfo *ri, { uint64_t valid_mask = 0; + if (cpu_isar_feature(aa64_mec, env_archcpu(env))) { + valid_mask |= SCTLR2_EMEC; + } value &= valid_mask; raw_write(env, ri, value); } @@ -5902,8 +6089,12 @@ static CPAccessResult tcr2_el1_access(CPUARMState *env, const ARMCPRegInfo *ri, static void tcr2_el1_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { + ARMCPU *cpu = env_archcpu(env); uint64_t valid_mask = 0; + if (cpu_isar_feature(aa64_s1pie, cpu)) { + valid_mask |= TCR2_PIE; + } value &= valid_mask; raw_write(env, ri, value); } @@ -5911,8 +6102,15 @@ static void tcr2_el1_write(CPUARMState *env, const ARMCPRegInfo *ri, static void tcr2_el2_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { + ARMCPU *cpu = env_archcpu(env); uint64_t valid_mask = 0; + if (cpu_isar_feature(aa64_s1pie, cpu)) { + valid_mask |= TCR2_PIE; + } + if (cpu_isar_feature(aa64_mec, env_archcpu(env))) { + valid_mask |= TCR2_AMEC0 | TCR2_AMEC1; + } value &= valid_mask; raw_write(env, ri, value); } @@ -5933,6 +6131,64 @@ static const ARMCPRegInfo tcr2_reginfo[] = { .fieldoffset = offsetof(CPUARMState, cp15.tcr2_el[2]) }, }; +static CPAccessResult 
pien_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_feature(env, ARM_FEATURE_EL3) + && !(env->cp15.scr_el3 & SCR_PIEN) + && arm_current_el(env) < 3) { + return CP_ACCESS_TRAP_EL3; + } + return CP_ACCESS_OK; +} + +static CPAccessResult pien_el1_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + CPAccessResult ret = access_tvm_trvm(env, ri, isread); + if (ret == CP_ACCESS_OK) { + ret = pien_access(env, ri, isread); + } + return ret; +} + +static const ARMCPRegInfo s1pie_reginfo[] = { + { .name = "PIR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .opc2 = 3, .crn = 10, .crm = 2, + .access = PL1_RW, .accessfn = pien_el1_access, + .fgt = FGT_NPIR_EL1, .nv2_redirect_offset = 0x2a0 | NV2_REDIR_NV1, + .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 10, 2, 3), + .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 10, 2, 3), + .fieldoffset = offsetof(CPUARMState, cp15.pir_el[1]) }, + { .name = "PIR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 3, .crn = 10, .crm = 2, + .access = PL2_RW, .accessfn = pien_access, + .fieldoffset = offsetof(CPUARMState, cp15.pir_el[2]) }, + { .name = "PIR_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .opc2 = 3, .crn = 10, .crm = 2, + .access = PL3_RW, + .fieldoffset = offsetof(CPUARMState, cp15.pir_el[3]) }, + { .name = "PIRE0_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 10, .crm = 2, + .access = PL1_RW, .accessfn = pien_el1_access, + .fgt = FGT_NPIRE0_EL1, .nv2_redirect_offset = 0x290 | NV2_REDIR_NV1, + .vhe_redir_to_el2 = ENCODE_AA64_CP_REG(3, 4, 10, 2, 2), + .vhe_redir_to_el01 = ENCODE_AA64_CP_REG(3, 5, 10, 2, 2), + .fieldoffset = offsetof(CPUARMState, cp15.pir_el[0]) }, + { .name = "PIRE0_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 2, .crn = 10, .crm = 2, + .access = PL2_RW, .accessfn = pien_access, + .fieldoffset = offsetof(CPUARMState, cp15.pire0_el2) }, +}; + +static const ARMCPRegInfo s2pie_reginfo[] = { + { .name = "S2PIR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .opc2 = 5, .crn = 10, .crm = 2, + .access = PL2_RW, .accessfn = pien_access, + .nv2_redirect_offset = 0x2b0, + .fieldoffset = offsetof(CPUARMState, cp15.s2pir_el2) }, +}; + void register_cp_regs_for_features(ARMCPU *cpu) { /* Register all the coprocessor registers based on feature bits */ @@ -7165,6 +7421,19 @@ void register_cp_regs_for_features(ARMCPU *cpu) define_arm_cp_regs(cpu, tcr2_reginfo); } + if (cpu_isar_feature(aa64_s1pie, cpu)) { + define_arm_cp_regs(cpu, s1pie_reginfo); + } + if (cpu_isar_feature(aa64_s2pie, cpu)) { + define_arm_cp_regs(cpu, s2pie_reginfo); + } + if (cpu_isar_feature(aa64_mec, cpu)) { + define_arm_cp_regs(cpu, mec_reginfo); + if (cpu_isar_feature(aa64_mte, cpu)) { + define_arm_cp_regs(cpu, mec_mte_reginfo); + } + } + if (cpu_isar_feature(any_predinv, cpu)) { define_arm_cp_regs(cpu, predinv_reginfo); } @@ -7174,6 +7443,7 @@ void register_cp_regs_for_features(ARMCPU *cpu) } define_pm_cpregs(cpu); + define_gcs_cpregs(cpu); } /* @@ -8800,7 +9070,7 @@ static int aarch64_regnum(CPUARMState *env, int aarch32_reg) } } -static uint32_t cpsr_read_for_spsr_elx(CPUARMState *env) +uint32_t cpsr_read_for_spsr_elx(CPUARMState *env) { uint32_t ret = cpsr_read(env); @@ -8815,6 +9085,24 @@ static uint32_t cpsr_read_for_spsr_elx(CPUARMState *env) return ret; } +void cpsr_write_from_spsr_elx(CPUARMState *env, uint32_t val) +{ + uint32_t mask; + + /* Save SPSR_ELx.SS into PSTATE. 
*/ + env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); + val &= ~PSTATE_SS; + + /* Move DIT to the correct location for CPSR */ + if (val & PSTATE_DIT) { + val &= ~PSTATE_DIT; + val |= CPSR_DIT; + } + + mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteRaw); +} + static bool syndrome_is_sync_extabt(uint32_t syndrome) { /* Return true if this syndrome value is a synchronous external abort */ @@ -8847,8 +9135,8 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) CPUARMState *env = &cpu->env; unsigned int new_el = env->exception.target_el; vaddr addr = env->cp15.vbar_el[new_el]; - unsigned int new_mode = aarch64_pstate_mode(new_el, true); - unsigned int old_mode; + uint64_t new_mode = aarch64_pstate_mode(new_el, true); + uint64_t old_mode; unsigned int cur_el = arm_current_el(env); int rt; @@ -8891,8 +9179,13 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) } else { addr += 0x600; } - } else if (pstate_read(env) & PSTATE_SP) { - addr += 0x200; + } else { + if (pstate_read(env) & PSTATE_SP) { + addr += 0x200; + } + if (is_a64(env) && (env->cp15.gcscr_el[new_el] & GCSCR_EXLOCKEN)) { + new_mode |= PSTATE_EXLOCK; + } } switch (cs->exception_index) { @@ -8996,7 +9289,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) * If NV2 is disabled, change SPSR when NV,NV1 == 1,0 (I_ZJRNN) * If NV2 is enabled, change SPSR when NV is 1 (I_DBTLM) */ - old_mode = deposit32(old_mode, 2, 2, 2); + old_mode = deposit64(old_mode, 2, 2, 2); } } } else { @@ -9009,7 +9302,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) } env->banked_spsr[aarch64_banked_spsr_index(new_el)] = old_mode; - qemu_log_mask(CPU_LOG_INT, "...with SPSR 0x%x\n", old_mode); + qemu_log_mask(CPU_LOG_INT, "...with SPSR 0x%" PRIx64 "\n", old_mode); qemu_log_mask(CPU_LOG_INT, "...with ELR 0x%" PRIx64 "\n", env->elr_el[new_el]); @@ -9063,7 +9356,8 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) env->pc = addr; - qemu_log_mask(CPU_LOG_INT, "...to EL%d PC 0x%" PRIx64 " PSTATE 0x%x\n", + qemu_log_mask(CPU_LOG_INT, "...to EL%d PC 0x%" PRIx64 + " PSTATE 0x%" PRIx64 "\n", new_el, env->pc, pstate_read(env)); } @@ -9119,7 +9413,7 @@ void arm_cpu_do_interrupt(CPUState *cs) new_el); if (qemu_loglevel_mask(CPU_LOG_INT) && !excp_is_internal(cs->exception_index)) { - qemu_log_mask(CPU_LOG_INT, "...with ESR 0x%x/0x%" PRIx32 "\n", + qemu_log_mask(CPU_LOG_INT, "...with ESR 0x%x/0x%" PRIx64 "\n", syn_get_ec(env->exception.syndrome), env->exception.syndrome); } @@ -9309,21 +9603,34 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, bool el1_is_aa32) { uint64_t tcr = regime_tcr(env, mmu_idx); - bool epd, hpd, tsz_oob, ds, ha, hd; + bool epd, hpd, tsz_oob, ds, ha, hd, pie = false; int select, tsz, tbi, max_tsz, min_tsz, ps, sh; ARMGranuleSize gran; ARMCPU *cpu = env_archcpu(env); bool stage2 = regime_is_stage2(mmu_idx); + int r_el = regime_el(mmu_idx); if (!regime_has_2_ranges(mmu_idx)) { select = 0; tsz = extract32(tcr, 0, 6); gran = tg0_to_gran_size(extract32(tcr, 14, 2)); if (stage2) { - /* VTCR_EL2 */ - hpd = false; + /* + * Stage2 does not have hierarchical permissions. + * Thus disabling them makes things easier during ptw. 
+ */ + hpd = true; + pie = extract64(tcr, 36, 1) && cpu_isar_feature(aa64_s2pie, cpu); } else { hpd = extract32(tcr, 24, 1); + if (r_el == 3) { + pie = (extract64(tcr, 35, 1) + && cpu_isar_feature(aa64_s1pie, cpu)); + } else { + pie = ((env->cp15.tcr2_el[2] & TCR2_PIE) + && (!arm_feature(env, ARM_FEATURE_EL3) + || (env->cp15.scr_el3 & SCR_TCR2EN))); + } } epd = false; sh = extract32(tcr, 12, 2); @@ -9360,10 +9667,16 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, ds = extract64(tcr, 59, 1); if (e0pd && cpu_isar_feature(aa64_e0pd, cpu) && - regime_is_user(env, mmu_idx)) { + regime_is_user(mmu_idx)) { epd = true; } + + pie = ((env->cp15.tcr2_el[r_el] & TCR2_PIE) + && (!arm_feature(env, ARM_FEATURE_EL3) + || (env->cp15.scr_el3 & SCR_TCR2EN)) + && (r_el == 2 || (arm_hcrx_el2_eff(env) & HCRX_TCR2EN))); } + hpd |= pie; gran = sanitize_gran_size(cpu, gran, stage2); @@ -9442,6 +9755,7 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, .ha = ha, .hd = ha && hd, .gran = gran, + .pie = pie, }; } @@ -9556,33 +9870,6 @@ int fp_exception_el(CPUARMState *env, int cur_el) return 0; } -/* Return the exception level we're running at if this is our mmu_idx */ -int arm_mmu_idx_to_el(ARMMMUIdx mmu_idx) -{ - if (mmu_idx & ARM_MMU_IDX_M) { - return mmu_idx & ARM_MMU_IDX_M_PRIV; - } - - switch (mmu_idx) { - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E20_0: - case ARMMMUIdx_E30_0: - return 0; - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - return 1; - case ARMMMUIdx_E2: - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - return 2; - case ARMMMUIdx_E3: - case ARMMMUIdx_E30_3_PAN: - return 3; - default: - g_assert_not_reached(); - } -} - #ifndef CONFIG_TCG ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) { diff --git a/target/arm/internals.h b/target/arm/internals.h index 1d958db..f539bbe 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -34,6 +34,7 @@ #include "system/memory.h" #include "syndrome.h" #include "cpu-features.h" +#include "mmuidx-internal.h" /* register banks for CPU modes */ #define BANK_USRSYS 0 @@ -250,6 +251,7 @@ FIELD(VSTCR, SA, 30, 1) #define HCRX_MSCEN (1ULL << 11) #define HCRX_TCR2EN (1ULL << 14) #define HCRX_SCTLR2EN (1ULL << 15) +#define HCRX_GCSEN (1ULL << 22) #define HPFAR_NS (1ULL << 63) @@ -304,14 +306,14 @@ FIELD(CNTHCTL, CNTPMASK, 19, 1) * and never returns because we will longjump back up to the CPU main loop. */ G_NORETURN void raise_exception(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el); + uint64_t syndrome, uint32_t target_el); /* * Similarly, but also use unwinding to restore cpu state. */ G_NORETURN void raise_exception_ra(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el, - uintptr_t ra); + uint64_t syndrome, uint32_t target_el, + uintptr_t ra); /* * For AArch64, map a given EL to an index in the banked_spsr array. @@ -752,6 +754,7 @@ struct ARMMMUFaultInfo { bool s1ptw; bool s1ns; bool ea; + bool dirtybit; /* FEAT_S1PIE, FEAT_S2PIE */ }; /** @@ -983,8 +986,6 @@ static inline ARMMMUIdx core_to_aa64_mmu_idx(int mmu_idx) return mmu_idx | ARM_MMU_IDX_A; } -int arm_mmu_idx_to_el(ARMMMUIdx mmu_idx); - /* Return the MMU index for a v7M CPU in the specified security state */ ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate); @@ -1027,108 +1028,10 @@ static inline void arm_call_el_change_hook(ARMCPU *cpu) } } -/* - * Return true if this address translation regime has two ranges. 
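As an aside on the TCR2 gating in the aa64_va_parameters() hunk above: S1PIE is only effective when every level of the hierarchy has enabled TCR2 traffic. A minimal compile-only sketch of that gating, assuming placeholder bit positions for TCR2_PIE and SCR_TCR2EN; only HCRX_TCR2EN (1ULL << 14) is taken from this patch:

#include <stdbool.h>
#include <stdint.h>

#define TCR2_PIE    (1ULL << 1)   /* placeholder position, assumption */
#define SCR_TCR2EN  (1ULL << 43)  /* placeholder position, assumption */
#define HCRX_TCR2EN (1ULL << 14)  /* matches the internals.h hunk above */

static bool s1pie_effective(uint64_t tcr2, uint64_t scr_el3, uint64_t hcrx,
                            bool have_el3, int regime_el)
{
    if (!(tcr2 & TCR2_PIE)) {
        return false;                   /* not requested by TCR2_ELx */
    }
    if (have_el3 && !(scr_el3 & SCR_TCR2EN)) {
        return false;                   /* EL3 gates all TCR2 usage */
    }
    /* The EL1&0 regime additionally needs the EL2 enable; EL2&0 does not. */
    return regime_el == 2 || (hcrx & HCRX_TCR2EN);
}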
- * Note that this will not return the correct answer for AArch32 - * Secure PL1&0 (i.e. mmu indexes E3, E30_0, E30_3_PAN), but it is - * never called from a context where EL3 can be AArch32. (The - * correct return value for ARMMMUIdx_E3 would be different for - * that case, so we can't just make the function return the - * correct value anyway; we would need an extra "bool e3_is_aarch32" - * argument which all the current callsites would pass as 'false'.) - */ -static inline bool regime_has_2_ranges(ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_Stage1_E1: - case ARMMMUIdx_Stage1_E1_PAN: - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - case ARMMMUIdx_E20_0: - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - return true; - default: - return false; - } -} - -static inline bool regime_is_pan(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_Stage1_E1_PAN: - case ARMMMUIdx_E10_1_PAN: - case ARMMMUIdx_E20_2_PAN: - case ARMMMUIdx_E30_3_PAN: - return true; - default: - return false; - } -} - -static inline bool regime_is_stage2(ARMMMUIdx mmu_idx) -{ - return mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S; -} - -/* Return the exception level which controls this address translation regime */ -static inline uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_E20_0: - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - case ARMMMUIdx_Stage2: - case ARMMMUIdx_Stage2_S: - case ARMMMUIdx_E2: - return 2; - case ARMMMUIdx_E3: - case ARMMMUIdx_E30_0: - case ARMMMUIdx_E30_3_PAN: - return 3; - case ARMMMUIdx_E10_0: - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_Stage1_E1: - case ARMMMUIdx_Stage1_E1_PAN: - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - case ARMMMUIdx_MPrivNegPri: - case ARMMMUIdx_MUserNegPri: - case ARMMMUIdx_MPriv: - case ARMMMUIdx_MUser: - case ARMMMUIdx_MSPrivNegPri: - case ARMMMUIdx_MSUserNegPri: - case ARMMMUIdx_MSPriv: - case ARMMMUIdx_MSUser: - return 1; - default: - g_assert_not_reached(); - } -} - -static inline bool regime_is_user(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E20_0: - case ARMMMUIdx_E30_0: - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_MUser: - case ARMMMUIdx_MSUser: - case ARMMMUIdx_MUserNegPri: - case ARMMMUIdx_MSUserNegPri: - return true; - default: - return false; - } -} - /* Return the SCTLR value which controls this address translation regime */ static inline uint64_t regime_sctlr(CPUARMState *env, ARMMMUIdx mmu_idx) { - return env->cp15.sctlr_el[regime_el(env, mmu_idx)]; + return env->cp15.sctlr_el[regime_el(mmu_idx)]; } /* @@ -1160,13 +1063,13 @@ static inline uint64_t regime_tcr(CPUARMState *env, ARMMMUIdx mmu_idx) v |= env->cp15.vtcr_el2 & VTCR_SHARED_FIELD_MASK; return v; } - return env->cp15.tcr_el[regime_el(env, mmu_idx)]; + return env->cp15.tcr_el[regime_el(mmu_idx)]; } /* Return true if the translation regime is using LPAE format page tables */ static inline bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) { - int el = regime_el(env, mmu_idx); + int el = regime_el(mmu_idx); if (el == 2 || arm_el_is_aa64(env, el)) { return true; } @@ -1378,25 +1281,6 @@ ARMMMUIdx stage_1_mmu_idx(ARMMMUIdx mmu_idx); ARMMMUIdx arm_stage1_mmu_idx(CPUARMState *env); #endif -/** - * arm_mmu_idx_is_stage1_of_2: - * @mmu_idx: The ARMMMUIdx to test - * - * Return true if @mmu_idx is a NOTLB mmu_idx that is the - * first stage of a two stage 
regime. + */ -static inline bool arm_mmu_idx_is_stage1_of_2(ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_Stage1_E1: - case ARMMMUIdx_Stage1_E1_PAN: - return true; - default: - return false; - } -} - static inline uint32_t aarch32_cpsr_valid_mask(uint64_t features, const ARMISARegisters *id) { @@ -1491,7 +1375,7 @@ static inline int arm_granule_bits(ARMGranuleSize gran) /* * Parameters of a given virtual address, as extracted from the - * translation control register (TCR) for a given regime. + * translation controls for a given regime. */ typedef struct ARMVAParameters { unsigned tsz : 8; @@ -1506,6 +1390,7 @@ typedef struct ARMVAParameters { bool ha : 1; bool hd : 1; ARMGranuleSize gran : 2; + bool pie : 1; } ARMVAParameters; /** @@ -1576,6 +1461,13 @@ typedef struct ARMCacheAttrs { typedef struct GetPhysAddrResult { CPUTLBEntryFull f; ARMCacheAttrs cacheattrs; + /* + * For ARMMMUIdx_Stage2*, the protection installed into f.prot + * is the result for AccessType_TTW, i.e. the page table walk itself. + * The protection installed into s2prot is the one to be merged + * with the stage1 protection. + */ + int s2prot; } GetPhysAddrResult; /** @@ -1892,6 +1784,8 @@ void define_tlb_insn_regs(ARMCPU *cpu); void define_at_insn_regs(ARMCPU *cpu); /* Add the cpreg definitions for PM cpregs */ void define_pm_cpregs(ARMCPU *cpu); +/* Add the cpreg definitions for GCS cpregs */ +void define_gcs_cpregs(ARMCPU *cpu); /* Effective value of MDCR_EL2 */ static inline uint64_t arm_mdcr_el2_eff(CPUARMState *env) @@ -2003,8 +1897,13 @@ void vfp_clear_float_status_exc_flags(CPUARMState *env); */ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask); bool arm_pan_enabled(CPUARMState *env); +uint32_t cpsr_read_for_spsr_elx(CPUARMState *env); +void cpsr_write_from_spsr_elx(CPUARMState *env, uint32_t val); /* Compare uint64_t for qsort and bsearch. */ int compare_u64(const void *a, const void *b); +/* Used in FEAT_MEC to set the MECIDWidthm1 field in the MECIDR_EL2 register.
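For reference, a tiny sketch of how a MECID_WIDTH of 16 would surface as the width-minus-one encoding the comment describes; the field offset (bits [3:0] of MECIDR_EL2) is an assumption of this sketch, not stated by the patch:

#include <assert.h>
#include <stdint.h>

#define MECID_WIDTH 16

/* MECIDWidthm1 assumed to sit in MECIDR_EL2[3:0] for illustration. */
static uint64_t mecidr_el2_value(void)
{
    return (uint64_t)(MECID_WIDTH - 1) & 0xf;
}

int main(void)
{
    assert(mecidr_el2_value() == 15);   /* a 16-bit MECID reads back as 15 */
    return 0;
}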
*/ +#define MECID_WIDTH 16 + #endif diff --git a/target/arm/machine.c b/target/arm/machine.c index 6666a0c..44a0cf8 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -816,6 +816,80 @@ static const VMStateInfo vmstate_cpsr = { .put = put_cpsr, }; +static int get_pstate64(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field) +{ + ARMCPU *cpu = opaque; + CPUARMState *env = &cpu->env; + uint64_t val = qemu_get_be64(f); + + env->aarch64 = ((val & PSTATE_nRW) == 0); + if (is_a64(env)) { + pstate_write(env, val); + } else { + cpsr_write_from_spsr_elx(env, val); + } + return 0; +} + +static int put_pstate64(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + ARMCPU *cpu = opaque; + CPUARMState *env = &cpu->env; + uint64_t val; + + if (is_a64(env)) { + val = pstate_read(env); + } else { + val = cpsr_read_for_spsr_elx(env); + } + qemu_put_be64(f, val); + return 0; +} + +static bool pstate64_needed(void *opaque) +{ + ARMCPU *cpu = opaque; + CPUARMState *env = &cpu->env; + uint64_t val; + + if (arm_feature(env, ARM_FEATURE_M)) { + return false; + } + if (is_a64(env)) { + val = pstate_read(env); + } else { + val = cpsr_read_for_spsr_elx(env); + if (val & PSTATE_SS) { + return true; + } + } + return val > UINT32_MAX; +} + +static const VMStateDescription vmstate_pstate64 = { + .name = "cpu/pstate64", + .version_id = 1, + .minimum_version_id = 1, + .needed = pstate64_needed, + .fields = (const VMStateField[]) { + { + .name = "pstate64", + .version_id = 0, + .size = sizeof(uint64_t), + .info = &(const VMStateInfo) { + .name = "pstate64", + .get = get_pstate64, + .put = put_pstate64, + }, + .flags = VMS_SINGLE, + .offset = 0, + }, + VMSTATE_END_OF_LIST() + }, +}; + static int get_power(QEMUFile *f, void *opaque, size_t size, const VMStateField *field) { @@ -848,6 +922,23 @@ static const VMStateInfo vmstate_powered_off = { .put = put_power, }; +static bool syndrome64_needed(void *opaque) +{ + ARMCPU *cpu = opaque; + return cpu->env.exception.syndrome > UINT32_MAX; +} + +static const VMStateDescription vmstate_syndrome64 = { + .name = "cpu/syndrome64", + .version_id = 1, + .minimum_version_id = 1, + .needed = syndrome64_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(env.exception.syndrome, ARMCPU), + VMSTATE_END_OF_LIST() + }, +}; + static int cpu_pre_save(void *opaque) { ARMCPU *cpu = opaque; @@ -1035,6 +1126,12 @@ const VMStateDescription vmstate_arm_cpu = { VMSTATE_UINT32_ARRAY(env.regs, ARMCPU, 16), VMSTATE_UINT64_ARRAY(env.xregs, ARMCPU, 32), VMSTATE_UINT64(env.pc, ARMCPU), + /* + * If any bits are set in the upper 32 bits of cpsr/pstate, + * or if the cpu is in aa32 mode and PSTATE.SS is set, then + * the cpu/pstate64 subsection will override this with the + * full 64 bit state. + */ { .name = "cpsr", .version_id = 0, @@ -1065,7 +1162,19 @@ const VMStateDescription vmstate_arm_cpu = { VMSTATE_UINT64(env.exclusive_val, ARMCPU), VMSTATE_UINT64(env.exclusive_high, ARMCPU), VMSTATE_UNUSED(sizeof(uint64_t)), - VMSTATE_UINT32(env.exception.syndrome, ARMCPU), + /* + * If any bits are set in the upper 32 bits of syndrome, + * then the cpu/syndrome64 subsection will override this + * with the full 64 bit state. 
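A standalone sketch of the migration decision implemented by pstate64_needed() and get_pstate64() above: the base 'cpsr' field stays 32-bit on the wire, and the 64-bit subsection travels only when it carries extra information. The PSTATE bit positions here are assumptions; the decision logic mirrors the hunk:

#include <stdbool.h>
#include <stdint.h>

#define PSTATE_SS  (1ULL << 21)  /* assumed position */
#define PSTATE_nRW (1ULL << 4)   /* assumed position; set means aa32 */

static bool pstate64_subsection_needed(uint64_t val)
{
    bool aa64 = !(val & PSTATE_nRW);

    /* aa32 with PSTATE.SS set has no CPSR encoding: force the 64-bit form */
    if (!aa64 && (val & PSTATE_SS)) {
        return true;
    }
    /* otherwise only needed when the value does not fit in 32 bits */
    return val > UINT32_MAX;
}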
+ */ + { + .name = "env.exception.syndrome", + .version_id = 0, + .size = sizeof(uint32_t), + .info = &vmstate_info_uint32, + .flags = VMS_SINGLE, + .offset = offsetoflow32(ARMCPU, env.exception.syndrome), + }, VMSTATE_UINT32(env.exception.fsr, ARMCPU), VMSTATE_UINT64(env.exception.vaddress, ARMCPU), VMSTATE_TIMER_PTR(gt_timer[GTIMER_PHYS], ARMCPU), @@ -1098,6 +1207,8 @@ const VMStateDescription vmstate_arm_cpu = { &vmstate_serror, &vmstate_irq_line_state, &vmstate_wfxt_timer, + &vmstate_syndrome64, + &vmstate_pstate64, NULL } }; diff --git a/target/arm/meson.build b/target/arm/meson.build index 638ee62..3df7e03 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -6,7 +6,12 @@ arm_ss.add(files( arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', - 'gdbstub64.c')) + 'gdbstub64.c' +)) + +arm_common_ss.add(files( + 'mmuidx.c', +)) arm_system_ss = ss.source_set() arm_common_system_ss = ss.source_set() @@ -22,6 +27,7 @@ arm_user_ss.add(when: 'TARGET_AARCH64', if_false: files( 'cpu32-stubs.c', )) arm_user_ss.add(files( + 'cpregs-gcs.c', 'cpregs-pmu.c', 'debug_helper.c', 'helper.c', @@ -42,6 +48,7 @@ arm_common_system_ss.add(files( 'arch_dump.c', 'arm-powerctl.c', 'cortex-regs.c', + 'cpregs-gcs.c', 'cpregs-pmu.c', 'cpu-irq.c', 'debug_helper.c', diff --git a/target/arm/mmuidx-internal.h b/target/arm/mmuidx-internal.h new file mode 100644 index 0000000..962b053 --- /dev/null +++ b/target/arm/mmuidx-internal.h @@ -0,0 +1,113 @@ +/* + * QEMU Arm software mmu index internal definitions + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef TARGET_ARM_MMUIDX_INTERNAL_H +#define TARGET_ARM_MMUIDX_INTERNAL_H + +#include "mmuidx.h" +#include "tcg/debug-assert.h" +#include "hw/registerfields.h" + + +FIELD(MMUIDXINFO, EL, 0, 2) +FIELD(MMUIDXINFO, ELVALID, 2, 1) +FIELD(MMUIDXINFO, REL, 3, 2) +FIELD(MMUIDXINFO, RELVALID, 5, 1) +FIELD(MMUIDXINFO, 2RANGES, 6, 1) +FIELD(MMUIDXINFO, PAN, 7, 1) +FIELD(MMUIDXINFO, USER, 8, 1) +FIELD(MMUIDXINFO, STAGE1, 9, 1) +FIELD(MMUIDXINFO, STAGE2, 10, 1) +FIELD(MMUIDXINFO, GCS, 11, 1) +FIELD(MMUIDXINFO, TG, 12, 5) + +extern const uint32_t arm_mmuidx_table[ARM_MMU_IDX_M + 8]; + +#define arm_mmuidx_is_valid(x) ((unsigned)(x) < ARRAY_SIZE(arm_mmuidx_table)) + +/* Return the exception level associated with this mmu index. */ +static inline int arm_mmu_idx_to_el(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + tcg_debug_assert(FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, ELVALID)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, EL); +} + +/* + * Return the exception level for the address translation regime + * associated with this mmu index. + */ +static inline uint32_t regime_el(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + tcg_debug_assert(FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, RELVALID)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, REL); +} + +/* + * Return true if this address translation regime has two ranges. + * Note that this will not return the correct answer for AArch32 + * Secure PL1&0 (i.e. mmu indexes E3, E30_0, E30_3_PAN), but it is + * never called from a context where EL3 can be AArch32. (The + * correct return value for ARMMMUIdx_E3 would be different for + * that case, so we can't just make the function return the + * correct value anyway; we would need an extra "bool e3_is_aarch32" + * argument which all the current callsites would pass as 'false'.) 
+ */ +static inline bool regime_has_2_ranges(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, 2RANGES); +} + +/* Return true if Privileged Access Never is enabled for this mmu index. */ +static inline bool regime_is_pan(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, PAN); +} + +/* + * Return true if the exception level associated with this mmu index is 0. + * Differs from arm_mmu_idx_to_el(idx) == 0 in that this allows querying + * Stage1 and Stage2 mmu indexes. + */ +static inline bool regime_is_user(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, USER); +} + +/* Return true if this mmu index is stage 1 of a 2-stage translation. */ +static inline bool arm_mmu_idx_is_stage1_of_2(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, STAGE1); +} + +/* Return true if this mmu index is stage 2 of a 2-stage translation. */ +static inline bool regime_is_stage2(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, STAGE2); +} + +/* Return true if this mmu index implies AccessType_GCS. */ +static inline bool regime_is_gcs(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + return FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, GCS); +} + +/* Return the GCS MMUIdx for a given regime. */ +static inline ARMMMUIdx regime_to_gcs(ARMMMUIdx idx) +{ + tcg_debug_assert(arm_mmuidx_is_valid(idx)); + uint32_t core = FIELD_EX32(arm_mmuidx_table[idx], MMUIDXINFO, TG); + tcg_debug_assert(core != 0); /* core 0 is E10_0, not a GCS index */ + return core | ARM_MMU_IDX_A; +} + +#endif /* TARGET_ARM_MMUIDX_INTERNAL_H */ diff --git a/target/arm/mmuidx.c b/target/arm/mmuidx.c new file mode 100644 index 0000000..a4663c8 --- /dev/null +++ b/target/arm/mmuidx.c @@ -0,0 +1,66 @@ +/* + * QEMU Arm software mmu index definitions + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "mmuidx-internal.h" + + +#define EL(X) ((X << R_MMUIDXINFO_EL_SHIFT) | R_MMUIDXINFO_ELVALID_MASK | \ + ((X == 0) << R_MMUIDXINFO_USER_SHIFT)) +#define REL(X) ((X << R_MMUIDXINFO_REL_SHIFT) | R_MMUIDXINFO_RELVALID_MASK) +#define R2 R_MMUIDXINFO_2RANGES_MASK +#define PAN R_MMUIDXINFO_PAN_MASK +#define USER R_MMUIDXINFO_USER_MASK +#define S1 R_MMUIDXINFO_STAGE1_MASK +#define S2 R_MMUIDXINFO_STAGE2_MASK +#define GCS R_MMUIDXINFO_GCS_MASK +#define TG(X) \ + ((ARMMMUIdx_##X##_GCS & ARM_MMU_IDX_COREIDX_MASK) << R_MMUIDXINFO_TG_SHIFT) + +const uint32_t arm_mmuidx_table[ARM_MMU_IDX_M + 8] = { + /* + * A-profile. 
+ */ + [ARMMMUIdx_E10_0] = EL(0) | REL(1) | R2 | TG(E10_0), + [ARMMMUIdx_E10_0_GCS] = EL(0) | REL(1) | R2 | GCS, + [ARMMMUIdx_E10_1] = EL(1) | REL(1) | R2 | TG(E10_1), + [ARMMMUIdx_E10_1_PAN] = EL(1) | REL(1) | R2 | TG(E10_1) | PAN, + [ARMMMUIdx_E10_1_GCS] = EL(1) | REL(1) | R2 | GCS, + + [ARMMMUIdx_E20_0] = EL(0) | REL(2) | R2 | TG(E20_0), + [ARMMMUIdx_E20_0_GCS] = EL(0) | REL(2) | R2 | GCS, + [ARMMMUIdx_E20_2] = EL(2) | REL(2) | R2 | TG(E20_2), + [ARMMMUIdx_E20_2_PAN] = EL(2) | REL(2) | R2 | TG(E20_2) | PAN, + [ARMMMUIdx_E20_2_GCS] = EL(2) | REL(2) | R2 | GCS, + + [ARMMMUIdx_E2] = EL(2) | REL(2) | TG(E2), + [ARMMMUIdx_E2_GCS] = EL(2) | REL(2) | GCS, + + [ARMMMUIdx_E3] = EL(3) | REL(3) | TG(E3), + [ARMMMUIdx_E3_GCS] = EL(3) | REL(3) | GCS, + [ARMMMUIdx_E30_0] = EL(0) | REL(3), + [ARMMMUIdx_E30_3_PAN] = EL(3) | REL(3) | PAN, + + [ARMMMUIdx_Stage2_S] = REL(2) | S2, + [ARMMMUIdx_Stage2] = REL(2) | S2, + + [ARMMMUIdx_Stage1_E0] = REL(1) | R2 | S1 | USER | TG(Stage1_E0), + [ARMMMUIdx_Stage1_E0_GCS] = REL(1) | R2 | S1 | USER | GCS, + [ARMMMUIdx_Stage1_E1] = REL(1) | R2 | S1 | TG(Stage1_E1), + [ARMMMUIdx_Stage1_E1_PAN] = REL(1) | R2 | S1 | TG(Stage1_E1) | PAN, + [ARMMMUIdx_Stage1_E1_GCS] = REL(1) | R2 | S1 | GCS, + + /* + * M-profile. + */ + [ARMMMUIdx_MUser] = EL(0) | REL(1), + [ARMMMUIdx_MPriv] = EL(1) | REL(1), + [ARMMMUIdx_MUserNegPri] = EL(0) | REL(1), + [ARMMMUIdx_MPrivNegPri] = EL(1) | REL(1), + [ARMMMUIdx_MSUser] = EL(0) | REL(1), + [ARMMMUIdx_MSPriv] = EL(1) | REL(1), + [ARMMMUIdx_MSUserNegPri] = EL(0) | REL(1), + [ARMMMUIdx_MSPrivNegPri] = EL(1) | REL(1), +}; diff --git a/target/arm/mmuidx.h b/target/arm/mmuidx.h new file mode 100644 index 0000000..8d8d273 --- /dev/null +++ b/target/arm/mmuidx.h @@ -0,0 +1,241 @@ +/* + * QEMU Arm software mmu index definitions + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef TARGET_ARM_MMUIDX_H +#define TARGET_ARM_MMUIDX_H + +/* + * Arm has the following "translation regimes" (as the Arm ARM calls them): + * + * If EL3 is 64-bit: + * + NonSecure EL1 & 0 stage 1 + * + NonSecure EL1 & 0 stage 2 + * + NonSecure EL2 + * + NonSecure EL2 & 0 (ARMv8.1-VHE) + * + Secure EL1 & 0 stage 1 + * + Secure EL1 & 0 stage 2 (FEAT_SEL2) + * + Secure EL2 (FEAT_SEL2) + * + Secure EL2 & 0 (FEAT_SEL2) + * + Realm EL1 & 0 stage 1 (FEAT_RME) + * + Realm EL1 & 0 stage 2 (FEAT_RME) + * + Realm EL2 (FEAT_RME) + * + EL3 + * If EL3 is 32-bit: + * + NonSecure PL1 & 0 stage 1 + * + NonSecure PL1 & 0 stage 2 + * + NonSecure PL2 + * + Secure PL1 & 0 + * (reminder: for 32 bit EL3, Secure PL1 is *EL3*, not EL1.) + * + * For QEMU, an mmu_idx is not quite the same as a translation regime because: + * 1. we need to split the "EL1 & 0" and "EL2 & 0" regimes into two mmu_idxes, + * because they may differ in access permissions even if the VA->PA map is + * the same + * 2. we want to cache in our TLB the full VA->IPA->PA lookup for a stage 1+2 + * translation, which means that we have one mmu_idx that deals with two + * concatenated translation regimes [this sort of combined s1+2 TLB is + * architecturally permitted] + * 3. we don't need to allocate an mmu_idx to translations that we won't be + * handling via the TLB. The only way to do a stage 1 translation without + * the immediate stage 2 translation is via the ATS or AT system insns, + * which can be slow-pathed and always do a page table walk. 
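A compile-and-run sketch of the packed-word scheme behind arm_mmuidx_table[]: plain shifts stand in for the registerfields macros, with the layout taken from the FIELD() declarations in mmuidx-internal.h above (the USER bit that EL(0) also sets is omitted for brevity):

#include <assert.h>
#include <stdint.h>

/* EL[1:0], ELVALID[2], REL[4:3], RELVALID[5], 2RANGES[6], PAN[7] */
#define INFO_EL(x)  ((uint32_t)(x) << 0 | 1u << 2)
#define INFO_REL(x) ((uint32_t)(x) << 3 | 1u << 5)
#define INFO_R2     (1u << 6)
#define INFO_PAN    (1u << 7)

int main(void)
{
    /* mirrors [ARMMMUIdx_E10_1_PAN] = EL(1) | REL(1) | R2 | ... | PAN */
    uint32_t info = INFO_EL(1) | INFO_REL(1) | INFO_R2 | INFO_PAN;

    assert((info & 3) == 1);          /* arm_mmu_idx_to_el() would yield 1 */
    assert(((info >> 3) & 3) == 1);   /* regime_el() would yield 1 */
    assert(info & INFO_PAN);          /* regime_is_pan() would yield true */
    return 0;
}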
* The only use of stage 2 translations is either as part of an s1+2 + * lookup or when loading the descriptors during a stage 1 page table walk, + * and in both those cases we don't use the TLB. + * 4. we can also safely fold together the "32 bit EL3" and "64 bit EL3" + * translation regimes, because they map reasonably well to each other + * and they can't both be active at the same time. + * 5. we want to be able to use the TLB for accesses done as part of a + * stage1 page table walk, rather than having to walk the stage2 page + * table over and over. + * 6. we need separate EL1/EL2 mmu_idx for handling the Privileged Access + * Never (PAN) bit within PSTATE. + * 7. we fold together most secure and non-secure regimes for A-profile, + * because there are no banked system registers for aarch64, so the + * process of switching between secure and non-secure is + * already heavyweight. + * 8. we cannot fold together Stage 2 Secure and Stage 2 NonSecure, + * because both are in use simultaneously for Secure EL2. + * 9. we need separate indexes for handling AccessType_GCS. + * + * This gives us the following list of cases: + * + * EL0 EL1&0 stage 1+2 (aka NS PL0 PL1&0 stage 1+2) + * EL0 EL1&0 stage 1+2 +GCS + * EL1 EL1&0 stage 1+2 (aka NS PL1 PL1&0 stage 1+2) + * EL1 EL1&0 stage 1+2 +PAN (aka NS PL1 PL1&0 stage 1+2 +PAN) + * EL1 EL1&0 stage 1+2 +GCS + * EL0 EL2&0 + * EL0 EL2&0 +GCS + * EL2 EL2&0 + * EL2 EL2&0 +PAN + * EL2 EL2&0 +GCS + * EL2 (aka NS PL2) + * EL2 +GCS + * EL3 (aka AArch32 S PL1 PL1&0) + * EL3 +GCS + * AArch32 S PL0 PL1&0 (we call this EL30_0) + * AArch32 S PL1 PL1&0 +PAN (we call this EL30_3_PAN) + * Stage2 Secure + * Stage2 NonSecure + * plus one TLB per Physical address space: S, NS, Realm, Root + * + * for a total of 22 different mmu_idx. + * + * R profile CPUs have an MPU, but can use the same set of MMU indexes + * as A profile. They only need to distinguish EL0 and EL1 (and + * EL2 for cores like the Cortex-R52). + * + * M profile CPUs are rather different as they do not have a true MMU. + * They have the following different MMU indexes: + * User + * Privileged + * User, execution priority negative (ie the MPU HFNMIENA bit may apply) + * Privileged, execution priority negative (ditto) + * If the CPU supports the v8M Security Extension then there are also: + * Secure User + * Secure Privileged + * Secure User, execution priority negative + * Secure Privileged, execution priority negative + * + * The ARMMMUIdx and the mmu index value used by the core QEMU TLB code + * are not quite the same -- different CPU types (most notably M profile + * vs A/R profile) would like to use MMU indexes with different semantics, + * but since we don't ever need to use all of those in a single CPU we + * can avoid having to set NB_MMU_MODES to "total number of A profile MMU + * modes + total number of M profile MMU modes". The lower bits of + * ARMMMUIdx are the core TLB mmu index, and the higher bits are always + * the same for any particular CPU. + * Variables of type ARMMMUIdx are always full values, and the core + * index values are in variables of type 'int'. + * + * Our enumeration includes at the end some entries which are not "true" + * mmu_idx values in that they don't have corresponding TLBs and are only + * valid for doing slow path page table walks. + * + * The constant names here are patterned after the general style of the names + * of the AT/ATS operations. + * The values used are carefully arranged to make mmu_idx => EL lookup easy.
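Because the enumeration keeps the core TLB index in the low five bits, the two directions of the mapping are single mask/or operations. A minimal sketch using the constants and enum values from this header:

#include <assert.h>

#define ARM_MMU_IDX_A            0x20
#define ARM_MMU_IDX_COREIDX_MASK 0x1f

static int arm_to_core_mmu_idx(int mmu_idx)
{
    return mmu_idx & ARM_MMU_IDX_COREIDX_MASK;
}

static int core_to_aa64_mmu_idx(int core_idx)
{
    return core_idx | ARM_MMU_IDX_A;
}

int main(void)
{
    int e10_1_pan = 3 | ARM_MMU_IDX_A;   /* ARMMMUIdx_E10_1_PAN */

    assert(arm_to_core_mmu_idx(e10_1_pan) == 3);
    assert(core_to_aa64_mmu_idx(3) == e10_1_pan);
    return 0;
}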
+ * For M profile we arrange them to have a bit for priv, a bit for negpri + * and a bit for secure. + */ +#define ARM_MMU_IDX_A 0x20 /* A profile */ +#define ARM_MMU_IDX_NOTLB 0x40 /* does not have a TLB */ +#define ARM_MMU_IDX_M 0x80 /* M profile */ + +/* Meanings of the bits for M profile mmu idx values */ +#define ARM_MMU_IDX_M_PRIV 0x1 +#define ARM_MMU_IDX_M_NEGPRI 0x2 +#define ARM_MMU_IDX_M_S 0x4 /* Secure */ + +#define ARM_MMU_IDX_TYPE_MASK \ + (ARM_MMU_IDX_A | ARM_MMU_IDX_M | ARM_MMU_IDX_NOTLB) +#define ARM_MMU_IDX_COREIDX_MASK 0x1f + +typedef enum ARMMMUIdx { + /* + * A-profile. + */ + + ARMMMUIdx_E10_0 = 0 | ARM_MMU_IDX_A, + ARMMMUIdx_E10_0_GCS = 1 | ARM_MMU_IDX_A, + ARMMMUIdx_E10_1 = 2 | ARM_MMU_IDX_A, + ARMMMUIdx_E10_1_PAN = 3 | ARM_MMU_IDX_A, + ARMMMUIdx_E10_1_GCS = 4 | ARM_MMU_IDX_A, + + ARMMMUIdx_E20_0 = 5 | ARM_MMU_IDX_A, + ARMMMUIdx_E20_0_GCS = 6 | ARM_MMU_IDX_A, + ARMMMUIdx_E20_2 = 7 | ARM_MMU_IDX_A, + ARMMMUIdx_E20_2_PAN = 8 | ARM_MMU_IDX_A, + ARMMMUIdx_E20_2_GCS = 9 | ARM_MMU_IDX_A, + + ARMMMUIdx_E2 = 10 | ARM_MMU_IDX_A, + ARMMMUIdx_E2_GCS = 11 | ARM_MMU_IDX_A, + + ARMMMUIdx_E3 = 12 | ARM_MMU_IDX_A, + ARMMMUIdx_E3_GCS = 13 | ARM_MMU_IDX_A, + ARMMMUIdx_E30_0 = 14 | ARM_MMU_IDX_A, + ARMMMUIdx_E30_3_PAN = 15 | ARM_MMU_IDX_A, + + /* + * Used for second stage of an S12 page table walk, or for descriptor + * loads during first stage of an S1 page table walk. Note that both + * are in use simultaneously for SecureEL2: the security state for + * the S2 ptw is selected by the NS bit from the S1 ptw. + */ + ARMMMUIdx_Stage2_S = 16 | ARM_MMU_IDX_A, + ARMMMUIdx_Stage2 = 17 | ARM_MMU_IDX_A, + + /* TLBs with 1-1 mapping to the physical address spaces. */ + ARMMMUIdx_Phys_S = 18 | ARM_MMU_IDX_A, + ARMMMUIdx_Phys_NS = 19 | ARM_MMU_IDX_A, + ARMMMUIdx_Phys_Root = 20 | ARM_MMU_IDX_A, + ARMMMUIdx_Phys_Realm = 21 | ARM_MMU_IDX_A, + + /* + * These are not allocated TLBs and are used only for AT system + * instructions or for the first stage of an S12 page table walk. + */ + ARMMMUIdx_Stage1_E0 = 0 | ARM_MMU_IDX_NOTLB, + ARMMMUIdx_Stage1_E1 = 1 | ARM_MMU_IDX_NOTLB, + ARMMMUIdx_Stage1_E1_PAN = 2 | ARM_MMU_IDX_NOTLB, + ARMMMUIdx_Stage1_E0_GCS = 3 | ARM_MMU_IDX_NOTLB, + ARMMMUIdx_Stage1_E1_GCS = 4 | ARM_MMU_IDX_NOTLB, + + /* + * M-profile. + */ + ARMMMUIdx_MUser = ARM_MMU_IDX_M, + ARMMMUIdx_MPriv = ARM_MMU_IDX_M | ARM_MMU_IDX_M_PRIV, + ARMMMUIdx_MUserNegPri = ARMMMUIdx_MUser | ARM_MMU_IDX_M_NEGPRI, + ARMMMUIdx_MPrivNegPri = ARMMMUIdx_MPriv | ARM_MMU_IDX_M_NEGPRI, + ARMMMUIdx_MSUser = ARMMMUIdx_MUser | ARM_MMU_IDX_M_S, + ARMMMUIdx_MSPriv = ARMMMUIdx_MPriv | ARM_MMU_IDX_M_S, + ARMMMUIdx_MSUserNegPri = ARMMMUIdx_MUserNegPri | ARM_MMU_IDX_M_S, + ARMMMUIdx_MSPrivNegPri = ARMMMUIdx_MPrivNegPri | ARM_MMU_IDX_M_S, +} ARMMMUIdx; + +/* + * Bit macros for the core-mmu-index values for each index, + * for use when calling tlb_flush_by_mmuidx() and friends. 
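The TO_CORE_BIT() pattern that follows turns each core index into a one-hot bit, so a multi-index flush mask for tlb_flush_by_mmuidx() is a plain OR. A short sketch with the enum values above:

#include <assert.h>

#define ARM_MMU_IDX_A            0x20
#define ARM_MMU_IDX_COREIDX_MASK 0x1f
#define CORE_BIT(idx) (1u << ((idx) & ARM_MMU_IDX_COREIDX_MASK))

int main(void)
{
    /* e.g. flushing both EL1&0 privileged indexes with one mask */
    unsigned mask = CORE_BIT(2 | ARM_MMU_IDX_A)    /* E10_1     */
                  | CORE_BIT(3 | ARM_MMU_IDX_A);   /* E10_1_PAN */

    assert(mask == ((1u << 2) | (1u << 3)));
    return 0;
}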
+ */ +#define TO_CORE_BIT(NAME) \ + ARMMMUIdxBit_##NAME = 1 << (ARMMMUIdx_##NAME & ARM_MMU_IDX_COREIDX_MASK) + +typedef enum ARMMMUIdxBit { + TO_CORE_BIT(E10_0), + TO_CORE_BIT(E10_0_GCS), + TO_CORE_BIT(E10_1), + TO_CORE_BIT(E10_1_PAN), + TO_CORE_BIT(E10_1_GCS), + TO_CORE_BIT(E20_0), + TO_CORE_BIT(E20_0_GCS), + TO_CORE_BIT(E20_2), + TO_CORE_BIT(E20_2_PAN), + TO_CORE_BIT(E20_2_GCS), + TO_CORE_BIT(E2), + TO_CORE_BIT(E2_GCS), + TO_CORE_BIT(E3), + TO_CORE_BIT(E3_GCS), + TO_CORE_BIT(E30_0), + TO_CORE_BIT(E30_3_PAN), + TO_CORE_BIT(Stage2), + TO_CORE_BIT(Stage2_S), + + TO_CORE_BIT(MUser), + TO_CORE_BIT(MPriv), + TO_CORE_BIT(MUserNegPri), + TO_CORE_BIT(MPrivNegPri), + TO_CORE_BIT(MSUser), + TO_CORE_BIT(MSPriv), + TO_CORE_BIT(MSUserNegPri), + TO_CORE_BIT(MSPrivNegPri), +} ARMMMUIdxBit; + +#undef TO_CORE_BIT + +#define MMU_USER_IDX 0 + +#endif /* TARGET_ARM_MMUIDX_H */ diff --git a/target/arm/ptw.c b/target/arm/ptw.c index e03657f..d4386ed 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -79,6 +79,8 @@ typedef struct S1Translate { * may be suppressed for debug or AT insns. */ uint8_t in_prot_check; + /* Cached EffectiveHCR_EL2_NVx() bit */ + bool in_nv1; bool out_rw; bool out_be; ARMSecuritySpace out_space; @@ -167,6 +169,10 @@ ARMMMUIdx stage_1_mmu_idx(ARMMMUIdx mmu_idx) return ARMMMUIdx_Stage1_E1; case ARMMMUIdx_E10_1_PAN: return ARMMMUIdx_Stage1_E1_PAN; + case ARMMMUIdx_E10_0_GCS: + return ARMMMUIdx_Stage1_E0_GCS; + case ARMMMUIdx_E10_1_GCS: + return ARMMMUIdx_Stage1_E1_GCS; default: return mmu_idx; } @@ -233,9 +239,9 @@ static uint64_t regime_ttbr(CPUARMState *env, ARMMMUIdx mmu_idx, int ttbrn) return env->cp15.vsttbr_el2; } if (ttbrn == 0) { - return env->cp15.ttbr0_el[regime_el(env, mmu_idx)]; + return env->cp15.ttbr0_el[regime_el(mmu_idx)]; } else { - return env->cp15.ttbr1_el[regime_el(env, mmu_idx)]; + return env->cp15.ttbr1_el[regime_el(mmu_idx)]; } } @@ -274,8 +280,10 @@ static bool regime_translation_disabled(CPUARMState *env, ARMMMUIdx mmu_idx, return (hcr_el2 & (HCR_DC | HCR_VM)) == 0; case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_0_GCS: case ARMMMUIdx_E10_1: case ARMMMUIdx_E10_1_PAN: + case ARMMMUIdx_E10_1_GCS: /* TGE means that EL0/1 act as if SCTLR_EL1.M is zero */ hcr_el2 = arm_hcr_el2_eff_secstate(env, space); if (hcr_el2 & HCR_TGE) { @@ -284,8 +292,10 @@ static bool regime_translation_disabled(CPUARMState *env, ARMMMUIdx mmu_idx, break; case ARMMMUIdx_Stage1_E0: + case ARMMMUIdx_Stage1_E0_GCS: case ARMMMUIdx_Stage1_E1: case ARMMMUIdx_Stage1_E1_PAN: + case ARMMMUIdx_Stage1_E1_GCS: /* HCR.DC means SCTLR_EL1.M behaves as 0 */ hcr_el2 = arm_hcr_el2_eff_secstate(env, space); if (hcr_el2 & HCR_DC) { @@ -294,10 +304,14 @@ static bool regime_translation_disabled(CPUARMState *env, ARMMMUIdx mmu_idx, break; case ARMMMUIdx_E20_0: + case ARMMMUIdx_E20_0_GCS: case ARMMMUIdx_E20_2: case ARMMMUIdx_E20_2_PAN: + case ARMMMUIdx_E20_2_GCS: case ARMMMUIdx_E2: + case ARMMMUIdx_E2_GCS: case ARMMMUIdx_E3: + case ARMMMUIdx_E3_GCS: case ARMMMUIdx_E30_0: case ARMMMUIdx_E30_3_PAN: break; @@ -998,7 +1012,7 @@ static int ap_to_rw_prot(CPUARMState *env, ARMMMUIdx mmu_idx, int ap, int domain_prot) { return ap_to_rw_prot_is_user(env, mmu_idx, ap, domain_prot, - regime_is_user(env, mmu_idx)); + regime_is_user(mmu_idx)); } /* @@ -1024,7 +1038,7 @@ static int simple_ap_to_rw_prot_is_user(int ap, bool is_user) static int simple_ap_to_rw_prot(CPUARMState *env, ARMMMUIdx mmu_idx, int ap) { - return simple_ap_to_rw_prot_is_user(ap, regime_is_user(env, mmu_idx)); + return simple_ap_to_rw_prot_is_user(ap, 
regime_is_user(mmu_idx)); } static bool get_phys_addr_v5(CPUARMState *env, S1Translate *ptw, @@ -1057,7 +1071,7 @@ static bool get_phys_addr_v5(CPUARMState *env, S1Translate *ptw, } type = (desc & 3); domain = (desc >> 5) & 0x0f; - if (regime_el(env, ptw->in_mmu_idx) == 1) { + if (regime_el(ptw->in_mmu_idx) == 1) { dacr = env->cp15.dacr_ns; } else { dacr = env->cp15.dacr_s; @@ -1196,7 +1210,7 @@ static bool get_phys_addr_v6(CPUARMState *env, S1Translate *ptw, /* Page or Section. */ domain = (desc >> 5) & 0x0f; } - if (regime_el(env, mmu_idx) == 1) { + if (regime_el(mmu_idx) == 1) { dacr = env->cp15.dacr_ns; } else { dacr = env->cp15.dacr_s; @@ -1314,7 +1328,7 @@ do_fault: * @xn: XN (execute-never) bits * @s1_is_el0: true if this is S2 of an S1+2 walk for EL0 */ -static int get_S2prot_noexecute(int s2ap) +static int get_S2prot(CPUARMState *env, int s2ap, int xn, bool s1_is_el0) { int prot = 0; @@ -1324,12 +1338,6 @@ static int get_S2prot_noexecute(int s2ap) if (s2ap & 2) { prot |= PAGE_WRITE; } - return prot; -} - -static int get_S2prot(CPUARMState *env, int s2ap, int xn, bool s1_is_el0) -{ - int prot = get_S2prot_noexecute(s2ap); if (cpu_isar_feature(any_tts2uxn, env_archcpu(env))) { switch (xn) { @@ -1361,6 +1369,44 @@ static int get_S2prot(CPUARMState *env, int s2ap, int xn, bool s1_is_el0) return prot; } +static int get_S2prot_indirect(CPUARMState *env, GetPhysAddrResult *result, + int pi_index, int po_index, bool s1_is_el0) +{ + /* Last index is (priv, unpriv, ttw) */ + static const uint8_t perm_table[16][3] = { + /* 0 */ { 0, 0, 0 }, /* no access */ + /* 1 */ { 0, 0, 0 }, /* reserved */ + /* 2 */ { PAGE_READ, PAGE_READ, PAGE_READ | PAGE_WRITE }, + /* 3 */ { PAGE_READ, PAGE_READ, PAGE_READ | PAGE_WRITE }, + /* 4 */ { PAGE_WRITE, PAGE_WRITE, 0 }, + /* 5 */ { 0, 0, 0 }, /* reserved */ + /* 6 */ { PAGE_READ, PAGE_READ, PAGE_READ | PAGE_WRITE }, + /* 7 */ { PAGE_READ, PAGE_READ, PAGE_READ | PAGE_WRITE }, + /* 8 */ { PAGE_READ, PAGE_READ, PAGE_READ }, + /* 9 */ { PAGE_READ, PAGE_READ | PAGE_EXEC, PAGE_READ }, + /* A */ { PAGE_READ | PAGE_EXEC, PAGE_READ, PAGE_READ }, + /* B */ { PAGE_READ | PAGE_EXEC, PAGE_READ | PAGE_EXEC, PAGE_READ }, + /* C */ { PAGE_READ | PAGE_WRITE, + PAGE_READ | PAGE_WRITE, + PAGE_READ | PAGE_WRITE }, + /* D */ { PAGE_READ | PAGE_WRITE, + PAGE_READ | PAGE_WRITE | PAGE_EXEC, + PAGE_READ | PAGE_WRITE }, + /* E */ { PAGE_READ | PAGE_WRITE | PAGE_EXEC, + PAGE_READ | PAGE_WRITE, + PAGE_READ | PAGE_WRITE }, + /* F */ { PAGE_READ | PAGE_WRITE | PAGE_EXEC, + PAGE_READ | PAGE_WRITE | PAGE_EXEC, + PAGE_READ | PAGE_WRITE }, + }; + + uint64_t pir = (env->cp15.scr_el3 & SCR_PIEN ? env->cp15.s2pir_el2 : 0); + int s2pi = extract64(pir, pi_index * 4, 4); + + result->f.prot = perm_table[s2pi][2]; + return perm_table[s2pi][s1_is_el0]; +} + /* * Translate section/page access permissions to protection flags * @env: CPUARMState @@ -1378,7 +1424,7 @@ static int get_S1prot(CPUARMState *env, ARMMMUIdx mmu_idx, bool is_aa64, ARMSecuritySpace in_pa, ARMSecuritySpace out_pa) { ARMCPU *cpu = env_archcpu(env); - bool is_user = regime_is_user(env, mmu_idx); + bool is_user = regime_is_user(mmu_idx); bool have_wxn; int wxn = 0; @@ -1395,10 +1441,10 @@ static int get_S1prot(CPUARMState *env, ARMMMUIdx mmu_idx, bool is_aa64, * We make the IMPDEF choices that SCR_EL3.SIF and Realm EL2&0 * do not affect EPAN. 
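A reduced sketch of the S2PIR_EL2 decode in get_S2prot_indirect() above: four descriptor-selected bits index a nibble in the register, and the nibble selects a (priv, unpriv) permission pair. Only a few table rows are reproduced here; everything else collapses to no access in this sketch:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_READ  0x1
#define PAGE_WRITE 0x2
#define PAGE_EXEC  0x4

static int s2pie_prot(uint64_t s2pir, int pi_index, bool s1_is_el0)
{
    int nib = (s2pir >> (pi_index * 4)) & 0xf;

    switch (nib) {
    case 0x8:  /* read-only for both privilege levels */
        return PAGE_READ;
    case 0xc:  /* read-write for both */
        return PAGE_READ | PAGE_WRITE;
    case 0xe:  /* priv RWX, unpriv RW */
        return s1_is_el0 ? PAGE_READ | PAGE_WRITE
                         : PAGE_READ | PAGE_WRITE | PAGE_EXEC;
    default:   /* remaining rows treated as no access in this sketch */
        return 0;
    }
}

int main(void)
{
    uint64_t s2pir = 0xe8;   /* nibble 0 = 0x8, nibble 1 = 0xe */

    assert(s2pie_prot(s2pir, 0, false) == PAGE_READ);
    assert(s2pie_prot(s2pir, 1, true)  == (PAGE_READ | PAGE_WRITE));
    return 0;
}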
*/ - if (user_rw && regime_is_pan(env, mmu_idx)) { + if (user_rw && regime_is_pan(mmu_idx)) { prot_rw = 0; } else if (cpu_isar_feature(aa64_pan3, cpu) && is_aa64 && - regime_is_pan(env, mmu_idx) && + regime_is_pan(mmu_idx) && (regime_sctlr(env, mmu_idx) & SCTLR_EPAN) && !xn) { prot_rw = 0; } @@ -1455,7 +1501,7 @@ static int get_S1prot(CPUARMState *env, ARMMMUIdx mmu_idx, bool is_aa64, xn = pxn || (user_rw & PAGE_WRITE); } } else if (arm_feature(env, ARM_FEATURE_V7)) { - switch (regime_el(env, mmu_idx)) { + switch (regime_el(mmu_idx)) { case 1: case 3: if (is_user) { @@ -1482,11 +1528,115 @@ static int get_S1prot(CPUARMState *env, ARMMMUIdx mmu_idx, bool is_aa64, return prot_rw | PAGE_EXEC; } +/* Extra page permission bits, during get_S1prot_indirect only. */ +#define PAGE_GCS (1 << 3) +#define PAGE_WXN (1 << 4) +#define PAGE_OVERLAY (1 << 5) +QEMU_BUILD_BUG_ON(PAGE_RWX & (PAGE_GCS | PAGE_WXN | PAGE_OVERLAY)); + +static int get_S1prot_indirect(CPUARMState *env, S1Translate *ptw, + ARMMMUIdx mmu_idx, int pi_index, int po_index, + ARMSecuritySpace in_pa, ARMSecuritySpace out_pa) +{ + static const uint8_t perm_table[16] = { + /* 0 */ PAGE_OVERLAY, /* no access */ + /* 1 */ PAGE_OVERLAY | PAGE_READ, + /* 2 */ PAGE_OVERLAY | PAGE_EXEC, + /* 3 */ PAGE_OVERLAY | PAGE_READ | PAGE_EXEC, + /* 4 */ PAGE_OVERLAY, /* reserved */ + /* 5 */ PAGE_OVERLAY | PAGE_READ | PAGE_WRITE, + /* 6 */ PAGE_OVERLAY | PAGE_READ | PAGE_WRITE | PAGE_EXEC | PAGE_WXN, + /* 7 */ PAGE_OVERLAY | PAGE_READ | PAGE_WRITE | PAGE_EXEC, + /* 8 */ PAGE_READ, + /* 9 */ PAGE_READ | PAGE_GCS, + /* A */ PAGE_READ | PAGE_EXEC, + /* B */ 0, /* reserved */ + /* C */ PAGE_READ | PAGE_WRITE, + /* D */ 0, /* reserved */ + /* E */ PAGE_READ | PAGE_WRITE | PAGE_EXEC, + /* F */ 0, /* reserved */ + }; + + uint32_t el = regime_el(mmu_idx); + uint64_t pir = env->cp15.pir_el[el]; + uint64_t pire0 = 0; + int perm; + + if (el < 3) { + if (arm_feature(env, ARM_FEATURE_EL3) + && !(env->cp15.scr_el3 & SCR_PIEN)) { + pir = 0; + } else if (el == 2) { + pire0 = env->cp15.pire0_el2; + } else if (!ptw->in_nv1) { + pire0 = env->cp15.pir_el[0]; + } + } + perm = perm_table[extract64(pir, pi_index * 4, 4)]; + + if (regime_has_2_ranges(mmu_idx)) { + int p_perm = perm; + int u_perm = perm_table[extract64(pire0, pi_index * 4, 4)]; + + if ((p_perm & (PAGE_EXEC | PAGE_GCS)) && + (u_perm & (PAGE_WRITE | PAGE_GCS))) { + p_perm &= ~(PAGE_RWX | PAGE_GCS); + u_perm &= ~(PAGE_RWX | PAGE_GCS); + } + if ((u_perm & (PAGE_RWX | PAGE_GCS)) && regime_is_pan(mmu_idx)) { + p_perm &= ~(PAGE_READ | PAGE_WRITE); + } + perm = regime_is_user(mmu_idx) ? u_perm : p_perm; + } + + if (in_pa != out_pa) { + switch (in_pa) { + case ARMSS_Root: + /* + * R_ZWRVD: permission fault for insn fetched from non-Root, + * I_WWBFB: SIF has no effect in EL3. + */ + perm &= ~(PAGE_EXEC | PAGE_GCS); + break; + case ARMSS_Realm: + /* + * R_PKTDS: permission fault for insn fetched from non-Realm, + * for Realm EL2 or EL2&0. The corresponding fault for EL1&0 + * happens during any stage2 translation. + */ + if (el == 2) { + perm &= ~(PAGE_EXEC | PAGE_GCS); + } + break; + case ARMSS_Secure: + if (env->cp15.scr_el3 & SCR_SIF) { + perm &= ~(PAGE_EXEC | PAGE_GCS); + } + break; + default: + /* Input NonSecure must have output NonSecure. */ + g_assert_not_reached(); + } + } + + if (regime_is_gcs(mmu_idx)) { + /* + * Note that the one s1perms.gcs bit controls both read and write + * access via AccessType_GCS. See AArch64.S1CheckPermissions. + */ + perm = (perm & PAGE_GCS ? 
PAGE_READ | PAGE_WRITE : 0); + } else if (perm & PAGE_WXN) { + perm &= ~PAGE_EXEC; + } + + return perm & PAGE_RWX; +} + static ARMVAParameters aa32_va_parameters(CPUARMState *env, uint32_t va, ARMMMUIdx mmu_idx) { uint64_t tcr = regime_tcr(env, mmu_idx); - uint32_t el = regime_el(env, mmu_idx); + uint32_t el = regime_el(mmu_idx); int select, tsz; bool epd, hpd; @@ -1507,8 +1657,12 @@ static ARMVAParameters aa32_va_parameters(CPUARMState *env, uint32_t va, } tsz = sextract32(tcr, 0, 4) + 8; select = 0; - hpd = false; epd = false; + /* + * Stage2 does not have hierarchical permissions. + * Thus disabling them makes things easier during ptw. + */ + hpd = true; } else if (el == 2) { /* HTCR */ tsz = extract32(tcr, 0, 3); @@ -1673,12 +1827,6 @@ static bool lpae_block_desc_valid(ARMCPU *cpu, bool ds, } } -static bool nv_nv1_enabled(CPUARMState *env, S1Translate *ptw) -{ - uint64_t hcr = arm_hcr_el2_eff_secstate(env, ptw->in_space); - return (hcr & (HCR_NV | HCR_NV1)) == (HCR_NV | HCR_NV1); -} - /** * get_phys_addr_lpae: perform one stage of page table walk, LPAE format * @@ -1713,8 +1861,8 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, int32_t stride; int addrsize, inputsize, outputsize; uint64_t tcr = regime_tcr(env, mmu_idx); - int ap, xn, pxn; - uint32_t el = regime_el(env, mmu_idx); + int ap, prot; + uint32_t el = regime_el(mmu_idx); uint64_t descaddrmask; bool aarch64 = arm_el_is_aa64(env, el); uint64_t descriptor, new_descriptor; @@ -1731,6 +1879,16 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, level = 0; /* + * Cache NV1 before we adjust ptw->in_space for NSTable. + * Note that this is only relevant for EL1&0, and that + * computing it would assert for ARMSS_Root. + */ + if (el == 1) { + uint64_t hcr = arm_hcr_el2_eff_secstate(env, ptw->in_space); + ptw->in_nv1 = (hcr & (HCR_NV | HCR_NV1)) == (HCR_NV | HCR_NV1); + } + + /* * If TxSZ is programmed to a value larger than the maximum, * or smaller than the effective minimum, it is IMPLEMENTATION * DEFINED whether we behave as if the field were programmed @@ -2014,21 +2172,31 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, * except NSTable (which we have already handled). */ attrs = new_descriptor & (MAKE_64BIT_MASK(2, 10) | MAKE_64BIT_MASK(50, 14)); - if (!regime_is_stage2(mmu_idx)) { - if (!param.hpd) { - attrs |= extract64(tableattrs, 0, 2) << 53; /* XN, PXN */ - /* - * The sense of AP[1] vs APTable[0] is reversed, as APTable[0] == 1 - * means "force PL1 access only", which means forcing AP[1] to 0. - */ - attrs &= ~(extract64(tableattrs, 2, 1) << 6); /* !APT[0] => AP[1] */ - attrs |= extract32(tableattrs, 3, 1) << 7; /* APT[1] => AP[2] */ - } + if (!param.hpd) { + attrs |= extract64(tableattrs, 0, 2) << 53; /* XN, PXN */ + /* + * The sense of AP[1] vs APTable[0] is reversed, as APTable[0] == 1 + * means "force PL1 access only", which means forcing AP[1] to 0. + */ + attrs &= ~(extract64(tableattrs, 2, 1) << 6); /* !APT[0] => AP[1] */ + attrs |= extract32(tableattrs, 3, 1) << 7; /* APT[1] => AP[2] */ } ap = extract32(attrs, 6, 2); out_space = ptw->cur_space; if (regime_is_stage2(mmu_idx)) { + if (param.pie) { + int pi = extract64(attrs, 6, 1) + | (extract64(attrs, 51, 1) << 1) + | (extract64(attrs, 53, 2) << 2); + int po = extract64(attrs, 60, 3); + prot = get_S2prot_indirect(env, result, pi, po, ptw->in_s1_is_el0); + } else { + int xn = extract64(attrs, 53, 2); + prot = get_S2prot(env, ap, xn, ptw->in_s1_is_el0); + /* Install TTW permissions in f.prot. 
*/ + result->f.prot = prot & (PAGE_READ | PAGE_WRITE); + } /* * R_GYNXY: For stage2 in Realm security state, bit 55 is NS. * The bit remains ignored for other security states. @@ -2037,11 +2205,9 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, */ if (out_space == ARMSS_Realm && extract64(attrs, 55, 1)) { out_space = ARMSS_NonSecure; - result->f.prot = get_S2prot_noexecute(ap); - } else { - xn = extract64(attrs, 53, 2); - result->f.prot = get_S2prot(env, ap, xn, ptw->in_s1_is_el0); + prot &= ~PAGE_EXEC; } + result->s2prot = prot; result->cacheattrs.is_s2_format = true; result->cacheattrs.attrs = extract32(attrs, 2, 4); @@ -2055,7 +2221,6 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, int nse, ns = extract32(attrs, 5, 1); uint8_t attrindx; uint64_t mair; - int user_rw, prot_rw; switch (out_space) { case ARMSS_Root: @@ -2104,33 +2269,57 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, default: g_assert_not_reached(); } - xn = extract64(attrs, 54, 1); - pxn = extract64(attrs, 53, 1); - if (el == 1 && nv_nv1_enabled(env, ptw)) { + if (param.pie) { + int pi = extract64(attrs, 6, 1) + | (extract64(attrs, 51, 1) << 1) + | (extract64(attrs, 53, 2) << 2); + int po = extract64(attrs, 60, 3); /* - * With FEAT_NV, when HCR_EL2.{NV,NV1} == {1,1}, the block/page - * descriptor bit 54 holds PXN, 53 is RES0, and the effective value - * of UXN is 0. Similarly for bits 59 and 60 in table descriptors - * (which we have already folded into bits 53 and 54 of attrs). - * AP[1] (descriptor bit 6, our ap bit 0) is treated as 0. - * Similarly, APTable[0] from the table descriptor is treated as 0; - * we already folded this into AP[1] and squashing that to 0 does - * the right thing. + * Note that we modified ptw->in_space earlier for NSTable, but + * result->f.attrs retains a copy of the original security space. */ - pxn = xn; - xn = 0; - ap &= ~1; - } + prot = get_S1prot_indirect(env, ptw, mmu_idx, pi, po, + result->f.attrs.space, out_space); + } else if (regime_is_gcs(mmu_idx)) { + /* + * While one must use indirect permissions to successfully + * use GCS instructions, AArch64.S1DirectBasePermissions + * faithfully supplies s1perms.gcs = 0, Just In Case. + */ + prot = 0; + } else { + int xn = extract64(attrs, 54, 1); + int pxn = extract64(attrs, 53, 1); + int user_rw, prot_rw; - user_rw = simple_ap_to_rw_prot_is_user(ap, true); - prot_rw = simple_ap_to_rw_prot_is_user(ap, false); - result->f.prot = get_S1prot(env, mmu_idx, aarch64, user_rw, prot_rw, - xn, pxn, ptw->in_space, out_space); + if (el == 1 && ptw->in_nv1) { + /* + * With FEAT_NV, when HCR_EL2.{NV,NV1} == {1,1}, + * the block/page descriptor bit 54 holds PXN, + * 53 is RES0, and the effective value of UXN is 0. + * Similarly for bits 59 and 60 in table descriptors + * (which we have already folded into bits 53 and 54 of attrs). + * AP[1] (descriptor bit 6, our ap bit 0) is treated as 0. + * Similarly, APTable[0] from the table descriptor is treated + * as 0; we already folded this into AP[1] and squashing + * that to 0 does the right thing. 
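The same four descriptor bits feed both the stage-1 and stage-2 indirect lookups; a standalone sketch of the index assembly that appears twice above, with bit positions exactly as in the hunks:

#include <assert.h>
#include <stdint.h>

static unsigned pi_index(uint64_t attrs)
{
    return ((attrs >> 6) & 1)            /* PIIndex[0]   <- descriptor bit 6 */
         | (((attrs >> 51) & 1) << 1)    /* PIIndex[1]   <- bit 51           */
         | (((attrs >> 53) & 3) << 2);   /* PIIndex[3:2] <- bits 54:53       */
}

int main(void)
{
    assert(pi_index(1ull << 54) == 8);   /* bit 54 alone selects entry 8 */
    assert(pi_index((1ull << 6) | (1ull << 51)) == 3);
    return 0;
}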
+ */ + pxn = xn; + xn = 0; + ap &= ~1; + } + + user_rw = simple_ap_to_rw_prot_is_user(ap, true); + prot_rw = simple_ap_to_rw_prot_is_user(ap, false); + prot = get_S1prot(env, mmu_idx, aarch64, user_rw, prot_rw, + xn, pxn, ptw->in_space, out_space); + } + result->f.prot = prot; /* Index into MAIR registers for cache attributes */ attrindx = extract32(attrs, 2, 3); - mair = env->cp15.mair_el[regime_el(env, mmu_idx)]; + mair = env->cp15.mair_el[regime_el(mmu_idx)]; assert(attrindx <= 7); result->cacheattrs.is_s2_format = false; result->cacheattrs.attrs = extract64(mair, attrindx * 8, 8); @@ -2172,11 +2361,27 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, result->f.tlb_fill_flags = 0; } - if (ptw->in_prot_check & ~result->f.prot) { + if (ptw->in_prot_check & ~prot) { fi->type = ARMFault_Permission; goto do_fault; } + /* S1PIE and S2PIE both have a bit for software dirty page tracking. */ + if (access_type == MMU_DATA_STORE && param.pie) { + /* + * For S1PIE, bit 7 is nDirty and both HA and HD are checked. + * For S2PIE, bit 7 is Dirty and only HD is checked. + */ + bool bit7 = extract64(attrs, 7, 1); + if (regime_is_stage2(mmu_idx) + ? !bit7 && !param.hd + : bit7 && !(param.ha && param.hd)) { + fi->type = ARMFault_Permission; + fi->dirtybit = true; + goto do_fault; + } + } + /* If FEAT_HAFDBS has made changes, update the PTE. */ if (new_descriptor != descriptor) { new_descriptor = arm_casq_ptw(env, descriptor, new_descriptor, ptw, fi); @@ -2239,7 +2444,7 @@ static bool get_phys_addr_pmsav5(CPUARMState *env, uint32_t mask; uint32_t base; ARMMMUIdx mmu_idx = ptw->in_mmu_idx; - bool is_user = regime_is_user(env, mmu_idx); + bool is_user = regime_is_user(mmu_idx); if (regime_translation_disabled(env, mmu_idx, ptw->in_space)) { /* MPU disabled. */ @@ -2406,7 +2611,7 @@ static bool get_phys_addr_pmsav7(CPUARMState *env, ARMCPU *cpu = env_archcpu(env); int n; ARMMMUIdx mmu_idx = ptw->in_mmu_idx; - bool is_user = regime_is_user(env, mmu_idx); + bool is_user = regime_is_user(mmu_idx); bool secure = arm_space_is_secure(ptw->in_space); result->f.phys_addr = address; @@ -2592,7 +2797,7 @@ static bool get_phys_addr_pmsav7(CPUARMState *env, static uint32_t *regime_rbar(CPUARMState *env, ARMMMUIdx mmu_idx, uint32_t secure) { - if (regime_el(env, mmu_idx) == 2) { + if (regime_el(mmu_idx) == 2) { return env->pmsav8.hprbar; } else { return env->pmsav8.rbar[secure]; @@ -2602,7 +2807,7 @@ static uint32_t *regime_rbar(CPUARMState *env, ARMMMUIdx mmu_idx, static uint32_t *regime_rlar(CPUARMState *env, ARMMMUIdx mmu_idx, uint32_t secure) { - if (regime_el(env, mmu_idx) == 2) { + if (regime_el(mmu_idx) == 2) { return env->pmsav8.hprlar; } else { return env->pmsav8.rlar[secure]; @@ -2626,7 +2831,7 @@ bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, * memory system to use a subpage. 
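The new check above is the whole of the PIE software dirty-tracking fault. A one-function sketch of the predicate, following the S1PIE/S2PIE split described in the comment:

#include <assert.h>
#include <stdbool.h>

/*
 * S1PIE: descriptor bit 7 is nDirty -> a store faults if it is set and
 *        the CPU cannot update it (needs both HA and HD).
 * S2PIE: descriptor bit 7 is Dirty -> a store faults if it is clear and
 *        HD cannot set it.
 */
static bool pie_store_permission_fault(bool stage2, bool bit7,
                                       bool ha, bool hd)
{
    return stage2 ? (!bit7 && !hd) : (bit7 && !(ha && hd));
}

int main(void)
{
    assert(pie_store_permission_fault(false, true, false, false));
    assert(!pie_store_permission_fault(false, true, true, true));
    assert(pie_store_permission_fault(true, false, true, false));
    return 0;
}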
*/ ARMCPU *cpu = env_archcpu(env); - bool is_user = regime_is_user(env, mmu_idx); + bool is_user = regime_is_user(mmu_idx); int n; int matchregion = -1; bool hit = false; @@ -2634,7 +2839,7 @@ bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, uint32_t addr_page_limit = addr_page_base + (TARGET_PAGE_SIZE - 1); int region_counter; - if (regime_el(env, mmu_idx) == 2) { + if (regime_el(mmu_idx) == 2) { region_counter = cpu->pmsav8r_hdregion; } else { region_counter = cpu->pmsav7_dregion; @@ -2760,7 +2965,7 @@ bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, xn = 1; } - if (regime_el(env, mmu_idx) == 2) { + if (regime_el(mmu_idx) == 2) { result->f.prot = simple_ap_to_rw_prot_is_user(ap, mmu_idx != ARMMMUIdx_E2); } else { @@ -2769,7 +2974,7 @@ bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, if (!arm_feature(env, ARM_FEATURE_M)) { uint8_t attrindx = extract32(matched_rlar, 1, 3); - uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)]; + uint64_t mair = env->cp15.mair_el[regime_el(mmu_idx)]; uint8_t sh = extract32(matched_rlar, 3, 2); if (regime_sctlr(env, mmu_idx) & SCTLR_WXN && @@ -2777,7 +2982,7 @@ bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, xn = 0x1; } - if ((regime_el(env, mmu_idx) == 1) && + if ((regime_el(mmu_idx) == 1) && regime_sctlr(env, mmu_idx) & SCTLR_UWXN && ap == 0x1) { pxn = 0x1; } @@ -3262,7 +3467,7 @@ static bool get_phys_addr_disabled(CPUARMState *env, break; default: - r_el = regime_el(env, mmu_idx); + r_el = regime_el(mmu_idx); if (arm_el_is_aa64(env, r_el)) { int pamax = arm_pamax(env_archcpu(env)); uint64_t tcr = env->cp15.tcr_el[r_el]; @@ -3370,7 +3575,7 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, fi->s2addr = ipa; /* Combine the S1 and S2 perms. */ - result->f.prot &= s1_prot; + result->f.prot = s1_prot & result->s2prot; /* If S2 fails, return early. */ if (ret) { @@ -3507,7 +3712,7 @@ static bool get_phys_addr_nogpc(CPUARMState *env, S1Translate *ptw, break; } - result->f.attrs.user = regime_is_user(env, mmu_idx); + result->f.attrs.user = regime_is_user(mmu_idx); /* * Fast Context Switch Extension. This doesn't exist at all in v8. 
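With s2prot split out of f.prot, the two-stage combination in get_phys_addr_twostage() above becomes a plain intersection. A minimal sketch of the merge, showing why the TTW permissions live in a separate field:

#include <assert.h>

#define PAGE_READ  0x1
#define PAGE_WRITE 0x2
#define PAGE_EXEC  0x4

struct s2_result {
    int f_prot;   /* AccessType_TTW: guards the S1 descriptor loads */
    int s2prot;   /* to be merged with the stage-1 permissions      */
};

static int combine_s1_s2(int s1_prot, const struct s2_result *s2)
{
    return s1_prot & s2->s2prot;   /* result->f.prot = s1_prot & s2prot */
}

int main(void)
{
    struct s2_result s2 = { PAGE_READ | PAGE_WRITE, PAGE_READ };

    /* stage 1 grants RWX, stage 2 only R: the final mapping is read-only */
    assert(combine_s1_s2(PAGE_READ | PAGE_WRITE | PAGE_EXEC, &s2)
           == PAGE_READ);
    return 0;
}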
@@ -3515,7 +3720,7 @@ static bool get_phys_addr_nogpc(CPUARMState *env, S1Translate *ptw, */ if (address < 0x02000000 && mmu_idx != ARMMMUIdx_Stage2 && !arm_feature(env, ARM_FEATURE_V8)) { - if (regime_el(env, mmu_idx) == 3) { + if (regime_el(mmu_idx) == 3) { address += env->cp15.fcseidr_s; } else { address += env->cp15.fcseidr_ns; @@ -3617,15 +3822,22 @@ arm_mmu_idx_to_security_space(CPUARMState *env, ARMMMUIdx mmu_idx) switch (mmu_idx) { case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_0_GCS: case ARMMMUIdx_E10_1: case ARMMMUIdx_E10_1_PAN: + case ARMMMUIdx_E10_1_GCS: case ARMMMUIdx_E20_0: + case ARMMMUIdx_E20_0_GCS: case ARMMMUIdx_E20_2: case ARMMMUIdx_E20_2_PAN: + case ARMMMUIdx_E20_2_GCS: case ARMMMUIdx_Stage1_E0: + case ARMMMUIdx_Stage1_E0_GCS: case ARMMMUIdx_Stage1_E1: case ARMMMUIdx_Stage1_E1_PAN: + case ARMMMUIdx_Stage1_E1_GCS: case ARMMMUIdx_E2: + case ARMMMUIdx_E2_GCS: ss = arm_security_space_below_el3(env); break; case ARMMMUIdx_Stage2: @@ -3654,6 +3866,7 @@ arm_mmu_idx_to_security_space(CPUARMState *env, ARMMMUIdx mmu_idx) ss = ARMSS_Secure; break; case ARMMMUIdx_E3: + case ARMMMUIdx_E3_GCS: case ARMMMUIdx_E30_0: case ARMMMUIdx_E30_3_PAN: if (arm_feature(env, ARM_FEATURE_AARCH64) && diff --git a/target/arm/syndrome.h b/target/arm/syndrome.h index c48d3b8..bff61f0 100644 --- a/target/arm/syndrome.h +++ b/target/arm/syndrome.h @@ -63,6 +63,7 @@ enum arm_exception_class { EC_MOP = 0x27, EC_AA32_FPTRAP = 0x28, EC_AA64_FPTRAP = 0x2c, + EC_GCS = 0x2d, EC_SERROR = 0x2f, EC_BREAKPOINT = 0x30, EC_BREAKPOINT_SAME_EL = 0x31, @@ -83,6 +84,23 @@ typedef enum { SME_ET_InaccessibleZT0, } SMEExceptionType; +typedef enum { + GCS_ET_DataCheck, + GCS_ET_EXLOCK, + GCS_ET_GCSSTR_GCSSTTR, +} GCSExceptionType; + +typedef enum { + GCS_IT_RET_nPauth = 0, + GCS_IT_GCSPOPM = 1, + GCS_IT_RET_PauthA = 2, + GCS_IT_RET_PauthB = 3, + GCS_IT_GCSSS1 = 4, + GCS_IT_GCSSS2 = 5, + GCS_IT_GCSPOPCX = 8, + GCS_IT_GCSPOPX = 9, +} GCSInstructionType; + #define ARM_EL_EC_LENGTH 6 #define ARM_EL_EC_SHIFT 26 #define ARM_EL_IL_SHIFT 25 @@ -351,6 +369,23 @@ static inline uint32_t syn_pcalignment(void) return (EC_PCALIGNMENT << ARM_EL_EC_SHIFT) | ARM_EL_IL; } +static inline uint32_t syn_gcs_data_check(GCSInstructionType it, int rn) +{ + return ((EC_GCS << ARM_EL_EC_SHIFT) | ARM_EL_IL | + (GCS_ET_DataCheck << 20) | (rn << 5) | it); +} + +static inline uint32_t syn_gcs_exlock(void) +{ + return (EC_GCS << ARM_EL_EC_SHIFT) | ARM_EL_IL | (GCS_ET_EXLOCK << 20); +} + +static inline uint32_t syn_gcs_gcsstr(int ra, int rn) +{ + return ((EC_GCS << ARM_EL_EC_SHIFT) | ARM_EL_IL | + (GCS_ET_GCSSTR_GCSSTTR << 20) | (ra << 10) | (rn << 5)); +} + static inline uint32_t syn_serror(uint32_t extra) { return (EC_SERROR << ARM_EL_EC_SHIFT) | ARM_EL_IL | extra; diff --git a/target/arm/tcg-stubs.c b/target/arm/tcg-stubs.c index aac99b2..aeeede8 100644 --- a/target/arm/tcg-stubs.c +++ b/target/arm/tcg-stubs.c @@ -16,7 +16,7 @@ void write_v7m_exception(CPUARMState *env, uint32_t new_exc) g_assert_not_reached(); } -void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint64_t syndrome, uint32_t target_el, uintptr_t ra) { g_assert_not_reached(); diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode index 55ff6c5..01b1b3e 100644 --- a/target/arm/tcg/a64.decode +++ b/target/arm/tcg/a64.decode @@ -248,6 +248,7 @@ ERETA 1101011 0100 11111 00001 m:1 11111 11111 &reta # ERETAA, ERETAB AUTIA1716 1101 0101 0000 0011 0010 0001 100 11111 AUTIB1716 1101 0101 0000 0011 0010 
0001 110 11111 ESB 1101 0101 0000 0011 0010 0010 000 11111 + GCSB 1101 0101 0000 0011 0010 0010 011 11111 PACIAZ 1101 0101 0000 0011 0010 0011 000 11111 PACIASP 1101 0101 0000 0011 0010 0011 001 11111 PACIBZ 1101 0101 0000 0011 0010 0011 010 11111 @@ -256,6 +257,7 @@ ERETA 1101011 0100 11111 00001 m:1 11111 11111 &reta # ERETAA, ERETAB AUTIASP 1101 0101 0000 0011 0010 0011 101 11111 AUTIBZ 1101 0101 0000 0011 0010 0011 110 11111 AUTIBSP 1101 0101 0000 0011 0010 0011 111 11111 + CHKFEAT 1101 0101 0000 0011 0010 0101 000 11111 ] # The canonical NOP has CRm == op2 == 0, but all of the space # that isn't specifically allocated to an instruction must NOP @@ -570,6 +572,9 @@ LDAPR_i 10 011001 10 0 ......... 00 ..... ..... @ldapr_stlr_i sign=1 ext LDAPR_i 00 011001 11 0 ......... 00 ..... ..... @ldapr_stlr_i sign=1 ext=1 sz=0 LDAPR_i 01 011001 11 0 ......... 00 ..... ..... @ldapr_stlr_i sign=1 ext=1 sz=1 +# GCSSTR, GCSSTTR +GCSSTR 11011001 000 11111 000 unpriv:1 11 rn:5 rt:5 + # Load/store multiple structures # The 4-bit opcode in [15:12] encodes repeat count and structure elements &ldst_mult rm rn rt sz q p rpt selem diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c index 8c617fe..1bffe66 100644 --- a/target/arm/tcg/cpu64.c +++ b/target/arm/tcg/cpu64.c @@ -1280,6 +1280,7 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64PFR1, SME, 2); /* FEAT_SME2 */ t = FIELD_DP64(t, ID_AA64PFR1, CSV2_FRAC, 0); /* FEAT_CSV2_3 */ t = FIELD_DP64(t, ID_AA64PFR1, NMI, 1); /* FEAT_NMI */ + t = FIELD_DP64(t, ID_AA64PFR1, GCS, 1); /* FEAT_GCS */ SET_IDREG(isar, ID_AA64PFR1, t); t = GET_IDREG(isar, ID_AA64MMFR0); @@ -1326,7 +1327,10 @@ void aarch64_max_tcg_initfn(Object *obj) t = GET_IDREG(isar, ID_AA64MMFR3); t = FIELD_DP64(t, ID_AA64MMFR3, TCRX, 1); /* FEAT_TCR2 */ t = FIELD_DP64(t, ID_AA64MMFR3, SCTLRX, 1); /* FEAT_SCTLR2 */ + t = FIELD_DP64(t, ID_AA64MMFR3, MEC, 1); /* FEAT_MEC */ t = FIELD_DP64(t, ID_AA64MMFR3, SPEC_FPACC, 1); /* FEAT_FPACC_SPEC */ + t = FIELD_DP64(t, ID_AA64MMFR3, S1PIE, 1); /* FEAT_S1PIE */ + t = FIELD_DP64(t, ID_AA64MMFR3, S2PIE, 1); /* FEAT_S2PIE */ SET_IDREG(isar, ID_AA64MMFR3, t); t = GET_IDREG(isar, ID_AA64ZFR0); diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c index 71c6c44..ba1d775 100644 --- a/target/arm/tcg/helper-a64.c +++ b/target/arm/tcg/helper-a64.c @@ -576,6 +576,7 @@ uint32_t HELPER(advsimd_rinth)(uint32_t x, float_status *fp_status) return ret; } +#ifndef CONFIG_USER_ONLY static int el_from_spsr(uint32_t spsr) { /* Return the exception level that this SPSR is requesting a return to, @@ -614,32 +615,12 @@ static int el_from_spsr(uint32_t spsr) } } -static void cpsr_write_from_spsr_elx(CPUARMState *env, - uint32_t val) -{ - uint32_t mask; - - /* Save SPSR_ELx.SS into PSTATE. 
*/ - env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); - val &= ~PSTATE_SS; - - /* Move DIT to the correct location for CPSR */ - if (val & PSTATE_DIT) { - val &= ~PSTATE_DIT; - val |= CPSR_DIT; - } - - mask = aarch32_cpsr_valid_mask(env->features, \ - &env_archcpu(env)->isar); - cpsr_write(env, val, mask, CPSRWriteRaw); -} - void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) { ARMCPU *cpu = env_archcpu(env); int cur_el = arm_current_el(env); unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); - uint32_t spsr = env->banked_spsr[spsr_idx]; + uint64_t spsr = env->banked_spsr[spsr_idx]; int new_el; bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; @@ -694,6 +675,17 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) goto illegal_return; } + /* + * If GetCurrentEXLOCKEN, the exception return path must use GCSPOPCX, + * which will set PSTATE.EXLOCK. We need not explicitly check FEAT_GCS, + * because GCSCR_ELx cannot be set without it. + */ + if (new_el == cur_el && + (env->cp15.gcscr_el[cur_el] & GCSCR_EXLOCKEN) && + !(env->pstate & PSTATE_EXLOCK)) { + goto illegal_return; + } + bql_lock(); arm_call_pre_el_change_hook(cpu); bql_unlock(); @@ -787,6 +779,7 @@ illegal_return: qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: " "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc); } +#endif /* !CONFIG_USER_ONLY */ void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) { diff --git a/target/arm/tcg/helper-a64.h b/target/arm/tcg/helper-a64.h index 8502346..b6008b5 100644 --- a/target/arm/tcg/helper-a64.h +++ b/target/arm/tcg/helper-a64.h @@ -80,7 +80,6 @@ DEF_HELPER_3(vfp_ah_maxh, f16, f16, f16, fpst) DEF_HELPER_3(vfp_ah_maxs, f32, f32, f32, fpst) DEF_HELPER_3(vfp_ah_maxd, f64, f64, f64, fpst) -DEF_HELPER_2(exception_return, void, env, i64) DEF_HELPER_FLAGS_2(dc_zva, TCG_CALL_NO_WG, void, env, i64) DEF_HELPER_FLAGS_3(pacia, TCG_CALL_NO_WG, i64, env, i64, i64) @@ -145,3 +144,7 @@ DEF_HELPER_FLAGS_5(gvec_fmulx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32 DEF_HELPER_FLAGS_5(gvec_fmulx_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_5(gvec_fmulx_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_5(gvec_fmulx_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +#ifndef CONFIG_USER_ONLY +DEF_HELPER_2(exception_return, void, env, i64) +#endif diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c index 17f83f1..5c9b9be 100644 --- a/target/arm/tcg/hflags.c +++ b/target/arm/tcg/hflags.c @@ -451,6 +451,44 @@ static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, DP_TBFLAG_A64(flags, TCMA, aa64_va_parameter_tcma(tcr, mmu_idx)); } + if (cpu_isar_feature(aa64_gcs, env_archcpu(env))) { + /* C.f. GCSEnabled */ + if (env->cp15.gcscr_el[el] & GCSCR_PCRSEL) { + switch (el) { + default: + if (!el_is_in_host(env, el) + && !(arm_hcrx_el2_eff(env) & HCRX_GCSEN)) { + break; + } + /* fall through */ + case 2: + if (arm_feature(env, ARM_FEATURE_EL3) + && !(env->cp15.scr_el3 & SCR_GCSEN)) { + break; + } + /* fall through */ + case 3: + DP_TBFLAG_A64(flags, GCS_EN, 1); + break; + } + } + + /* C.f. GCSReturnValueCheckEnabled */ + if (env->cp15.gcscr_el[el] & GCSCR_RVCHKEN) { + DP_TBFLAG_A64(flags, GCS_RVCEN, 1); + } + + /* C.f. CheckGCSSTREnabled */ + if (!(env->cp15.gcscr_el[el] & GCSCR_STREN)) { + DP_TBFLAG_A64(flags, GCSSTR_EL, el ? 
el : 1); + } else if (el == 1 + && EX_TBFLAG_ANY(flags, FGT_ACTIVE) + && !FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], + HFGITR_EL2, NGCSSTR_EL1)) { + DP_TBFLAG_A64(flags, GCSSTR_EL, 2); + } + } + if (env->vfp.fpcr & FPCR_AH) { DP_TBFLAG_A64(flags, AH, 1); } diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c index b96c953..bb48fe3 100644 --- a/target/arm/tcg/mte_helper.c +++ b/target/arm/tcg/mte_helper.c @@ -605,7 +605,7 @@ void mte_check_fail(CPUARMState *env, uint32_t desc, int el, reg_el, tcf; uint64_t sctlr; - reg_el = regime_el(env, arm_mmu_idx); + reg_el = regime_el(arm_mmu_idx); sctlr = env->cp15.sctlr_el[reg_el]; switch (arm_mmu_idx) { diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c index 5373e0e..4fbd219 100644 --- a/target/arm/tcg/op_helper.c +++ b/target/arm/tcg/op_helper.c @@ -46,7 +46,7 @@ int exception_target_el(CPUARMState *env) } void raise_exception(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el) + uint64_t syndrome, uint32_t target_el) { CPUState *cs = env_cpu(env); @@ -70,7 +70,7 @@ void raise_exception(CPUARMState *env, uint32_t excp, cpu_loop_exit(cs); } -void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint64_t syndrome, uint32_t target_el, uintptr_t ra) { CPUState *cs = env_cpu(env); @@ -881,6 +881,13 @@ const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key, } syndrome = syn_uncategorized(); break; + case CP_ACCESS_EXLOCK: + /* + * CP_ACCESS_EXLOCK is always directed to the current EL, + * which is going to be the same as the usual target EL. + */ + syndrome = syn_gcs_exlock(); + break; default: g_assert_not_reached(); } diff --git a/target/arm/tcg/tlb-insns.c b/target/arm/tcg/tlb-insns.c index 95c26c6..1a0a332 100644 --- a/target/arm/tcg/tlb-insns.c +++ b/target/arm/tcg/tlb-insns.c @@ -149,7 +149,8 @@ static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = env_cpu(env); uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdxBit_E2); + tlb_flush_page_by_mmuidx(cs, pageaddr, + ARMMMUIdxBit_E2 | ARMMMUIdxBit_E2_GCS); } static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -159,7 +160,8 @@ static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12); tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, - ARMMMUIdxBit_E2); + ARMMMUIdxBit_E2 | + ARMMMUIdxBit_E2_GCS); } static void tlbiipas2_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -202,7 +204,7 @@ static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, { CPUState *cs = env_cpu(env); - tlb_flush_by_mmuidx(cs, ARMMMUIdxBit_E2); + tlb_flush_by_mmuidx(cs, ARMMMUIdxBit_E2 | ARMMMUIdxBit_E2_GCS); } static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -210,7 +212,8 @@ static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, { CPUState *cs = env_cpu(env); - tlb_flush_by_mmuidx_all_cpus_synced(cs, ARMMMUIdxBit_E2); + tlb_flush_by_mmuidx_all_cpus_synced(cs, ARMMMUIdxBit_E2 | + ARMMMUIdxBit_E2_GCS); } /* @@ -228,12 +231,16 @@ static int vae1_tlbmask(CPUARMState *env) if ((hcr & (HCR_E2H | HCR_TGE)) == (HCR_E2H | HCR_TGE)) { mask = ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E20_2_PAN | - ARMMMUIdxBit_E20_0; + ARMMMUIdxBit_E20_2_GCS | + ARMMMUIdxBit_E20_0 | + ARMMMUIdxBit_E20_0_GCS; } else { /* This is AArch64 only, so we don't need to 
touch the EL30_x TLBs */ mask = ARMMMUIdxBit_E10_1 | ARMMMUIdxBit_E10_1_PAN | - ARMMMUIdxBit_E10_0; + ARMMMUIdxBit_E10_1_GCS | + ARMMMUIdxBit_E10_0 | + ARMMMUIdxBit_E10_0_GCS; } return mask; } @@ -246,13 +253,20 @@ static int vae2_tlbmask(CPUARMState *env) if (hcr & HCR_E2H) { mask = ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E20_2_PAN | - ARMMMUIdxBit_E20_0; + ARMMMUIdxBit_E20_2_GCS | + ARMMMUIdxBit_E20_0 | + ARMMMUIdxBit_E20_0_GCS; } else { - mask = ARMMMUIdxBit_E2; + mask = ARMMMUIdxBit_E2 | ARMMMUIdxBit_E2_GCS; } return mask; } +static int vae3_tlbmask(void) +{ + return ARMMMUIdxBit_E3 | ARMMMUIdxBit_E3_GCS; +} + /* Return 56 if TBI is enabled, 64 otherwise. */ static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx, uint64_t addr) @@ -325,9 +339,12 @@ static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri, static int e2_tlbmask(CPUARMState *env) { return (ARMMMUIdxBit_E20_0 | + ARMMMUIdxBit_E20_0_GCS | ARMMMUIdxBit_E20_2 | ARMMMUIdxBit_E20_2_PAN | - ARMMMUIdxBit_E2); + ARMMMUIdxBit_E20_2_GCS | + ARMMMUIdxBit_E2 | + ARMMMUIdxBit_E2_GCS); } static void tlbi_aa64_alle1_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -354,7 +371,7 @@ static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri, ARMCPU *cpu = env_archcpu(env); CPUState *cs = CPU(cpu); - tlb_flush_by_mmuidx(cs, ARMMMUIdxBit_E3); + tlb_flush_by_mmuidx(cs, vae3_tlbmask()); } static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -380,7 +397,7 @@ static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri, { CPUState *cs = env_cpu(env); - tlb_flush_by_mmuidx_all_cpus_synced(cs, ARMMMUIdxBit_E3); + tlb_flush_by_mmuidx_all_cpus_synced(cs, vae3_tlbmask()); } static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -411,7 +428,7 @@ static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = CPU(cpu); uint64_t pageaddr = sextract64(value << 12, 0, 56); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdxBit_E3); + tlb_flush_page_by_mmuidx(cs, pageaddr, vae3_tlbmask()); } static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -465,7 +482,7 @@ static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri, int bits = tlbbits_for_regime(env, ARMMMUIdx_E3, pageaddr); tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, - ARMMMUIdxBit_E3, bits); + vae3_tlbmask(), bits); } static int ipas2e1_tlbmask(CPUARMState *env, int64_t value) @@ -963,7 +980,7 @@ static void tlbi_aa64_rvae3_write(CPUARMState *env, * flush-last-level-only. */ - do_rvae_write(env, value, ARMMMUIdxBit_E3, tlb_force_broadcast(env)); + do_rvae_write(env, value, vae3_tlbmask(), tlb_force_broadcast(env)); } static void tlbi_aa64_rvae3is_write(CPUARMState *env, @@ -977,7 +994,7 @@ static void tlbi_aa64_rvae3is_write(CPUARMState *env, * flush-last-level-only or inner/outer specific flushes. 
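/*
 * The pattern running through these TLB hunks: every maintenance mask
 * gains the GCS sibling of each MMU index it already flushed, because
 * GCS accesses are translated and cached under their own MMU indexes.
 * Toy bit values below; the real ARMMMUIdxBit_* values differ.
 */
enum {
    IDXBIT_E3     = 1 << 0,     /* stand-in for ARMMMUIdxBit_E3 */
    IDXBIT_E3_GCS = 1 << 1,     /* stand-in for ARMMMUIdxBit_E3_GCS */
};

static int vae3_tlbmask_sketch(void)
{
    /* An EL3 flush must invalidate both the normal and the GCS regime. */
    return IDXBIT_E3 | IDXBIT_E3_GCS;
}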
*/ - do_rvae_write(env, value, ARMMMUIdxBit_E3, true); + do_rvae_write(env, value, vae3_tlbmask(), true); } static void tlbi_aa64_ripas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri, diff --git a/target/arm/tcg/tlb_helper.c b/target/arm/tcg/tlb_helper.c index 23c72a9..f1983a5 100644 --- a/target/arm/tcg/tlb_helper.c +++ b/target/arm/tcg/tlb_helper.c @@ -24,13 +24,13 @@ bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) return regime_using_lpae_format(env, mmu_idx); } -static inline uint32_t merge_syn_data_abort(uint32_t template_syn, +static inline uint64_t merge_syn_data_abort(uint32_t template_syn, ARMMMUFaultInfo *fi, unsigned int target_el, bool same_el, bool is_write, - int fsc) + int fsc, bool gcs) { - uint32_t syn; + uint64_t syn; /* * ISV is only set for stage-2 data aborts routed to EL2 and @@ -75,6 +75,11 @@ static inline uint32_t merge_syn_data_abort(uint32_t template_syn, /* Merge the runtime syndrome with the template syndrome. */ syn |= template_syn; } + + /* Form ISS2 at the top of the syndrome. */ + syn |= (uint64_t)fi->dirtybit << 37; + syn |= (uint64_t)gcs << 40; + return syn; } @@ -176,7 +181,9 @@ void arm_deliver_fault(ARMCPU *cpu, vaddr addr, int target_el = exception_target_el(env); int current_el = arm_current_el(env); bool same_el; - uint32_t syn, exc, fsr, fsc; + uint32_t exc, fsr, fsc; + uint64_t syn; + /* * We know this must be a data or insn abort, and that * env->exception.syndrome contains the template syndrome set @@ -246,9 +253,10 @@ void arm_deliver_fault(ARMCPU *cpu, vaddr addr, syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc); exc = EXCP_PREFETCH_ABORT; } else { + bool gcs = regime_is_gcs(core_to_arm_mmu_idx(env, mmu_idx)); syn = merge_syn_data_abort(env->exception.syndrome, fi, target_el, same_el, access_type == MMU_DATA_STORE, - fsc); + fsc, gcs); if (access_type == MMU_DATA_STORE && arm_feature(env, ARM_FEATURE_V6)) { fsr |= (1 << 11); diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a0e3300..918d5ed 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -26,6 +26,7 @@ #include "cpregs.h" static TCGv_i64 cpu_X[32]; +static TCGv_i64 cpu_gcspr[4]; static TCGv_i64 cpu_pc; /* Load/store exclusive handling */ @@ -77,6 +78,10 @@ static int scale_by_log2_tag_granule(DisasContext *s, int x) /* initialize TCG globals. */ void a64_translate_init(void) { + static const char gcspr_names[4][12] = { + "gcspr_el0", "gcspr_el1", "gcspr_el2", "gcspr_el3" + }; + int i; cpu_pc = tcg_global_mem_new_i64(tcg_env, @@ -90,10 +95,17 @@ void a64_translate_init(void) cpu_exclusive_high = tcg_global_mem_new_i64(tcg_env, offsetof(CPUARMState, exclusive_high), "exclusive_high"); + + for (i = 0; i < 4; i++) { + cpu_gcspr[i] = + tcg_global_mem_new_i64(tcg_env, + offsetof(CPUARMState, cp15.gcspr_el[i]), + gcspr_names[i]); + } } /* - * Return the core mmu_idx to use for A64 load/store insns which + * Return the full arm mmu_idx to use for A64 load/store insns which * have a "unprivileged load/store" variant. Those insns access * EL0 if executed from an EL which has control over EL0 (usually * EL1) but behave like normal loads and stores if executed from @@ -103,7 +115,7 @@ void a64_translate_init(void) * normal encoding (in which case we will return the same * thing as get_mem_index(). 
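/*
 * Sketch of the ISS2 packing just added to merge_syn_data_abort().
 * With 64-bit syndromes, ESR_ELx.ISS2 occupies bits [55:32], so the
 * DirtyBit flag (ISS2 bit 5) lands at bit 37 of the full syndrome and
 * the GCS flag (ISS2 bit 8) at bit 40; the helper name is illustrative.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t syn_set_iss2(uint64_t syn, bool dirtybit, bool gcs)
{
    syn |= (uint64_t)dirtybit << 37;    /* ISS2.DirtyBit */
    syn |= (uint64_t)gcs << 40;         /* ISS2.GCS */
    return syn;
}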
*/ -static int get_a64_user_mem_index(DisasContext *s, bool unpriv) +static ARMMMUIdx full_a64_user_mem_index(DisasContext *s, bool unpriv) { /* * If AccType_UNPRIV is not used, the insn uses AccType_NORMAL, @@ -130,7 +142,19 @@ static int get_a64_user_mem_index(DisasContext *s, bool unpriv) g_assert_not_reached(); } } - return arm_to_core_mmu_idx(useridx); + return useridx; +} + +/* Return the core mmu_idx per above. */ +static int core_a64_user_mem_index(DisasContext *s, bool unpriv) +{ + return arm_to_core_mmu_idx(full_a64_user_mem_index(s, unpriv)); +} + +/* For a given translation regime, return the core mmu_idx for gcs access. */ +static int core_gcs_mem_index(ARMMMUIdx armidx) +{ + return arm_to_core_mmu_idx(regime_to_gcs(armidx)); } static void set_btype_raw(int val) @@ -408,6 +432,39 @@ static MemOp check_ordered_align(DisasContext *s, int rn, int imm, return finalize_memop(s, mop); } +static void gen_add_gcs_record(DisasContext *s, TCGv_i64 value) +{ + TCGv_i64 addr = tcg_temp_new_i64(); + TCGv_i64 gcspr = cpu_gcspr[s->current_el]; + int mmuidx = core_gcs_mem_index(s->mmu_idx); + MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN); + + tcg_gen_addi_i64(addr, gcspr, -8); + tcg_gen_qemu_st_i64(value, clean_data_tbi(s, addr), mmuidx, mop); + tcg_gen_mov_i64(gcspr, addr); +} + +static void gen_load_check_gcs_record(DisasContext *s, TCGv_i64 target, + GCSInstructionType it, int rt) +{ + TCGv_i64 gcspr = cpu_gcspr[s->current_el]; + int mmuidx = core_gcs_mem_index(s->mmu_idx); + MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN); + TCGv_i64 rec_va = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(rec_va, clean_data_tbi(s, gcspr), mmuidx, mop); + + if (s->gcs_rvcen) { + TCGLabel *fail_label = + delay_exception(s, EXCP_UDEF, syn_gcs_data_check(it, rt)); + + tcg_gen_brcond_i64(TCG_COND_NE, rec_va, target, fail_label); + } + + gen_a64_set_pc(s, rec_va); + tcg_gen_addi_i64(gcspr, gcspr, 8); +} + typedef struct DisasCompare64 { TCGCond cond; TCGv_i64 value; @@ -1642,7 +1699,14 @@ static bool trans_B(DisasContext *s, arg_i *a) static bool trans_BL(DisasContext *s, arg_i *a) { - gen_pc_plus_diff(s, cpu_reg(s, 30), curr_insn_len(s)); + TCGv_i64 link = tcg_temp_new_i64(); + + gen_pc_plus_diff(s, link, 4); + if (s->gcs_en) { + gen_add_gcs_record(s, link); + } + tcg_gen_mov_i64(cpu_reg(s, 30), link); + reset_btype(s); gen_goto_tb(s, 0, a->imm); return true; @@ -1739,15 +1803,15 @@ static bool trans_BR(DisasContext *s, arg_r *a) static bool trans_BLR(DisasContext *s, arg_r *a) { - TCGv_i64 dst = cpu_reg(s, a->rn); - TCGv_i64 lr = cpu_reg(s, 30); - if (dst == lr) { - TCGv_i64 tmp = tcg_temp_new_i64(); - tcg_gen_mov_i64(tmp, dst); - dst = tmp; + TCGv_i64 link = tcg_temp_new_i64(); + + gen_pc_plus_diff(s, link, 4); + if (s->gcs_en) { + gen_add_gcs_record(s, link); } - gen_pc_plus_diff(s, lr, curr_insn_len(s)); - gen_a64_set_pc(s, dst); + gen_a64_set_pc(s, cpu_reg(s, a->rn)); + tcg_gen_mov_i64(cpu_reg(s, 30), link); + set_btype_for_blr(s); s->base.is_jmp = DISAS_JUMP; return true; @@ -1755,7 +1819,13 @@ static bool trans_BLR(DisasContext *s, arg_r *a) static bool trans_RET(DisasContext *s, arg_r *a) { - gen_a64_set_pc(s, cpu_reg(s, a->rn)); + TCGv_i64 target = cpu_reg(s, a->rn); + + if (s->gcs_en) { + gen_load_check_gcs_record(s, target, GCS_IT_RET_nPauth, a->rn); + } else { + gen_a64_set_pc(s, target); + } s->base.is_jmp = DISAS_JUMP; return true; } @@ -1799,21 +1869,21 @@ static bool trans_BRAZ(DisasContext *s, arg_braz *a) static bool trans_BLRAZ(DisasContext *s, arg_braz *a) { - TCGv_i64 dst, lr; + TCGv_i64 
dst, link; if (!dc_isar_feature(aa64_pauth, s)) { return false; } - dst = auth_branch_target(s, cpu_reg(s, a->rn), tcg_constant_i64(0), !a->m); - lr = cpu_reg(s, 30); - if (dst == lr) { - TCGv_i64 tmp = tcg_temp_new_i64(); - tcg_gen_mov_i64(tmp, dst); - dst = tmp; + + link = tcg_temp_new_i64(); + gen_pc_plus_diff(s, link, 4); + if (s->gcs_en) { + gen_add_gcs_record(s, link); } - gen_pc_plus_diff(s, lr, curr_insn_len(s)); gen_a64_set_pc(s, dst); + tcg_gen_mov_i64(cpu_reg(s, 30), link); + set_btype_for_blr(s); s->base.is_jmp = DISAS_JUMP; return true; @@ -1828,7 +1898,12 @@ static bool trans_RETA(DisasContext *s, arg_reta *a) } dst = auth_branch_target(s, cpu_reg(s, 30), cpu_X[31], !a->m); - gen_a64_set_pc(s, dst); + if (s->gcs_en) { + GCSInstructionType it = a->m ? GCS_IT_RET_PauthB : GCS_IT_RET_PauthA; + gen_load_check_gcs_record(s, dst, it, 30); + } else { + gen_a64_set_pc(s, dst); + } s->base.is_jmp = DISAS_JUMP; return true; } @@ -1849,20 +1924,21 @@ static bool trans_BRA(DisasContext *s, arg_bra *a) static bool trans_BLRA(DisasContext *s, arg_bra *a) { - TCGv_i64 dst, lr; + TCGv_i64 dst, link; if (!dc_isar_feature(aa64_pauth, s)) { return false; } dst = auth_branch_target(s, cpu_reg(s, a->rn), cpu_reg_sp(s, a->rm), !a->m); - lr = cpu_reg(s, 30); - if (dst == lr) { - TCGv_i64 tmp = tcg_temp_new_i64(); - tcg_gen_mov_i64(tmp, dst); - dst = tmp; + + link = tcg_temp_new_i64(); + gen_pc_plus_diff(s, link, 4); + if (s->gcs_en) { + gen_add_gcs_record(s, link); } - gen_pc_plus_diff(s, lr, curr_insn_len(s)); gen_a64_set_pc(s, dst); + tcg_gen_mov_i64(cpu_reg(s, 30), link); + set_btype_for_blr(s); s->base.is_jmp = DISAS_JUMP; return true; @@ -1870,6 +1946,9 @@ static bool trans_BLRA(DisasContext *s, arg_bra *a) static bool trans_ERET(DisasContext *s, arg_ERET *a) { +#ifdef CONFIG_USER_ONLY + return false; +#else TCGv_i64 dst; if (s->current_el == 0) { @@ -1889,10 +1968,14 @@ static bool trans_ERET(DisasContext *s, arg_ERET *a) /* Must exit loop to check un-masked IRQs */ s->base.is_jmp = DISAS_EXIT; return true; +#endif } static bool trans_ERETA(DisasContext *s, arg_reta *a) { +#ifdef CONFIG_USER_ONLY + return false; +#else TCGv_i64 dst; if (!dc_isar_feature(aa64_pauth, s)) { @@ -1918,6 +2001,7 @@ static bool trans_ERETA(DisasContext *s, arg_reta *a) /* Must exit loop to check un-masked IRQs */ s->base.is_jmp = DISAS_EXIT; return true; +#endif } static bool trans_NOP(DisasContext *s, arg_NOP *a) @@ -2060,6 +2144,14 @@ static bool trans_ESB(DisasContext *s, arg_ESB *a) return true; } +static bool trans_GCSB(DisasContext *s, arg_GCSB *a) +{ + if (dc_isar_feature(aa64_gcs, s)) { + tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL); + } + return true; +} + static bool trans_PACIAZ(DisasContext *s, arg_PACIAZ *a) { if (s->pauth_active) { @@ -2124,6 +2216,20 @@ static bool trans_AUTIBSP(DisasContext *s, arg_AUTIBSP *a) return true; } +static bool trans_CHKFEAT(DisasContext *s, arg_CHKFEAT *a) +{ + uint64_t feat_en = 0; + + if (s->gcs_en) { + feat_en |= 1 << 0; + } + if (feat_en) { + TCGv_i64 x16 = cpu_reg(s, 16); + tcg_gen_andi_i64(x16, x16, ~feat_en); + } + return true; +} + static bool trans_CLREX(DisasContext *s, arg_CLREX *a) { tcg_gen_movi_i64(cpu_exclusive_addr, -1); @@ -2455,6 +2561,182 @@ static void gen_sysreg_undef(DisasContext *s, bool isread, gen_exception_insn(s, 0, EXCP_UDEF, syndrome); } +static void gen_gcspopm(DisasContext *s, int rt) +{ + TCGv_i64 gcspr = cpu_gcspr[s->current_el]; + int mmuidx = core_gcs_mem_index(s->mmu_idx); + MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN); + TCGv_i64 value = 
tcg_temp_new_i64(); + TCGLabel *fail_label = + delay_exception(s, EXCP_UDEF, syn_gcs_data_check(GCS_IT_GCSPOPM, rt)); + + /* The value at top-of-stack must have low 2 bits clear. */ + tcg_gen_qemu_ld_i64(value, clean_data_tbi(s, gcspr), mmuidx, mop); + tcg_gen_brcondi_i64(TCG_COND_TSTNE, value, 3, fail_label); + + /* Complete the pop and return the value. */ + tcg_gen_addi_i64(gcspr, gcspr, 8); + tcg_gen_mov_i64(cpu_reg(s, rt), value); +} + +static void gen_gcspushx(DisasContext *s) +{ + TCGv_i64 gcspr = cpu_gcspr[s->current_el]; + int spsr_idx = aarch64_banked_spsr_index(s->current_el); + int spsr_off = offsetof(CPUARMState, banked_spsr[spsr_idx]); + int elr_off = offsetof(CPUARMState, elr_el[s->current_el]); + int mmuidx = core_gcs_mem_index(s->mmu_idx); + MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN); + TCGv_i64 addr = tcg_temp_new_i64(); + TCGv_i64 tmp = tcg_temp_new_i64(); + + tcg_gen_addi_i64(addr, gcspr, -8); + tcg_gen_qemu_st_i64(cpu_reg(s, 30), addr, mmuidx, mop); + + tcg_gen_ld_i64(tmp, tcg_env, spsr_off); + tcg_gen_addi_i64(addr, addr, -8); + tcg_gen_qemu_st_i64(tmp, addr, mmuidx, mop); + + tcg_gen_ld_i64(tmp, tcg_env, elr_off); + tcg_gen_addi_i64(addr, addr, -8); + tcg_gen_qemu_st_i64(tmp, addr, mmuidx, mop); + + tcg_gen_addi_i64(addr, addr, -8); + tcg_gen_qemu_st_i64(tcg_constant_i64(0b1001), addr, mmuidx, mop); + + tcg_gen_mov_i64(gcspr, addr); + clear_pstate_bits(PSTATE_EXLOCK); +} + +static void gen_gcspopcx(DisasContext *s) +{ + TCGv_i64 gcspr = cpu_gcspr[s->current_el]; + int spsr_idx = aarch64_banked_spsr_index(s->current_el); + int spsr_off = offsetof(CPUARMState, banked_spsr[spsr_idx]); + int elr_off = offsetof(CPUARMState, elr_el[s->current_el]); + int gcscr_off = offsetof(CPUARMState, cp15.gcscr_el[s->current_el]); + int pstate_off = offsetof(CPUARMState, pstate); + int mmuidx = core_gcs_mem_index(s->mmu_idx); + MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN); + TCGv_i64 addr = tcg_temp_new_i64(); + TCGv_i64 tmp1 = tcg_temp_new_i64(); + TCGv_i64 tmp2 = tcg_temp_new_i64(); + TCGLabel *fail_label = + delay_exception(s, EXCP_UDEF, syn_gcs_data_check(GCS_IT_GCSPOPCX, 31)); + + /* The value at top-of-stack must be an exception token. */ + tcg_gen_qemu_ld_i64(tmp1, gcspr, mmuidx, mop); + tcg_gen_brcondi_i64(TCG_COND_NE, tmp1, 0b1001, fail_label); + + /* Validate in turn, ELR ... */ + tcg_gen_addi_i64(addr, gcspr, 8); + tcg_gen_qemu_ld_i64(tmp1, addr, mmuidx, mop); + tcg_gen_ld_i64(tmp2, tcg_env, elr_off); + tcg_gen_brcond_i64(TCG_COND_NE, tmp1, tmp2, fail_label); + + /* ... SPSR ... */ + tcg_gen_addi_i64(addr, addr, 8); + tcg_gen_qemu_ld_i64(tmp1, addr, mmuidx, mop); + tcg_gen_ld_i64(tmp2, tcg_env, spsr_off); + tcg_gen_brcond_i64(TCG_COND_NE, tmp1, tmp2, fail_label); + + /* ... and LR. */ + tcg_gen_addi_i64(addr, addr, 8); + tcg_gen_qemu_ld_i64(tmp1, addr, mmuidx, mop); + tcg_gen_brcond_i64(TCG_COND_NE, tmp1, cpu_reg(s, 30), fail_label); + + /* Writeback stack pointer after pop. */ + tcg_gen_addi_i64(gcspr, addr, 8); + + /* PSTATE.EXLOCK = GetCurrentEXLOCKEN(). 
 */
+    tcg_gen_ld_i64(tmp1, tcg_env, gcscr_off);
+    tcg_gen_ld_i64(tmp2, tcg_env, pstate_off);
+    tcg_gen_shri_i64(tmp1, tmp1, ctz64(GCSCR_EXLOCKEN));
+    tcg_gen_deposit_i64(tmp2, tmp2, tmp1, ctz64(PSTATE_EXLOCK), 1);
+    tcg_gen_st_i64(tmp2, tcg_env, pstate_off);
+}
+
+static void gen_gcspopx(DisasContext *s)
+{
+    TCGv_i64 gcspr = cpu_gcspr[s->current_el];
+    int mmuidx = core_gcs_mem_index(s->mmu_idx);
+    MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN);
+    TCGv_i64 addr = tcg_temp_new_i64();
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGLabel *fail_label =
+        delay_exception(s, EXCP_UDEF, syn_gcs_data_check(GCS_IT_GCSPOPX, 31));
+
+    /* The value at top-of-stack must be an exception token. */
+    tcg_gen_qemu_ld_i64(tmp, gcspr, mmuidx, mop);
+    tcg_gen_brcondi_i64(TCG_COND_NE, tmp, 0b1001, fail_label);
+
+    /*
+     * The other three values in the exception return record
+     * are ignored, but are loaded anyway to raise faults.
+     */
+    tcg_gen_addi_i64(addr, gcspr, 8);
+    tcg_gen_qemu_ld_i64(tmp, addr, mmuidx, mop);
+    tcg_gen_addi_i64(addr, addr, 8);
+    tcg_gen_qemu_ld_i64(tmp, addr, mmuidx, mop);
+    tcg_gen_addi_i64(addr, addr, 8);
+    tcg_gen_qemu_ld_i64(tmp, addr, mmuidx, mop);
+    tcg_gen_addi_i64(gcspr, addr, 8);
+}
+
+static void gen_gcsss1(DisasContext *s, int rt)
+{
+    TCGv_i64 gcspr = cpu_gcspr[s->current_el];
+    int mmuidx = core_gcs_mem_index(s->mmu_idx);
+    MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN);
+    TCGv_i64 inptr = cpu_reg(s, rt);
+    TCGv_i64 cmp = tcg_temp_new_i64();
+    TCGv_i64 new = tcg_temp_new_i64();
+    TCGv_i64 old = tcg_temp_new_i64();
+    TCGLabel *fail_label =
+        delay_exception(s, EXCP_UDEF, syn_gcs_data_check(GCS_IT_GCSSS1, rt));
+
+    /* Compute the valid cap entry that the new stack must have. */
+    tcg_gen_deposit_i64(cmp, inptr, tcg_constant_i64(1), 0, 12);
+    /* Compute the in-progress cap entry for the old stack. */
+    tcg_gen_deposit_i64(new, gcspr, tcg_constant_i64(5), 0, 3);
+
+    /* Swap the valid cap with the in-progress cap. */
+    tcg_gen_atomic_cmpxchg_i64(old, inptr, cmp, new, mmuidx, mop);
+    tcg_gen_brcond_i64(TCG_COND_NE, old, cmp, fail_label);
+
+    /* The new stack had a valid cap: change gcspr. */
+    tcg_gen_andi_i64(gcspr, inptr, ~7);
+}
+
+static void gen_gcsss2(DisasContext *s, int rt)
+{
+    TCGv_i64 gcspr = cpu_gcspr[s->current_el];
+    int mmuidx = core_gcs_mem_index(s->mmu_idx);
+    MemOp mop = finalize_memop(s, MO_64 | MO_ALIGN);
+    TCGv_i64 outptr = tcg_temp_new_i64();
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGLabel *fail_label =
+        delay_exception(s, EXCP_UDEF, syn_gcs_data_check(GCS_IT_GCSSS2, rt));
+
+    /* Validate that the new stack has an in-progress cap. */
+    tcg_gen_qemu_ld_i64(outptr, gcspr, mmuidx, mop);
+    tcg_gen_andi_i64(tmp, outptr, 7);
+    tcg_gen_brcondi_i64(TCG_COND_NE, tmp, 5, fail_label);
+
+    /* Push a valid cap to the old stack. */
+    tcg_gen_andi_i64(outptr, outptr, ~7);
+    tcg_gen_addi_i64(outptr, outptr, -8);
+    tcg_gen_deposit_i64(tmp, outptr, tcg_constant_i64(1), 0, 12);
+    tcg_gen_qemu_st_i64(tmp, outptr, mmuidx, mop);
+    tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
+
+    /* Pop the in-progress cap from the new stack. */
+    tcg_gen_addi_i64(gcspr, gcspr, 8);
+
+    /* Return a pointer to the old stack cap. */
+    tcg_gen_mov_i64(cpu_reg(s, rt), outptr);
+}
+
 /*
  * Look up @key, returning the cpreg, which must exist.
  * Additionally, the new cpreg must also be accessible.
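/*
 * Standalone sketch of the cap tokens manipulated by gen_gcsss1() and
 * gen_gcsss2() above; helper names are illustrative.  A "valid cap"
 * stores the stack pointer with the value 1 deposited in its low 12
 * bits, and an "in-progress cap" stores it with 5 in the low 3 bits,
 * matching the deposit operations in the generated code.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t gcs_valid_cap(uint64_t ptr)
{
    return (ptr & ~(uint64_t)0xfff) | 1;    /* deposit(ptr, 1, 0, 12) */
}

static uint64_t gcs_in_progress_cap(uint64_t ptr)
{
    return (ptr & ~(uint64_t)7) | 5;        /* deposit(ptr, 5, 0, 3) */
}

static bool gcs_is_in_progress_cap(uint64_t token)
{
    return (token & 7) == 5;                /* the check made by GCSSS2 */
}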
@@ -2761,6 +3043,51 @@ static void handle_sys(DisasContext *s, bool isread, } } return; + case ARM_CP_GCSPUSHM: + if (s->gcs_en) { + gen_add_gcs_record(s, cpu_reg(s, rt)); + } + return; + case ARM_CP_GCSPOPM: + /* Note that X[rt] is unchanged if !GCSEnabled. */ + if (s->gcs_en) { + gen_gcspopm(s, rt); + } + return; + case ARM_CP_GCSPUSHX: + /* Choose the CONSTRAINED UNPREDICTABLE for UNDEF. */ + if (rt != 31) { + unallocated_encoding(s); + } else if (s->gcs_en) { + gen_gcspushx(s); + } + return; + case ARM_CP_GCSPOPCX: + /* Choose the CONSTRAINED UNPREDICTABLE for UNDEF. */ + if (rt != 31) { + unallocated_encoding(s); + } else if (s->gcs_en) { + gen_gcspopcx(s); + } + return; + case ARM_CP_GCSPOPX: + /* Choose the CONSTRAINED UNPREDICTABLE for UNDEF. */ + if (rt != 31) { + unallocated_encoding(s); + } else if (s->gcs_en) { + gen_gcspopx(s); + } + return; + case ARM_CP_GCSSS1: + if (s->gcs_en) { + gen_gcsss1(s, rt); + } + return; + case ARM_CP_GCSSS2: + if (s->gcs_en) { + gen_gcsss2(s, rt); + } + return; default: g_assert_not_reached(); } @@ -3555,7 +3882,7 @@ static void op_addr_ldst_imm_pre(DisasContext *s, arg_ldst_imm *a, if (!a->p) { tcg_gen_addi_i64(*dirty_addr, *dirty_addr, offset); } - memidx = get_a64_user_mem_index(s, a->unpriv); + memidx = core_a64_user_mem_index(s, a->unpriv); *clean_addr = gen_mte_check1_mmuidx(s, *dirty_addr, is_store, a->w || a->rn != 31, mop, a->unpriv, memidx); @@ -3576,7 +3903,7 @@ static bool trans_STR_i(DisasContext *s, arg_ldst_imm *a) { bool iss_sf, iss_valid = !a->w; TCGv_i64 clean_addr, dirty_addr, tcg_rt; - int memidx = get_a64_user_mem_index(s, a->unpriv); + int memidx = core_a64_user_mem_index(s, a->unpriv); MemOp mop = finalize_memop(s, a->sz + a->sign * MO_SIGN); op_addr_ldst_imm_pre(s, a, &clean_addr, &dirty_addr, a->imm, true, mop); @@ -3594,7 +3921,7 @@ static bool trans_LDR_i(DisasContext *s, arg_ldst_imm *a) { bool iss_sf, iss_valid = !a->w; TCGv_i64 clean_addr, dirty_addr, tcg_rt; - int memidx = get_a64_user_mem_index(s, a->unpriv); + int memidx = core_a64_user_mem_index(s, a->unpriv); MemOp mop = finalize_memop(s, a->sz + a->sign * MO_SIGN); op_addr_ldst_imm_pre(s, a, &clean_addr, &dirty_addr, a->imm, false, mop); @@ -3961,6 +4288,42 @@ static bool trans_STLR_i(DisasContext *s, arg_ldapr_stlr_i *a) return true; } +static bool trans_GCSSTR(DisasContext *s, arg_GCSSTR *a) +{ + ARMMMUIdx armidx; + + if (!dc_isar_feature(aa64_gcs, s)) { + return false; + } + + /* + * The pseudocode for GCSSTTR is + * + * effective_el = AArch64.IsUnprivAccessPriv() ? PSTATE.EL : EL0; + * if (effective_el == PSTATE.EL) CheckGCSSTREnabled(); + * + * We have cached the result of IsUnprivAccessPriv in DisasContext, + * but since we need the result of full_a64_user_mem_index anyway, + * use the mmu_idx test as a proxy for the effective_el test. 
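/*
 * The effective-EL rule quoted in the comment above, reduced to a
 * pure function under the assumption that the cached gcsstr_el field
 * is 0 when GCSSTR is enabled; names and parameters are illustrative.
 */
#include <stdbool.h>

static bool gcsstr_traps(int pstate_el, bool unpriv, bool unpriv_is_priv,
                         int gcsstr_trap_el)
{
    int effective_el = (unpriv && !unpriv_is_priv) ? 0 : pstate_el;

    /* CheckGCSSTREnabled() applies only when no EL demotion happened. */
    return effective_el == pstate_el && gcsstr_trap_el != 0;
}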
+ */ + armidx = full_a64_user_mem_index(s, a->unpriv); + if (armidx == s->mmu_idx && s->gcsstr_el != 0) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_gcs_gcsstr(a->rn, a->rt), + s->gcsstr_el); + return true; + } + + if (a->rn == 31) { + gen_check_sp_alignment(s); + } + tcg_gen_qemu_st_i64(cpu_reg(s, a->rt), + clean_data_tbi(s, cpu_reg_sp(s, a->rn)), + core_gcs_mem_index(armidx), + finalize_memop(s, MO_64 | MO_ALIGN)); + return true; +} + static bool trans_LD_mult(DisasContext *s, arg_ldst_mult *a) { TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; @@ -4492,7 +4855,7 @@ static bool do_SET(DisasContext *s, arg_set *a, bool is_epilogue, return false; } - memidx = get_a64_user_mem_index(s, a->unpriv); + memidx = core_a64_user_mem_index(s, a->unpriv); /* * We pass option_a == true, matching our implementation; @@ -4546,8 +4909,8 @@ static bool do_CPY(DisasContext *s, arg_cpy *a, bool is_epilogue, CpyFn fn) return false; } - rmemidx = get_a64_user_mem_index(s, runpriv); - wmemidx = get_a64_user_mem_index(s, wunpriv); + rmemidx = core_a64_user_mem_index(s, runpriv); + wmemidx = core_a64_user_mem_index(s, wunpriv); /* * We pass option_a == true, matching our implementation; @@ -10344,6 +10707,9 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->nv2_mem_be = EX_TBFLAG_A64(tb_flags, NV2_MEM_BE); dc->fpcr_ah = EX_TBFLAG_A64(tb_flags, AH); dc->fpcr_nep = EX_TBFLAG_A64(tb_flags, NEP); + dc->gcs_en = EX_TBFLAG_A64(tb_flags, GCS_EN); + dc->gcs_rvcen = EX_TBFLAG_A64(tb_flags, GCS_RVCEN); + dc->gcsstr_el = EX_TBFLAG_A64(tb_flags, GCSSTR_EL); dc->vec_len = 0; dc->vec_stride = 0; dc->cp_regs = arm_cpu->cp_regs; @@ -10570,6 +10936,8 @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) break; } } + + emit_delayed_exceptions(dc); } const TranslatorOps aarch64_translator_ops = { diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index e62dcc5..3df0bbc 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -1088,6 +1088,57 @@ void gen_exception_insn(DisasContext *s, target_long pc_diff, s->base.is_jmp = DISAS_NORETURN; } +TCGLabel *delay_exception_el(DisasContext *s, int excp, + uint32_t syn, uint32_t target_el) +{ + /* Use tcg_malloc for automatic release on longjmp out of translation. */ + DisasDelayException *e = tcg_malloc(sizeof(DisasDelayException)); + + memset(e, 0, sizeof(*e)); + + /* Save enough of the current state to satisfy gen_exception_insn. */ + e->pc_curr = s->pc_curr; + e->pc_save = s->pc_save; + if (!s->aarch64) { + e->condexec_cond = s->condexec_cond; + e->condexec_mask = s->condexec_mask; + } + + e->excp = excp; + e->syn = syn; + e->target_el = target_el; + + e->next = s->delay_excp_list; + s->delay_excp_list = e; + + e->lab = gen_new_label(); + return e->lab; +} + +TCGLabel *delay_exception(DisasContext *s, int excp, uint32_t syn) +{ + return delay_exception_el(s, excp, syn, 0); +} + +void emit_delayed_exceptions(DisasContext *s) +{ + for (DisasDelayException *e = s->delay_excp_list; e ; e = e->next) { + gen_set_label(e->lab); + + /* Restore the insn state to satisfy gen_exception_insn. 
*/ + s->pc_curr = e->pc_curr; + s->pc_save = e->pc_save; + s->condexec_cond = e->condexec_cond; + s->condexec_mask = e->condexec_mask; + + if (e->target_el) { + gen_exception_insn_el(s, 0, e->excp, e->syn, e->target_el); + } else { + gen_exception_insn(s, 0, e->excp, e->syn); + } + } +} + static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn) { gen_set_condexec(s); @@ -1723,21 +1774,11 @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64, if (maskbit != 4 && maskbit != 14) { /* T4 and T14 are RES0 so never cause traps */ - TCGv_i32 t; - DisasLabel over = gen_disas_label(s); - - t = load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2)); - tcg_gen_andi_i32(t, t, 1u << maskbit); - tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); + TCGLabel *fail = delay_exception_el(s, EXCP_UDEF, syndrome, 2); + TCGv_i32 t = + load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2)); - gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); - /* - * gen_exception_insn() will set is_jmp to DISAS_NORETURN, - * but since we're conditionally branching over it, we want - * to assume continue-to-next-instruction. - */ - s->base.is_jmp = DISAS_NEXT; - set_disas_label(s, over); + tcg_gen_brcondi_i32(TCG_COND_TSTNE, t, 1u << maskbit, fail); } } @@ -5557,11 +5598,10 @@ static bool trans_LE(DisasContext *s, arg_LE *a) if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) { /* Need to do a runtime check for LTPSIZE != 4 */ - DisasLabel skipexc = gen_disas_label(s); + TCGLabel *fail = delay_exception(s, EXCP_INVSTATE, syn_uncategorized()); + tmp = load_cpu_field(v7m.ltpsize); - tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc.label); - gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); - set_disas_label(s, skipexc); + tcg_gen_brcondi_i32(TCG_COND_NE, tmp, 4, fail); } if (a->f) { @@ -6791,6 +6831,8 @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) gen_goto_tb(dc, 1, curr_insn_len(dc)); } } + + emit_delayed_exceptions(dc); } static const TranslatorOps arm_translator_ops = { diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index f1a6e5e..9a85ea7 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -21,9 +21,25 @@ typedef struct DisasLabel { target_ulong pc_save; } DisasLabel; +/* + * Emit an exception call out of line. + */ +typedef struct DisasDelayException { + struct DisasDelayException *next; + TCGLabel *lab; + target_long pc_curr; + target_long pc_save; + int condexec_mask; + int condexec_cond; + uint32_t excp; + uint32_t syn; + uint32_t target_el; +} DisasDelayException; + typedef struct DisasContext { DisasContextBase base; const ARMISARegisters *isar; + DisasDelayException *delay_excp_list; /* The address of the current instruction being translated. */ target_ulong pc_curr; @@ -166,6 +182,12 @@ typedef struct DisasContext { bool fpcr_ah; /* True if FPCR.NEP is 1 (FEAT_AFP scalar upper-element result handling) */ bool fpcr_nep; + /* True if GCSEnabled. */ + bool gcs_en; + /* True if GCSReturnValueCheckEnabled. */ + bool gcs_rvcen; + /* GCSSTR exception EL or 0 if enabled */ + uint8_t gcsstr_el; /* * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. * < 0, set by the current instruction. 
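/*
 * Usage sketch for the delayed-exception helpers declared above,
 * mirroring the GCS checks in translate-a64.c: request a label up
 * front, branch to it on the failing path, and let
 * emit_delayed_exceptions() emit the exception code once, out of
 * line, at the end of the TB.  The compared value 4 is arbitrary.
 */
static void example_runtime_check(DisasContext *s, TCGv_i32 val)
{
    TCGLabel *fail = delay_exception(s, EXCP_UDEF, syn_uncategorized());

    /* Fast path falls straight through; no local label is needed. */
    tcg_gen_brcondi_i32(TCG_COND_NE, val, 4, fail);
}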
@@ -359,6 +381,10 @@ void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, uint32_t syn, uint32_t target_el); void gen_exception_insn(DisasContext *s, target_long pc_diff, int excp, uint32_t syn); +TCGLabel *delay_exception_el(DisasContext *s, int excp, + uint32_t syn, uint32_t target_el); +TCGLabel *delay_exception(DisasContext *s, int excp, uint32_t syn); +void emit_delayed_exceptions(DisasContext *s); /* Return state of Alternate Half-precision flag, caller frees result */ static inline TCGv_i32 get_ahp_flag(void) @@ -372,27 +398,27 @@ static inline TCGv_i32 get_ahp_flag(void) } /* Set bits within PSTATE. */ -static inline void set_pstate_bits(uint32_t bits) +static inline void set_pstate_bits(uint64_t bits) { - TCGv_i32 p = tcg_temp_new_i32(); + TCGv_i64 p = tcg_temp_new_i64(); tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); - tcg_gen_ld_i32(p, tcg_env, offsetof(CPUARMState, pstate)); - tcg_gen_ori_i32(p, p, bits); - tcg_gen_st_i32(p, tcg_env, offsetof(CPUARMState, pstate)); + tcg_gen_ld_i64(p, tcg_env, offsetof(CPUARMState, pstate)); + tcg_gen_ori_i64(p, p, bits); + tcg_gen_st_i64(p, tcg_env, offsetof(CPUARMState, pstate)); } /* Clear bits within PSTATE. */ -static inline void clear_pstate_bits(uint32_t bits) +static inline void clear_pstate_bits(uint64_t bits) { - TCGv_i32 p = tcg_temp_new_i32(); + TCGv_i64 p = tcg_temp_new_i64(); tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); - tcg_gen_ld_i32(p, tcg_env, offsetof(CPUARMState, pstate)); - tcg_gen_andi_i32(p, p, ~bits); - tcg_gen_st_i32(p, tcg_env, offsetof(CPUARMState, pstate)); + tcg_gen_ld_i64(p, tcg_env, offsetof(CPUARMState, pstate)); + tcg_gen_andi_i64(p, p, ~bits); + tcg_gen_st_i64(p, tcg_env, offsetof(CPUARMState, pstate)); } /* If the singlestep state is Active-not-pending, advance to Active-pending. */ diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6d85149..ab18de8 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -7539,6 +7539,20 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) #endif break; + case FEAT_7_0_EDX: + /* + * Windows does not like ARCH_CAPABILITIES on AMD machines at all. + * Do not show the fake ARCH_CAPABILITIES MSR that KVM sets up, + * except if needed for migration. + * + * When arch_cap_always_on is removed, this tweak can move to + * kvm_arch_get_supported_cpuid. + */ + if (cpu && IS_AMD_CPU(&cpu->env) && !cpu->arch_cap_always_on) { + unavail = CPUID_7_0_EDX_ARCH_CAPABILITIES; + } + break; + default: break; } @@ -7894,6 +7908,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, /* Fixup overflow: max value for bits 23-16 is 255. 
*/ *ebx |= MIN(num, 255) << 16; } + if (cpu->pdcm_on_even_without_pmu) { + if (!cpu->enable_pmu) { + *ecx &= ~CPUID_EXT_PDCM; + } + } break; case 2: { /* cache info: needed for Pentium Pro compatibility */ const CPUCaches *caches; @@ -8944,9 +8963,11 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) } } - /* PDCM is fixed1 bit for TDX */ - if (!cpu->enable_pmu && !is_tdx_vm()) { - env->features[FEAT_1_ECX] &= ~CPUID_EXT_PDCM; + if (!cpu->pdcm_on_even_without_pmu) { + /* PDCM is fixed1 bit for TDX */ + if (!cpu->enable_pmu && !is_tdx_vm()) { + env->features[FEAT_1_ECX] &= ~CPUID_EXT_PDCM; + } } for (i = 0; i < ARRAY_SIZE(feature_dependencies); i++) { @@ -10004,6 +10025,11 @@ static const Property x86_cpu_properties[] = { true), DEFINE_PROP_BOOL("x-l1-cache-per-thread", X86CPU, l1_cache_per_core, true), DEFINE_PROP_BOOL("x-force-cpuid-0x1f", X86CPU, force_cpuid_0x1f, false), + + DEFINE_PROP_BOOL("x-arch-cap-always-on", X86CPU, + arch_cap_always_on, false), + DEFINE_PROP_BOOL("x-pdcm-on-even-without-pmu", X86CPU, + pdcm_on_even_without_pmu, false), }; #ifndef CONFIG_USER_ONLY diff --git a/target/i386/cpu.h b/target/i386/cpu.h index e0be7a7..8b7c173 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -435,9 +435,11 @@ typedef enum X86Seg { #define MSR_SMI_COUNT 0x34 #define MSR_CORE_THREAD_COUNT 0x35 #define MSR_MTRRcap 0xfe +#define MSR_MTRR_MEM_TYPE_WB 0x06 #define MSR_MTRRcap_VCNT 8 #define MSR_MTRRcap_FIXRANGE_SUPPORT (1 << 8) #define MSR_MTRRcap_WC_SUPPORTED (1 << 10) +#define MSR_MTRR_ENABLE (1 << 11) #define MSR_IA32_SYSENTER_CS 0x174 #define MSR_IA32_SYSENTER_ESP 0x175 @@ -2126,7 +2128,7 @@ typedef struct CPUArchState { QEMUTimer *xen_periodic_timer; QemuMutex xen_timers_lock; #endif -#if defined(CONFIG_HVF) +#if defined(CONFIG_HVF) || defined(CONFIG_MSHV) void *emu_mmio_buf; #endif @@ -2314,6 +2316,18 @@ struct ArchCPU { /* Forcefully disable KVM PV features not exposed in guest CPUIDs */ bool kvm_pv_enforce_cpuid; + /* + * Expose arch-capabilities unconditionally even on AMD models, for backwards + * compatibility with QEMU <10.1. + */ + bool arch_cap_always_on; + + /* + * Backwards compatibility with QEMU <10.1. The PDCM feature is now disabled when + * PMU is not available, but prior to 10.1 it was enabled even if PMU is off. 
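/*
 * The AMD ARCH_CAPABILITIES gating from cpu.c above as a pure
 * function: the synthetic MSR bit is hidden from the supported
 * feature word on AMD models unless the pre-10.1 compat property is
 * set.  Flat parameters are illustrative.
 */
static uint64_t filter_arch_caps(uint64_t edx_features, bool is_amd,
                                 bool arch_cap_always_on)
{
    if (is_amd && !arch_cap_always_on) {
        edx_features &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
    }
    return edx_features;
}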
+ */ + bool pdcm_on_even_without_pmu; + /* Number of physical address bits supported */ uint32_t phys_bits; diff --git a/target/i386/emulate/meson.build b/target/i386/emulate/meson.build index 4edd4f4..b6dafb6 100644 --- a/target/i386/emulate/meson.build +++ b/target/i386/emulate/meson.build @@ -1,5 +1,8 @@ -i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files( +emulator_files = files( 'x86_decode.c', 'x86_emu.c', 'x86_flags.c', -)) +) + +i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: emulator_files) +i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_files) diff --git a/target/i386/emulate/x86_decode.c b/target/i386/emulate/x86_decode.c index 2eca398..97bd6f1 100644 --- a/target/i386/emulate/x86_decode.c +++ b/target/i386/emulate/x86_decode.c @@ -71,10 +71,16 @@ static inline uint64_t decode_bytes(CPUX86State *env, struct x86_decode *decode, VM_PANIC_EX("%s invalid size %d\n", __func__, size); break; } - target_ulong va = linear_rip(env_cpu(env), env->eip) + decode->len; - emul_ops->read_mem(env_cpu(env), &val, va, size); + + /* copy the bytes from the instruction stream, if available */ + if (decode->stream && decode->len + size <= decode->stream->len) { + memcpy(&val, decode->stream->bytes + decode->len, size); + } else { + target_ulong va = linear_rip(env_cpu(env), env->eip) + decode->len; + emul_ops->fetch_instruction(env_cpu(env), &val, va, size); + } decode->len += size; - + return val; } @@ -2076,9 +2082,10 @@ static void decode_opcodes(CPUX86State *env, struct x86_decode *decode) } } -uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) +static uint32_t decode_opcode(CPUX86State *env, struct x86_decode *decode) { memset(decode, 0, sizeof(*decode)); + decode_prefix(env, decode); set_addressing_size(env, decode); set_operand_size(env, decode); @@ -2088,6 +2095,18 @@ uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) return decode->len; } +uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) +{ + return decode_opcode(env, decode); +} + +uint32_t decode_instruction_stream(CPUX86State *env, struct x86_decode *decode, + struct x86_insn_stream *stream) +{ + decode->stream = stream; + return decode_opcode(env, decode); +} + void init_decoder(void) { int i; diff --git a/target/i386/emulate/x86_decode.h b/target/i386/emulate/x86_decode.h index 927645a..1cadf36 100644 --- a/target/i386/emulate/x86_decode.h +++ b/target/i386/emulate/x86_decode.h @@ -272,6 +272,11 @@ typedef struct x86_decode_op { }; } x86_decode_op; +typedef struct x86_insn_stream { + const uint8_t *bytes; + size_t len; +} x86_insn_stream; + typedef struct x86_decode { int len; uint8_t opcode[4]; @@ -298,11 +303,15 @@ typedef struct x86_decode { struct x86_modrm modrm; struct x86_decode_op op[4]; bool is_fpu; + + x86_insn_stream *stream; } x86_decode; uint64_t sign(uint64_t val, int size); uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode); +uint32_t decode_instruction_stream(CPUX86State *env, struct x86_decode *decode, + struct x86_insn_stream *stream); void *get_reg_ref(CPUX86State *env, int reg, int rex_present, int is_extended, int size); diff --git a/target/i386/emulate/x86_emu.c b/target/i386/emulate/x86_emu.c index db7a7f7..4409f7b 100644 --- a/target/i386/emulate/x86_emu.c +++ b/target/i386/emulate/x86_emu.c @@ -1246,7 +1246,8 @@ static void init_cmd_handler(void) bool exec_instruction(CPUX86State *env, struct x86_decode *ins) { if (!_cmd_handler[ins->cmd].handler) { - printf("Unimplemented handler (" 
TARGET_FMT_lx ") for %d (%x %x) \n", env->eip, + printf("Unimplemented handler (" TARGET_FMT_lx ") for %d (%x %x)\n", + env->eip, ins->cmd, ins->opcode[0], ins->opcode_len > 1 ? ins->opcode[1] : 0); env->eip += ins->len; diff --git a/target/i386/emulate/x86_emu.h b/target/i386/emulate/x86_emu.h index a1a9612..05686b1 100644 --- a/target/i386/emulate/x86_emu.h +++ b/target/i386/emulate/x86_emu.h @@ -24,6 +24,8 @@ #include "cpu.h" struct x86_emul_ops { + void (*fetch_instruction)(CPUState *cpu, void *data, target_ulong addr, + int bytes); void (*read_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes); void (*write_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes); void (*read_segment_descriptor)(CPUState *cpu, struct x86_segment_descriptor *desc, diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6a3a1c1..db40caa 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -503,12 +503,8 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts. * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is * returned by KVM_GET_MSR_INDEX_LIST. - * - * But also, because Windows does not like ARCH_CAPABILITIES on AMD - * mcahines at all, do not show the fake ARCH_CAPABILITIES MSR that - * KVM sets up. */ - if (!has_msr_arch_capabs || !(edx & CPUID_7_0_EDX_ARCH_CAPABILITIES)) { + if (!has_msr_arch_capabs) { ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; } } else if (function == 7 && index == 1 && reg == R_EAX) { diff --git a/target/i386/meson.build b/target/i386/meson.build index 092af34..89ba491 100644 --- a/target/i386/meson.build +++ b/target/i386/meson.build @@ -13,6 +13,7 @@ i386_ss.add(when: 'CONFIG_KVM', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_HVF', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_WHPX', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_NVMM', if_true: files('host-cpu.c')) +i386_ss.add(when: 'CONFIG_MSHV', if_true: files('host-cpu.c')) i386_system_ss = ss.source_set() i386_system_ss.add(files( @@ -34,6 +35,7 @@ subdir('nvmm') subdir('hvf') subdir('tcg') subdir('emulate') +subdir('mshv') target_arch += {'i386': i386_ss} target_system_arch += {'i386': i386_system_ss} diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build new file mode 100644 index 0000000..647e5da --- /dev/null +++ b/target/i386/mshv/meson.build @@ -0,0 +1,8 @@ +i386_mshv_ss = ss.source_set() + +i386_mshv_ss.add(files( + 'mshv-cpu.c', + 'x86.c', +)) + +i386_system_ss.add_all(when: 'CONFIG_MSHV', if_true: i386_mshv_ss) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c new file mode 100644 index 0000000..1f7b9cb --- /dev/null +++ b/target/i386/mshv/mshv-cpu.c @@ -0,0 +1,1763 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025
+ *
+ * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com>
+ *          Magnus Kulke <magnuskulke@microsoft.com>
+ *          Jinank Jain <jinankjain@microsoft.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/memalign.h"
+#include "qemu/typedefs.h"
+
+#include "system/mshv.h"
+#include "system/mshv_int.h"
+#include "system/address-spaces.h"
+#include "linux/mshv.h"
+#include "hw/hyperv/hvgdk.h"
+#include "hw/hyperv/hvgdk_mini.h"
+#include "hw/hyperv/hvhdk_mini.h"
+#include "hw/i386/apic_internal.h"
+
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "emulate/x86_flags.h"
+
+#include "trace-accel_mshv.h"
+#include "trace.h"
+
+#include <sys/ioctl.h>
+
+#define MAX_REGISTER_COUNT (MAX_CONST(ARRAY_SIZE(STANDARD_REGISTER_NAMES), \
+                            MAX_CONST(ARRAY_SIZE(SPECIAL_REGISTER_NAMES), \
+                                      ARRAY_SIZE(FPU_REGISTER_NAMES))))
+
+static enum hv_register_name STANDARD_REGISTER_NAMES[18] = {
+    HV_X64_REGISTER_RAX,
+    HV_X64_REGISTER_RBX,
+    HV_X64_REGISTER_RCX,
+    HV_X64_REGISTER_RDX,
+    HV_X64_REGISTER_RSI,
+    HV_X64_REGISTER_RDI,
+    HV_X64_REGISTER_RSP,
+    HV_X64_REGISTER_RBP,
+    HV_X64_REGISTER_R8,
+    HV_X64_REGISTER_R9,
+    HV_X64_REGISTER_R10,
+    HV_X64_REGISTER_R11,
+    HV_X64_REGISTER_R12,
+    HV_X64_REGISTER_R13,
+    HV_X64_REGISTER_R14,
+    HV_X64_REGISTER_R15,
+    HV_X64_REGISTER_RIP,
+    HV_X64_REGISTER_RFLAGS,
+};
+
+static enum hv_register_name SPECIAL_REGISTER_NAMES[17] = {
+    HV_X64_REGISTER_CS,
+    HV_X64_REGISTER_DS,
+    HV_X64_REGISTER_ES,
+    HV_X64_REGISTER_FS,
+    HV_X64_REGISTER_GS,
+    HV_X64_REGISTER_SS,
+    HV_X64_REGISTER_TR,
+    HV_X64_REGISTER_LDTR,
+    HV_X64_REGISTER_GDTR,
+    HV_X64_REGISTER_IDTR,
+    HV_X64_REGISTER_CR0,
+    HV_X64_REGISTER_CR2,
+    HV_X64_REGISTER_CR3,
+    HV_X64_REGISTER_CR4,
+    HV_X64_REGISTER_CR8,
+    HV_X64_REGISTER_EFER,
+    HV_X64_REGISTER_APIC_BASE,
+};
+
+static enum hv_register_name FPU_REGISTER_NAMES[26] = {
+    HV_X64_REGISTER_XMM0,
+    HV_X64_REGISTER_XMM1,
+    HV_X64_REGISTER_XMM2,
+    HV_X64_REGISTER_XMM3,
+    HV_X64_REGISTER_XMM4,
+    HV_X64_REGISTER_XMM5,
+    HV_X64_REGISTER_XMM6,
+    HV_X64_REGISTER_XMM7,
+    HV_X64_REGISTER_XMM8,
+    HV_X64_REGISTER_XMM9,
+    HV_X64_REGISTER_XMM10,
+    HV_X64_REGISTER_XMM11,
+    HV_X64_REGISTER_XMM12,
+    HV_X64_REGISTER_XMM13,
+    HV_X64_REGISTER_XMM14,
+    HV_X64_REGISTER_XMM15,
+    HV_X64_REGISTER_FP_MMX0,
+    HV_X64_REGISTER_FP_MMX1,
+    HV_X64_REGISTER_FP_MMX2,
+    HV_X64_REGISTER_FP_MMX3,
+    HV_X64_REGISTER_FP_MMX4,
+    HV_X64_REGISTER_FP_MMX5,
+    HV_X64_REGISTER_FP_MMX6,
+    HV_X64_REGISTER_FP_MMX7,
+    HV_X64_REGISTER_FP_CONTROL_STATUS,
+    HV_X64_REGISTER_XMM_CONTROL_STATUS,
+};
+
+static int translate_gva(const CPUState *cpu, uint64_t gva, uint64_t *gpa,
+                         uint64_t flags)
+{
+    int ret;
+    int cpu_fd = mshv_vcpufd(cpu);
+    int vp_index = cpu->cpu_index;
+
+    hv_input_translate_virtual_address in = { 0 };
+    hv_output_translate_virtual_address out = { 0 };
+    struct mshv_root_hvcall args = {0};
+    uint64_t gva_page = gva >> HV_HYP_PAGE_SHIFT;
+
+    in.vp_index = vp_index;
+    in.control_flags = flags;
+    in.gva_page = gva_page;
+
+    /* create the hvcall envelope */
+    args.code = HVCALL_TRANSLATE_VIRTUAL_ADDRESS;
+    args.in_sz = sizeof(in);
+    args.in_ptr = (uint64_t) &in;
+    args.out_sz = sizeof(out);
+    args.out_ptr = (uint64_t) &out;
+
+    /* perform the call */
+    ret = mshv_hvcall(cpu_fd, &args);
+    if (ret < 0) {
+        error_report("Failed to invoke gva->gpa translation");
+        return -errno;
+    }
+
+    if (out.translation_result.result_code != HV_TRANSLATE_GVA_SUCCESS) {
+        error_report("Failed to translate gva ("
TARGET_FMT_lx ") to gpa", gva); + return -1; + } + + *gpa = ((out.gpa_page << HV_HYP_PAGE_SHIFT) + | (gva & ~(uint64_t)HV_HYP_PAGE_MASK)); + + return 0; +} + +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + size_t in_sz, assocs_sz; + hv_input_set_vp_registers *in = cpu->accel->hvcall_args.input_page; + struct mshv_root_hvcall args = {0}; + int ret; + + /* find out the size of the struct w/ a flexible array at the tail */ + assocs_sz = n_regs * sizeof(hv_register_assoc); + in_sz = sizeof(hv_input_set_vp_registers) + assocs_sz; + + /* fill the input struct */ + memset(in, 0, sizeof(hv_input_set_vp_registers)); + in->vp_index = vp_index; + memcpy(in->elements, assocs, assocs_sz); + + /* create the hvcall envelope */ + args.code = HVCALL_SET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to set registers"); + return -1; + } + + /* assert we set all registers */ + if (args.reps != n_regs) { + error_report("Failed to set registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + return 0; +} + +static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + hv_input_get_vp_registers *in = cpu->accel->hvcall_args.input_page; + hv_register_value *values = cpu->accel->hvcall_args.output_page; + size_t in_sz, names_sz, values_sz; + int i, ret; + struct mshv_root_hvcall args = {0}; + + /* find out the size of the struct w/ a flexible array at the tail */ + names_sz = n_regs * sizeof(hv_register_name); + in_sz = sizeof(hv_input_get_vp_registers) + names_sz; + + /* fill the input struct */ + memset(in, 0, sizeof(hv_input_get_vp_registers)); + in->vp_index = vp_index; + for (i = 0; i < n_regs; i++) { + in->names[i] = assocs[i].name; + } + + /* determine size of value output buffer */ + values_sz = n_regs * sizeof(union hv_register_value); + + /* create the hvcall envelope */ + args.code = HVCALL_GET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.out_sz = values_sz; + args.out_ptr = (uint64_t) values; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to retrieve registers"); + return -1; + } + + /* assert we got all registers */ + if (args.reps != n_regs) { + error_report("Failed to retrieve registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + /* copy values into assoc */ + for (i = 0; i < n_regs; i++) { + assocs[i].value = values[i]; + } + + return 0; +} + +static int set_standard_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + /* set names */ + for (size_t i = 0; i < ARRAY_SIZE(STANDARD_REGISTER_NAMES); i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + assocs[0].value.reg64 = env->regs[R_EAX]; + assocs[1].value.reg64 = env->regs[R_EBX]; + assocs[2].value.reg64 = env->regs[R_ECX]; + assocs[3].value.reg64 = env->regs[R_EDX]; + assocs[4].value.reg64 = env->regs[R_ESI]; + assocs[5].value.reg64 = env->regs[R_EDI]; + assocs[6].value.reg64 = env->regs[R_ESP]; + 
assocs[7].value.reg64 = env->regs[R_EBP]; + assocs[8].value.reg64 = env->regs[R_R8]; + assocs[9].value.reg64 = env->regs[R_R9]; + assocs[10].value.reg64 = env->regs[R_R10]; + assocs[11].value.reg64 = env->regs[R_R11]; + assocs[12].value.reg64 = env->regs[R_R12]; + assocs[13].value.reg64 = env->regs[R_R13]; + assocs[14].value.reg64 = env->regs[R_R14]; + assocs[15].value.reg64 = env->regs[R_R15]; + assocs[16].value.reg64 = env->eip; + lflags_to_rflags(env); + assocs[17].value.reg64 = env->eflags; + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set standard registers"); + return -errno; + } + return 0; +} + +int mshv_store_regs(CPUState *cpu) +{ + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to store standard registers"); + return -1; + } + + return 0; +} + +static void populate_standard_regs(const hv_register_assoc *assocs, + CPUX86State *env) +{ + env->regs[R_EAX] = assocs[0].value.reg64; + env->regs[R_EBX] = assocs[1].value.reg64; + env->regs[R_ECX] = assocs[2].value.reg64; + env->regs[R_EDX] = assocs[3].value.reg64; + env->regs[R_ESI] = assocs[4].value.reg64; + env->regs[R_EDI] = assocs[5].value.reg64; + env->regs[R_ESP] = assocs[6].value.reg64; + env->regs[R_EBP] = assocs[7].value.reg64; + env->regs[R_R8] = assocs[8].value.reg64; + env->regs[R_R9] = assocs[9].value.reg64; + env->regs[R_R10] = assocs[10].value.reg64; + env->regs[R_R11] = assocs[11].value.reg64; + env->regs[R_R12] = assocs[12].value.reg64; + env->regs[R_R13] = assocs[13].value.reg64; + env->regs[R_R14] = assocs[14].value.reg64; + env->regs[R_R15] = assocs[15].value.reg64; + + env->eip = assocs[16].value.reg64; + env->eflags = assocs[17].value.reg64; + rflags_to_lflags(env); +} + +int mshv_get_standard_regs(CPUState *cpu) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + ret = get_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to get standard registers"); + return -1; + } + + populate_standard_regs(assocs, env); + return 0; +} + +static inline void populate_segment_reg(const hv_x64_segment_register *hv_seg, + SegmentCache *seg) +{ + memset(seg, 0, sizeof(SegmentCache)); + + seg->base = hv_seg->base; + seg->limit = hv_seg->limit; + seg->selector = hv_seg->selector; + + seg->flags = (hv_seg->segment_type << DESC_TYPE_SHIFT) + | (hv_seg->present * DESC_P_MASK) + | (hv_seg->descriptor_privilege_level << DESC_DPL_SHIFT) + | (hv_seg->_default << DESC_B_SHIFT) + | (hv_seg->non_system_segment * DESC_S_MASK) + | (hv_seg->_long << DESC_L_SHIFT) + | (hv_seg->granularity * DESC_G_MASK) + | (hv_seg->available * DESC_AVL_MASK); + +} + +static inline void populate_table_reg(const hv_x64_table_register *hv_seg, + SegmentCache *tbl) +{ + memset(tbl, 0, sizeof(SegmentCache)); + + tbl->base = hv_seg->base; + tbl->limit = hv_seg->limit; +} + +static void populate_special_regs(const hv_register_assoc *assocs, + X86CPU *x86cpu) +{ + CPUX86State *env = &x86cpu->env; + + populate_segment_reg(&assocs[0].value.segment, &env->segs[R_CS]); + populate_segment_reg(&assocs[1].value.segment, &env->segs[R_DS]); + populate_segment_reg(&assocs[2].value.segment, &env->segs[R_ES]); + populate_segment_reg(&assocs[3].value.segment, &env->segs[R_FS]); + populate_segment_reg(&assocs[4].value.segment, 
&env->segs[R_GS]);
+    populate_segment_reg(&assocs[5].value.segment, &env->segs[R_SS]);
+
+    populate_segment_reg(&assocs[6].value.segment, &env->tr);
+    populate_segment_reg(&assocs[7].value.segment, &env->ldt);
+
+    populate_table_reg(&assocs[8].value.table, &env->gdt);
+    populate_table_reg(&assocs[9].value.table, &env->idt);
+
+    env->cr[0] = assocs[10].value.reg64;
+    env->cr[2] = assocs[11].value.reg64;
+    env->cr[3] = assocs[12].value.reg64;
+    env->cr[4] = assocs[13].value.reg64;
+
+    cpu_set_apic_tpr(x86cpu->apic_state, assocs[14].value.reg64);
+    env->efer = assocs[15].value.reg64;
+    cpu_set_apic_base(x86cpu->apic_state, assocs[16].value.reg64);
+}
+
+
+int mshv_get_special_regs(CPUState *cpu)
+{
+    struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)];
+    int ret;
+    X86CPU *x86cpu = X86_CPU(cpu);
+    size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES);
+
+    for (size_t i = 0; i < n_regs; i++) {
+        assocs[i].name = SPECIAL_REGISTER_NAMES[i];
+    }
+    ret = get_generic_regs(cpu, assocs, n_regs);
+    if (ret < 0) {
+        error_report("failed to get special registers");
+        return -errno;
+    }
+
+    populate_special_regs(assocs, x86cpu);
+    return 0;
+}
+
+int mshv_load_regs(CPUState *cpu)
+{
+    int ret;
+
+    ret = mshv_get_standard_regs(cpu);
+    if (ret < 0) {
+        error_report("Failed to load standard registers");
+        return -1;
+    }
+
+    ret = mshv_get_special_regs(cpu);
+    if (ret < 0) {
+        error_report("Failed to load special registers");
+        return -1;
+    }
+
+    return 0;
+}
+
+static GList *add_cpuid_entry(GList *cpuid_entries,
+                              uint32_t function, uint32_t index,
+                              uint32_t eax, uint32_t ebx,
+                              uint32_t ecx, uint32_t edx)
+{
+    struct hv_cpuid_entry *entry;
+
+    entry = g_malloc0(sizeof(struct hv_cpuid_entry));
+    entry->function = function;
+    entry->index = index;
+    entry->eax = eax;
+    entry->ebx = ebx;
+    entry->ecx = ecx;
+    entry->edx = edx;
+
+    /* return the new list head; appending to NULL creates the list */
+    return g_list_append(cpuid_entries, entry);
+}
+
+static GList *collect_cpuid_entries(const CPUState *cpu, GList *cpuid_entries)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint32_t eax, ebx, ecx, edx;
+    uint32_t leaf, subleaf;
+    size_t max_leaf = 0x1F;
+    size_t max_subleaf = 0x20;
+
+    uint32_t leaves_with_subleaves[] = {0x4, 0x7, 0xD, 0xF, 0x10};
+    int n_subleaf_leaves = ARRAY_SIZE(leaves_with_subleaves);
+
+    /* Regular leaves without subleaves */
+    for (leaf = 0; leaf <= max_leaf; leaf++) {
+        bool has_subleaves = false;
+        for (int i = 0; i < n_subleaf_leaves; i++) {
+            if (leaf == leaves_with_subleaves[i]) {
+                has_subleaves = true;
+                break;
+            }
+        }
+
+        if (!has_subleaves) {
+            cpu_x86_cpuid(env, leaf, 0, &eax, &ebx, &ecx, &edx);
+            if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) {
+                /* all zeroes indicates no more leaves */
+                continue;
+            }
+
+            cpuid_entries = add_cpuid_entry(cpuid_entries, leaf, 0,
+                                            eax, ebx, ecx, edx);
+            continue;
+        }
+
+        subleaf = 0;
+        while (subleaf < max_subleaf) {
+            cpu_x86_cpuid(env, leaf, subleaf, &eax, &ebx, &ecx, &edx);
+
+            if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) {
+                /* all zeroes indicates no more subleaves */
+                break;
+            }
+            cpuid_entries = add_cpuid_entry(cpuid_entries, leaf, subleaf,
+                                            eax, ebx, ecx, edx);
+            subleaf++;
+        }
+    }
+
+    return cpuid_entries;
+}
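+
+/*
+ * As the hypercall name HVCALL_REGISTER_INTERCEPT_RESULT suggests, the CPUID
+ * values collected above are pre-registered with the hypervisor, which can
+ * then presumably complete most guest CPUID intercepts without taking an
+ * exit into QEMU.
+ */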
+static int register_intercept_result_cpuid_entry(const CPUState *cpu,
+                                                 uint8_t subleaf_specific,
+                                                 uint8_t always_override,
+                                                 struct hv_cpuid_entry *entry)
+{
+    int ret;
+    int vp_index = cpu->cpu_index;
+    int cpu_fd = mshv_vcpufd(cpu);
+
+    struct hv_register_x64_cpuid_result_parameters cpuid_params = {
+        .input.eax = entry->function,
+        .input.ecx = entry->index,
+        .input.subleaf_specific = subleaf_specific,
+        .input.always_override = always_override,
+        .input.padding = 0,
+        /*
+         * The masks specify which bits are to be overwritten. The
+         * hv_cpuid_entry structure cannot carry masks in addition to the
+         * actual register values, so the masks are set to the exact values
+         * of the corresponding register bits that are registered for an
+         * overwrite. To view the resulting values the hypervisor would
+         * return, the HvCallGetVpCpuidValues hypercall can be used.
+         */
+        .result.eax = entry->eax,
+        .result.eax_mask = entry->eax,
+        .result.ebx = entry->ebx,
+        .result.ebx_mask = entry->ebx,
+        .result.ecx = entry->ecx,
+        .result.ecx_mask = entry->ecx,
+        .result.edx = entry->edx,
+        .result.edx_mask = entry->edx,
+    };
+    union hv_register_intercept_result_parameters parameters = {
+        .cpuid = cpuid_params,
+    };
+
+    hv_input_register_intercept_result in = {0};
+    in.vp_index = vp_index;
+    in.intercept_type = HV_INTERCEPT_TYPE_X64_CPUID;
+    in.parameters = parameters;
+
+    struct mshv_root_hvcall args = {0};
+    args.code = HVCALL_REGISTER_INTERCEPT_RESULT;
+    args.in_sz = sizeof(in);
+    args.in_ptr = (uint64_t)&in;
+
+    ret = mshv_hvcall(cpu_fd, &args);
+    if (ret < 0) {
+        error_report("failed to register intercept result for cpuid");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int register_intercept_result_cpuid(const CPUState *cpu,
+                                           struct hv_cpuid *cpuid)
+{
+    int ret = 0, entry_ret;
+    struct hv_cpuid_entry *entry;
+    uint8_t subleaf_specific, always_override;
+
+    for (size_t i = 0; i < cpuid->nent; i++) {
+        entry = &cpuid->entries[i];
+
+        /* set defaults */
+        subleaf_specific = 0;
+        always_override = 1;
+
+        /* Intel */
+        /* 0xb - Extended Topology Enumeration Leaf */
+        /* 0x1f - V2 Extended Topology Enumeration Leaf */
+        /* AMD */
+        /* 0x8000_001e - Processor Topology Information */
+        /* 0x8000_0026 - Extended CPU Topology */
+        if (entry->function == 0xb
+            || entry->function == 0x1f
+            || entry->function == 0x8000001e
+            || entry->function == 0x80000026) {
+            subleaf_specific = 1;
+            always_override = 1;
+        } else if (entry->function == 0x00000001
+            || entry->function == 0x80000000
+            || entry->function == 0x80000001
+            || entry->function == 0x80000008) {
+            subleaf_specific = 0;
+            always_override = 1;
+        }
+
+        entry_ret = register_intercept_result_cpuid_entry(cpu, subleaf_specific,
+                                                          always_override,
+                                                          entry);
+        if ((entry_ret < 0) && (ret == 0)) {
+            ret = entry_ret;
+        }
+    }
+
+    return ret;
+}
+
+static int set_cpuid2(const CPUState *cpu)
+{
+    int ret;
+    size_t n_entries, cpuid_size;
+    struct hv_cpuid *cpuid;
+    struct hv_cpuid_entry *entry;
+    GList *entries = NULL;
+
+    entries = collect_cpuid_entries(cpu, entries);
+    n_entries = g_list_length(entries);
+
+    cpuid_size = sizeof(struct hv_cpuid)
+        + n_entries * sizeof(struct hv_cpuid_entry);
+
+    cpuid = g_malloc0(cpuid_size);
+    cpuid->nent = n_entries;
+    cpuid->padding = 0;
+
+    for (size_t i = 0; i < n_entries; i++) {
+        entry = g_list_nth_data(entries, i);
+        cpuid->entries[i] = *entry;
+        g_free(entry);
+    }
+    g_list_free(entries);
+
+    ret = register_intercept_result_cpuid(cpu, cpuid);
+    g_free(cpuid);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
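+
+/*
+ * SegmentCache.flags stores segment attributes in the x86 descriptor layout
+ * (DESC_*_SHIFT / DESC_*_MASK). The helper below unpacks them into the
+ * discrete bitfields of hv_x64_segment_register; populate_segment_reg()
+ * above performs the inverse conversion.
+ */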
+static inline void populate_hv_segment_reg(SegmentCache *seg,
+                                           hv_x64_segment_register *hv_reg)
+{
+    uint32_t flags = seg->flags;
+
+    hv_reg->base = seg->base;
+    hv_reg->limit = seg->limit;
+    hv_reg->selector = seg->selector;
+    hv_reg->segment_type = (flags >> DESC_TYPE_SHIFT) & 0xF;
+    hv_reg->non_system_segment = (flags & DESC_S_MASK) != 0;
+    hv_reg->descriptor_privilege_level = (flags >> DESC_DPL_SHIFT) & 0x3;
+    hv_reg->present = (flags & DESC_P_MASK) != 0;
+    hv_reg->reserved = 0;
+    hv_reg->available = (flags & DESC_AVL_MASK) != 0;
+    hv_reg->_long = (flags >> DESC_L_SHIFT) & 0x1;
+    hv_reg->_default = (flags >> DESC_B_SHIFT) & 0x1;
+    hv_reg->granularity = (flags & DESC_G_MASK) != 0;
+}
+
+static inline void populate_hv_table_reg(const struct SegmentCache *seg,
+                                         hv_x64_table_register *hv_reg)
+{
+    memset(hv_reg, 0, sizeof(*hv_reg));
+
+    hv_reg->base = seg->base;
+    hv_reg->limit = seg->limit;
+}
+
+static int set_special_regs(const CPUState *cpu)
+{
+    X86CPU *x86cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86cpu->env;
+    struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)];
+    size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES);
+    int ret;
+
+    /* set names */
+    for (size_t i = 0; i < n_regs; i++) {
+        assocs[i].name = SPECIAL_REGISTER_NAMES[i];
+    }
+    populate_hv_segment_reg(&env->segs[R_CS], &assocs[0].value.segment);
+    populate_hv_segment_reg(&env->segs[R_DS], &assocs[1].value.segment);
+    populate_hv_segment_reg(&env->segs[R_ES], &assocs[2].value.segment);
+    populate_hv_segment_reg(&env->segs[R_FS], &assocs[3].value.segment);
+    populate_hv_segment_reg(&env->segs[R_GS], &assocs[4].value.segment);
+    populate_hv_segment_reg(&env->segs[R_SS], &assocs[5].value.segment);
+    populate_hv_segment_reg(&env->tr, &assocs[6].value.segment);
+    populate_hv_segment_reg(&env->ldt, &assocs[7].value.segment);
+
+    populate_hv_table_reg(&env->gdt, &assocs[8].value.table);
+    populate_hv_table_reg(&env->idt, &assocs[9].value.table);
+
+    assocs[10].value.reg64 = env->cr[0];
+    assocs[11].value.reg64 = env->cr[2];
+    assocs[12].value.reg64 = env->cr[3];
+    assocs[13].value.reg64 = env->cr[4];
+    assocs[14].value.reg64 = cpu_get_apic_tpr(x86cpu->apic_state);
+    assocs[15].value.reg64 = env->efer;
+    assocs[16].value.reg64 = cpu_get_apic_base(x86cpu->apic_state);
+
+    ret = mshv_set_generic_regs(cpu, assocs, n_regs);
+    if (ret < 0) {
+        error_report("failed to set special registers");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int set_fpu(const CPUState *cpu, const struct MshvFPU *regs)
+{
+    struct hv_register_assoc assocs[ARRAY_SIZE(FPU_REGISTER_NAMES)];
+    union hv_register_value *value;
+    size_t fp_i;
+    union hv_x64_fp_control_status_register *ctrl_status;
+    union hv_x64_xmm_control_status_register *xmm_ctrl_status;
+    int ret;
+    size_t n_regs = ARRAY_SIZE(FPU_REGISTER_NAMES);
+
+    /* first 16 registers are xmm0-xmm15 */
+    for (size_t i = 0; i < 16; i++) {
+        assocs[i].name = FPU_REGISTER_NAMES[i];
+        value = &assocs[i].value;
+        memcpy(&value->reg128, &regs->xmm[i], 16);
+    }
+
+    /* next 8 registers are fp_mmx0-fp_mmx7 */
+    for (size_t i = 16; i < 24; i++) {
+        assocs[i].name = FPU_REGISTER_NAMES[i];
+        fp_i = (i - 16);
+        value = &assocs[i].value;
+        memcpy(&value->reg128, &regs->fpr[fp_i], 16);
+    }
+
+    /* last two registers are fp_control_status and xmm_control_status */
+    assocs[24].name = FPU_REGISTER_NAMES[24];
+    value = &assocs[24].value;
+    ctrl_status = &value->fp_control_status;
+    ctrl_status->fp_control = regs->fcw;
+    ctrl_status->fp_status = regs->fsw;
+    ctrl_status->fp_tag = regs->ftwx;
+    ctrl_status->reserved = 0;
+    ctrl_status->last_fp_op = regs->last_opcode;
+    ctrl_status->last_fp_rip = regs->last_ip;
+
+    assocs[25].name = FPU_REGISTER_NAMES[25];
+    value = &assocs[25].value;
+    xmm_ctrl_status = &value->xmm_control_status;
+    xmm_ctrl_status->xmm_status_control = regs->mxcsr;
+    xmm_ctrl_status->xmm_status_control_mask = 0;
+    xmm_ctrl_status->last_fp_rdp = regs->last_dp;
+
+    ret =
mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set fpu registers"); + return -1; + } + + return 0; +} + +static int set_xc_reg(const CPUState *cpu, uint64_t xcr0) +{ + int ret; + struct hv_register_assoc assoc = { + .name = HV_X64_REGISTER_XFEM, + .value.reg64 = xcr0, + }; + + ret = mshv_set_generic_regs(cpu, &assoc, 1); + if (ret < 0) { + error_report("failed to set xcr0"); + return -errno; + } + return 0; +} + +static int set_cpu_state(const CPUState *cpu, const MshvFPU *fpu_regs, + uint64_t xcr0) +{ + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_special_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_fpu(cpu, fpu_regs); + if (ret < 0) { + return ret; + } + ret = set_xc_reg(cpu, xcr0); + if (ret < 0) { + return ret; + } + return 0; +} + +static int get_vp_state(int cpu_fd, struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_GET_VP_STATE, state); + if (ret < 0) { + error_report("failed to get partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int get_lapic(int cpu_fd, + struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = get_vp_state(cpu_fd, &mshv_state); + if (ret == 0) { + memcpy(state, buffer, sizeof(*state)); + } + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to get lapic"); + return -1; + } + + return 0; +} + +static uint32_t set_apic_delivery_mode(uint32_t reg, uint32_t mode) +{ + return ((reg) & ~0x700) | ((mode) << 8); +} + +static int set_vp_state(int cpu_fd, const struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_SET_VP_STATE, state); + if (ret < 0) { + error_report("failed to set partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lapic(int cpu_fd, + const struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + if (!state) { + error_report("lapic state is NULL"); + return -1; + } + memcpy(buffer, state, sizeof(*state)); + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = set_vp_state(cpu_fd, &mshv_state); + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to set lapic: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lint(int cpu_fd) +{ + int ret; + uint32_t *lvt_lint0, *lvt_lint1; + + struct hv_local_interrupt_controller_state lapic_state = { 0 }; + ret = get_lapic(cpu_fd, &lapic_state); + if (ret < 0) { + return ret; + } + + lvt_lint0 = &lapic_state.apic_lvt_lint0; + *lvt_lint0 = set_apic_delivery_mode(*lvt_lint0, APIC_DM_EXTINT); + + lvt_lint1 = &lapic_state.apic_lvt_lint1; + *lvt_lint1 = set_apic_delivery_mode(*lvt_lint1, APIC_DM_NMI); + + /* TODO: should we skip setting lapic if the values are the same? 
*/
+
+    return set_lapic(cpu_fd, &lapic_state);
+}
+
+static int setup_msrs(const CPUState *cpu)
+{
+    int ret;
+    uint64_t default_type = MSR_MTRR_ENABLE | MSR_MTRR_MEM_TYPE_WB;
+
+    /* boot msr entries */
+    MshvMsrEntry msrs[9] = {
+        { .index = IA32_MSR_SYSENTER_CS, .data = 0x0, },
+        { .index = IA32_MSR_SYSENTER_ESP, .data = 0x0, },
+        { .index = IA32_MSR_SYSENTER_EIP, .data = 0x0, },
+        { .index = IA32_MSR_STAR, .data = 0x0, },
+        { .index = IA32_MSR_CSTAR, .data = 0x0, },
+        { .index = IA32_MSR_LSTAR, .data = 0x0, },
+        { .index = IA32_MSR_KERNEL_GS_BASE, .data = 0x0, },
+        { .index = IA32_MSR_SFMASK, .data = 0x0, },
+        { .index = IA32_MSR_MTRR_DEF_TYPE, .data = default_type, },
+    };
+
+    ret = mshv_configure_msr(cpu, msrs, 9);
+    if (ret < 0) {
+        error_report("failed to setup msrs");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * TODO: populate topology info:
+ *
+ * X86CPU *x86cpu = X86_CPU(cpu);
+ * CPUX86State *env = &x86cpu->env;
+ * X86CPUTopoInfo *topo_info = &env->topo_info;
+ */
+int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu,
+                        uint64_t xcr0)
+{
+    int ret;
+    int cpu_fd = mshv_vcpufd(cpu);
+
+    ret = set_cpuid2(cpu);
+    if (ret < 0) {
+        error_report("failed to set cpuid");
+        return -1;
+    }
+
+    ret = setup_msrs(cpu);
+    if (ret < 0) {
+        error_report("failed to setup msrs");
+        return -1;
+    }
+
+    ret = set_cpu_state(cpu, fpu, xcr0);
+    if (ret < 0) {
+        error_report("failed to set cpu state");
+        return -1;
+    }
+
+    ret = set_lint(cpu_fd);
+    if (ret < 0) {
+        error_report("failed to set lint");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int put_regs(const CPUState *cpu)
+{
+    X86CPU *x86cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86cpu->env;
+    MshvFPU fpu = {0};
+    int ret;
+
+    ret = mshv_configure_vcpu(cpu, &fpu, env->xcr0);
+    if (ret < 0) {
+        error_report("failed to configure vcpu");
+        return ret;
+    }
+
+    return 0;
+}
+
+struct MsrPair {
+    uint32_t index;
+    uint64_t value;
+};
+
+static int put_msrs(const CPUState *cpu)
+{
+    int ret = 0;
+    X86CPU *x86cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86cpu->env;
+    MshvMsrEntries *msrs = g_malloc0(sizeof(MshvMsrEntries));
+
+    struct MsrPair pairs[] = {
+        { MSR_IA32_SYSENTER_CS, env->sysenter_cs },
+        { MSR_IA32_SYSENTER_ESP, env->sysenter_esp },
+        { MSR_IA32_SYSENTER_EIP, env->sysenter_eip },
+        { MSR_EFER, env->efer },
+        { MSR_PAT, env->pat },
+        { MSR_STAR, env->star },
+        { MSR_CSTAR, env->cstar },
+        { MSR_LSTAR, env->lstar },
+        { MSR_KERNELGSBASE, env->kernelgsbase },
+        { MSR_FMASK, env->fmask },
+        { MSR_MTRRdefType, env->mtrr_deftype },
+        { MSR_VM_HSAVE_PA, env->vm_hsave },
+        { MSR_SMI_COUNT, env->msr_smi_count },
+        { MSR_IA32_PKRS, env->pkrs },
+        { MSR_IA32_BNDCFGS, env->msr_bndcfgs },
+        { MSR_IA32_XSS, env->xss },
+        { MSR_IA32_UMWAIT_CONTROL, env->umwait },
+        { MSR_IA32_TSX_CTRL, env->tsx_ctrl },
+        { MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr },
+        { MSR_TSC_AUX, env->tsc_aux },
+        { MSR_TSC_ADJUST, env->tsc_adjust },
+        { MSR_IA32_SMBASE, env->smbase },
+        { MSR_IA32_SPEC_CTRL, env->spec_ctrl },
+        { MSR_VIRT_SSBD, env->virt_ssbd },
+    };
+
+    if (ARRAY_SIZE(pairs) > MSHV_MSR_ENTRIES_COUNT) {
+        error_report("MSR entries exceed maximum size");
+        g_free(msrs);
+        return -1;
+    }
+
+    for (size_t i = 0; i < ARRAY_SIZE(pairs); i++) {
+        MshvMsrEntry *entry = &msrs->entries[i];
+        entry->index = pairs[i].index;
+        entry->reserved = 0;
+        entry->data = pairs[i].value;
+        msrs->nmsrs++;
+    }
+
+    ret = mshv_configure_msr(cpu, &msrs->entries[0], msrs->nmsrs);
+    g_free(msrs);
+    return ret;
+}
+
+
+int
mshv_arch_put_registers(const CPUState *cpu) +{ + int ret; + + ret = put_regs(cpu); + if (ret < 0) { + error_report("Failed to put registers"); + return -1; + } + + ret = put_msrs(cpu); + if (ret < 0) { + error_report("Failed to put msrs"); + return -1; + } + + return 0; +} + +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features) +{ + features->access_guest_idle_reg = 1; +} + +static int set_memory_info(const struct hyperv_message *msg, + struct hv_x64_memory_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_GPA_INTERCEPT + && msg->header.message_type != HVMSG_UNMAPPED_GPA + && msg->header.message_type != HVMSG_UNACCEPTED_GPA) { + error_report("invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int emulate_instruction(CPUState *cpu, + const uint8_t *insn_bytes, size_t insn_len, + uint64_t gva, uint64_t gpa) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + struct x86_decode decode = { 0 }; + int ret; + x86_insn_stream stream = { .bytes = insn_bytes, .len = insn_len }; + + ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("failed to load registers"); + return -1; + } + + decode_instruction_stream(env, &decode, &stream); + exec_instruction(env, &decode); + + ret = mshv_store_regs(cpu); + if (ret < 0) { + error_report("failed to store registers"); + return -1; + } + + return 0; +} + +static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + size_t insn_len; + uint8_t access_type; + uint8_t *instruction_bytes; + int ret; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + insn_len = info.instruction_byte_count; + access_type = info.header.intercept_access_type; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_EXECUTE) { + error_report("invalid intercept access type: execute"); + return -1; + } + + if (insn_len > 16) { + error_report("invalid mmio instruction length: %zu", insn_len); + return -1; + } + + trace_mshv_handle_mmio(info.guest_virtual_address, + info.guest_physical_address, + info.instruction_byte_count, access_type); + + instruction_bytes = info.instruction_bytes; + + ret = emulate_instruction(cpu, instruction_bytes, insn_len, + info.guest_virtual_address, + info.guest_physical_address); + if (ret < 0) { + error_report("failed to emulate mmio"); + return -1; + } + + *exit_reason = MshvVmExitIgnore; + + return 0; +} + +static int handle_unmapped_mem(int vm_fd, CPUState *cpu, + const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + uint64_t gpa; + int ret; + enum MshvRemapResult remap_result; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + + gpa = info.guest_physical_address; + + /* attempt to remap the region, in case of overlapping userspace mappings */ + remap_result = mshv_remap_overlap_region(vm_fd, gpa); + *exit_reason = MshvVmExitIgnore; + + switch (remap_result) { + case MshvRemapNoMapping: + /* if we didn't find a mapping, it is probably mmio */ + return handle_mmio(cpu, msg, exit_reason); + case MshvRemapOk: + break; + case MshvRemapNoOverlap: + /* This should not happen, but we are forgiving it */ + warn_report("found no overlap for unmapped region"); + *exit_reason = MshvVmExitSpecial; 
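+        /* surfaced to the caller via mshv_run_vcpu()'s return value */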
+ break; + } + + return 0; +} + +static int set_ioport_info(const struct hyperv_message *msg, + hv_x64_io_port_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_X64_IO_PORT_INTERCEPT) { + error_report("Invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int set_x64_registers(const CPUState *cpu, const uint32_t *names, + const uint64_t *values) +{ + + hv_register_assoc assocs[2]; + int ret; + + for (size_t i = 0; i < ARRAY_SIZE(assocs); i++) { + assocs[i].name = names[i]; + assocs[i].value.reg64 = values[i]; + } + + ret = mshv_set_generic_regs(cpu, assocs, ARRAY_SIZE(assocs)); + if (ret < 0) { + error_report("failed to set x64 registers"); + return -1; + } + + return 0; +} + +static inline MemTxAttrs get_mem_attrs(bool is_secure_mode) +{ + MemTxAttrs memattr = {0}; + memattr.secure = is_secure_mode; + return memattr; +} + +static void pio_read(uint64_t port, uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + false); + if (ret != MEMTX_OK) { + error_report("Failed to read from port %lx: %d", port, ret); + abort(); + } +} + +static int pio_write(uint64_t port, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + true); + return ret; +} + +static int handle_pio_non_str(const CPUState *cpu, + hv_x64_io_port_intercept_message *info) +{ + size_t len = info->access_info.access_size; + uint8_t access_type = info->header.intercept_access_type; + int ret; + uint32_t val, eax; + const uint32_t eax_mask = 0xffffffffu >> (32 - len * 8); + size_t insn_len; + uint64_t rip, rax; + uint32_t reg_names[2]; + uint64_t reg_values[2]; + uint16_t port = info->port_number; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) { + union { + uint32_t u32; + uint8_t bytes[4]; + } conv; + + /* convert the first 4 bytes of rax to bytes */ + conv.u32 = (uint32_t)info->rax; + /* secure mode is set to false */ + ret = pio_write(port, conv.bytes, len, false); + if (ret < 0) { + error_report("Failed to write to io port"); + return -1; + } + } else { + uint8_t data[4] = { 0 }; + /* secure mode is set to false */ + pio_read(info->port_number, data, len, false); + + /* Preserve high bits in EAX, but clear out high bits in RAX */ + val = *(uint32_t *)data; + eax = (((uint32_t)info->rax) & ~eax_mask) | (val & eax_mask); + info->rax = (uint64_t)eax; + } + + insn_len = info->header.instruction_length; + + /* Advance RIP and update RAX */ + rip = info->header.rip + insn_len; + rax = info->rax; + + reg_names[0] = HV_X64_REGISTER_RIP; + reg_values[0] = rip; + reg_names[1] = HV_X64_REGISTER_RAX; + reg_values[1] = rax; + + ret = set_x64_registers(cpu, reg_names, reg_values); + if (ret < 0) { + error_report("Failed to set x64 registers"); + return -1; + } + + cpu->accel->dirty = false; + + return 0; +} + +static int fetch_guest_state(CPUState *cpu) +{ + int ret; + + ret = mshv_get_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to get standard registers"); + return -1; + } + + ret = mshv_get_special_regs(cpu); + if (ret < 0) { + error_report("Failed to get special registers"); + return -1; + } + + return 0; +} + +static int read_memory(const CPUState *cpu, uint64_t initial_gva, + uint64_t initial_gpa, uint64_t gva, uint8_t *data, + 
size_t len)
+{
+    int ret;
+    uint64_t gpa, flags;
+
+    if (gva == initial_gva) {
+        gpa = initial_gpa;
+    } else {
+        flags = HV_TRANSLATE_GVA_VALIDATE_READ;
+        ret = translate_gva(cpu, gva, &gpa, flags);
+        if (ret < 0) {
+            return -1;
+        }
+    }
+
+    ret = mshv_guest_mem_read(gpa, data, len, false, false);
+    if (ret < 0) {
+        error_report("failed to read guest mem");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_memory(const CPUState *cpu, uint64_t initial_gva,
+                        uint64_t initial_gpa, uint64_t gva, const uint8_t *data,
+                        size_t len)
+{
+    int ret;
+    uint64_t gpa, flags;
+
+    if (gva == initial_gva) {
+        gpa = initial_gpa;
+    } else {
+        flags = HV_TRANSLATE_GVA_VALIDATE_WRITE;
+        ret = translate_gva(cpu, gva, &gpa, flags);
+        if (ret < 0) {
+            error_report("failed to translate gva to gpa");
+            return -1;
+        }
+    }
+    ret = mshv_guest_mem_write(gpa, data, len, false);
+    if (ret != MEMTX_OK) {
+        error_report("failed to write to mmio");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int handle_pio_str_write(CPUState *cpu,
+                                hv_x64_io_port_intercept_message *info,
+                                size_t repeat, uint16_t port,
+                                bool direction_flag)
+{
+    int ret;
+    uint64_t src;
+    uint8_t data[4] = { 0 };
+    size_t len = info->access_info.access_size;
+
+    src = linear_addr(cpu, info->rsi, R_DS);
+
+    for (size_t i = 0; i < repeat; i++) {
+        ret = read_memory(cpu, 0, 0, src, data, len);
+        if (ret < 0) {
+            error_report("Failed to read memory");
+            return -1;
+        }
+        ret = pio_write(port, data, len, false);
+        if (ret < 0) {
+            error_report("Failed to write to io port");
+            return -1;
+        }
+        src += direction_flag ? -len : len;
+        info->rsi += direction_flag ? -len : len;
+    }
+
+    return 0;
+}
+
+static int handle_pio_str_read(CPUState *cpu,
+                               hv_x64_io_port_intercept_message *info,
+                               size_t repeat, uint16_t port,
+                               bool direction_flag)
+{
+    int ret;
+    uint64_t dst;
+    size_t len = info->access_info.access_size;
+    uint8_t data[4] = { 0 };
+
+    dst = linear_addr(cpu, info->rdi, R_ES);
+
+    for (size_t i = 0; i < repeat; i++) {
+        pio_read(port, data, len, false);
+
+        ret = write_memory(cpu, 0, 0, dst, data, len);
+        if (ret < 0) {
+            error_report("Failed to write memory");
+            return -1;
+        }
+        dst += direction_flag ? -len : len;
+        info->rdi += direction_flag ? -len : len;
+    }
+
+    return 0;
+}
+
+static int handle_pio_str(CPUState *cpu, hv_x64_io_port_intercept_message *info)
+{
+    uint8_t access_type = info->header.intercept_access_type;
+    uint16_t port = info->port_number;
+    bool repop = info->access_info.rep_prefix == 1;
+    size_t repeat = repop ?
info->rcx : 1;
+    size_t insn_len = info->header.instruction_length;
+    bool direction_flag;
+    uint32_t reg_names[3];
+    uint64_t reg_values[3];
+    int ret;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    ret = fetch_guest_state(cpu);
+    if (ret < 0) {
+        error_report("Failed to fetch guest state");
+        return -1;
+    }
+
+    direction_flag = (env->eflags & DF_MASK) != 0;
+
+    if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) {
+        ret = handle_pio_str_write(cpu, info, repeat, port, direction_flag);
+        if (ret < 0) {
+            error_report("Failed to handle pio str write");
+            return -1;
+        }
+        reg_names[0] = HV_X64_REGISTER_RSI;
+        reg_values[0] = info->rsi;
+    } else {
+        ret = handle_pio_str_read(cpu, info, repeat, port, direction_flag);
+        if (ret < 0) {
+            error_report("Failed to handle pio str read");
+            return -1;
+        }
+        reg_names[0] = HV_X64_REGISTER_RDI;
+        reg_values[0] = info->rdi;
+    }
+
+    reg_names[1] = HV_X64_REGISTER_RIP;
+    reg_values[1] = info->header.rip + insn_len;
+    reg_names[2] = HV_X64_REGISTER_RAX;
+    reg_values[2] = info->rax;
+
+    ret = set_x64_registers(cpu, reg_names, reg_values);
+    if (ret < 0) {
+        error_report("Failed to set x64 registers");
+        return -1;
+    }
+
+    cpu->accel->dirty = false;
+
+    return 0;
+}
+
+static int handle_pio(CPUState *cpu, const struct hyperv_message *msg)
+{
+    struct hv_x64_io_port_intercept_message info = { 0 };
+    int ret;
+
+    ret = set_ioport_info(msg, &info);
+    if (ret < 0) {
+        error_report("Failed to convert message to ioport info");
+        return -1;
+    }
+
+    if (info.access_info.string_op) {
+        return handle_pio_str(cpu, &info);
+    }
+
+    return handle_pio_non_str(cpu, &info);
+}
+
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit)
+{
+    int ret;
+    enum MshvVmExit exit_reason;
+    int cpu_fd = mshv_vcpufd(cpu);
+
+    ret = ioctl(cpu_fd, MSHV_RUN_VP, msg);
+    if (ret < 0) {
+        return MshvVmExitShutdown;
+    }
+
+    switch (msg->header.message_type) {
+    case HVMSG_UNRECOVERABLE_EXCEPTION:
+        return MshvVmExitShutdown;
+    case HVMSG_UNMAPPED_GPA:
+        ret = handle_unmapped_mem(vm_fd, cpu, msg, &exit_reason);
+        if (ret < 0) {
+            error_report("failed to handle unmapped memory");
+            return -1;
+        }
+        return exit_reason;
+    case HVMSG_GPA_INTERCEPT:
+        ret = handle_mmio(cpu, msg, &exit_reason);
+        if (ret < 0) {
+            error_report("failed to handle mmio");
+            return -1;
+        }
+        return exit_reason;
+    case HVMSG_X64_IO_PORT_INTERCEPT:
+        ret = handle_pio(cpu, msg);
+        if (ret < 0) {
+            return MshvVmExitSpecial;
+        }
+        return MshvVmExitIgnore;
+    default:
+        break;
+    }
+
+    *exit = MshvVmExitIgnore;
+    return 0;
+}
+
+void mshv_remove_vcpu(int vm_fd, int cpu_fd)
+{
+    close(cpu_fd);
+}
+
+
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd)
+{
+    int ret;
+    struct mshv_create_vp vp_arg = {
+        .vp_index = vp_index,
+    };
+    ret = ioctl(vm_fd, MSHV_CREATE_VP, &vp_arg);
+    if (ret < 0) {
+        error_report("failed to create mshv vcpu: %s", strerror(errno));
+        return -1;
+    }
+
+    *cpu_fd = ret;
+
+    return 0;
+}
+
+static int guest_mem_read_with_gva(const CPUState *cpu, uint64_t gva,
+                                   uint8_t *data, uintptr_t size,
+                                   bool fetch_instruction)
+{
+    int ret;
+    uint64_t gpa, flags;
+
+    flags = HV_TRANSLATE_GVA_VALIDATE_READ;
+    ret = translate_gva(cpu, gva, &gpa, flags);
+    if (ret < 0) {
+        error_report("failed to translate gva to gpa");
+        return -1;
+    }
+
+    ret = mshv_guest_mem_read(gpa, data, size, false, fetch_instruction);
+    if (ret < 0) {
+        error_report("failed to read from guest memory");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int guest_mem_write_with_gva(const CPUState *cpu, uint64_t gva,
+                                    const uint8_t *data, uintptr_t size)
+{
+    int ret;
+ uint64_t gpa, flags; + + flags = HV_TRANSLATE_GVA_VALIDATE_WRITE; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + ret = mshv_guest_mem_write(gpa, data, size, false); + if (ret < 0) { + error_report("failed to write to guest memory"); + return -1; + } + return 0; +} + +static void write_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_write_with_gva(cpu, addr, data, bytes) < 0) { + error_report("failed to write memory"); + abort(); + } +} + +static void fetch_instruction(CPUState *cpu, void *data, + target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, true) < 0) { + error_report("failed to fetch instruction"); + abort(); + } +} + +static void read_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, false) < 0) { + error_report("failed to read memory"); + abort(); + } +} + +static void read_segment_descriptor(CPUState *cpu, + struct x86_segment_descriptor *desc, + enum X86Seg seg_idx) +{ + bool ret; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + SegmentCache *seg = &env->segs[seg_idx]; + x86_segment_selector sel = { .sel = seg->selector & 0xFFFF }; + + ret = x86_read_segment_descriptor(cpu, desc, sel); + if (ret == false) { + error_report("failed to read segment descriptor"); + abort(); + } +} + +static const struct x86_emul_ops mshv_x86_emul_ops = { + .fetch_instruction = fetch_instruction, + .read_mem = read_mem, + .write_mem = write_mem, + .read_segment_descriptor = read_segment_descriptor, +}; + +void mshv_init_mmio_emu(void) +{ + init_decoder(); + init_emu(&mshv_x86_emul_ops); +} + +void mshv_arch_init_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + size_t page = HV_HYP_PAGE_SIZE; + void *mem = qemu_memalign(page, 2 * page); + + /* sanity check, to make sure we don't overflow the page */ + QEMU_BUILD_BUG_ON((MAX_REGISTER_COUNT + * sizeof(hv_register_assoc) + + sizeof(hv_input_get_vp_registers) + > HV_HYP_PAGE_SIZE)); + + state->hvcall_args.base = mem; + state->hvcall_args.input_page = mem; + state->hvcall_args.output_page = (uint8_t *)mem + page; + + env->emu_mmio_buf = g_new(char, 4096); +} + +void mshv_arch_destroy_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + + g_free(state->hvcall_args.base); + state->hvcall_args = (MshvHvCallArgs){0}; + g_clear_pointer(&env->emu_mmio_buf, g_free); +} + +/* + * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a + * fault to the guest if it tries to access it. It is possible to override + * this behavior with a more suitable option i.e., ignore writes from the guest + * and return zero in attempt to read unimplemented. 
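+ * The trade-off is that such accesses no longer fault, so a guest cannot
+ * detect that a given MSR is unimplemented by probing it.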
+ */
+static int set_unimplemented_msr_action(int vm_fd)
+{
+    struct hv_input_set_partition_property in = {0};
+    struct mshv_root_hvcall args = {0};
+
+    in.property_code = HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION;
+    in.property_value = HV_UNIMPLEMENTED_MSR_ACTION_IGNORE_WRITE_READ_ZERO;
+
+    args.code = HVCALL_SET_PARTITION_PROPERTY;
+    args.in_sz = sizeof(in);
+    args.in_ptr = (uint64_t)&in;
+
+    trace_mshv_hvcall_args("unimplemented_msr_action", args.code, args.in_sz);
+
+    int ret = mshv_hvcall(vm_fd, &args);
+    if (ret < 0) {
+        error_report("Failed to set unimplemented MSR action");
+        return -1;
+    }
+    return 0;
+}
+
+int mshv_arch_post_init_vm(int vm_fd)
+{
+    int ret;
+
+    ret = set_unimplemented_msr_action(vm_fd);
+    if (ret < 0) {
+        error_report("Failed to set unimplemented MSR action");
+    }
+
+    return ret;
+}
diff --git a/target/i386/mshv/x86.c b/target/i386/mshv/x86.c
new file mode 100644
index 0000000..d574b3b
--- /dev/null
+++ b/target/i386/mshv/x86.c
@@ -0,0 +1,297 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors: Magnus Kulke <magnuskulke@microsoft.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "qemu/typedefs.h"
+#include "qemu/error-report.h"
+#include "system/mshv.h"
+
+/* RW or Exec segment */
+static const uint8_t RWRX_SEGMENT_TYPE = 0x2;
+static const uint8_t CODE_SEGMENT_TYPE = 0x8;
+static const uint8_t EXPAND_DOWN_SEGMENT_TYPE = 0x4;
+
+typedef enum CpuMode {
+    REAL_MODE,
+    PROTECTED_MODE,
+    LONG_MODE,
+} CpuMode;
+
+static CpuMode cpu_mode(CPUState *cpu)
+{
+    enum CpuMode m = REAL_MODE;
+
+    if (x86_is_protected(cpu)) {
+        m = PROTECTED_MODE;
+
+        if (x86_is_long_mode(cpu)) {
+            m = LONG_MODE;
+        }
+    }
+
+    return m;
+}
+
+static bool segment_type_ro(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & (~RWRX_SEGMENT_TYPE)) == 0;
+}
+
+static bool segment_type_code(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & CODE_SEGMENT_TYPE) != 0;
+}
+
+static bool segment_expands_down(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+
+    if (segment_type_code(seg)) {
+        return false;
+    }
+
+    return (type_ & EXPAND_DOWN_SEGMENT_TYPE) != 0;
+}
+
+static uint32_t segment_limit(const SegmentCache *seg)
+{
+    uint32_t limit = seg->limit;
+    uint32_t granularity = (seg->flags & DESC_G_MASK) != 0;
+
+    if (granularity != 0) {
+        limit = (limit << 12) | 0xFFF;
+    }
+
+    return limit;
+}
+
+static uint8_t segment_db(const SegmentCache *seg)
+{
+    return (seg->flags >> DESC_B_SHIFT) & 1;
+}
+
+static uint32_t segment_max_limit(const SegmentCache *seg)
+{
+    if (segment_db(seg) != 0) {
+        return 0xFFFFFFFF;
+    }
+    return 0xFFFF;
+}
+
+static int linearize(CPUState *cpu,
+                     target_ulong logical_addr, target_ulong *linear_addr,
+                     X86Seg seg_idx)
+{
+    enum CpuMode mode;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    SegmentCache *seg = &env->segs[seg_idx];
+    target_ulong base = seg->base;
+    target_ulong logical_addr_32b;
+    uint32_t limit;
+    /* TODO: the emulator will not pass us "write" indicator yet */
+    bool write = false;
+
+    mode = cpu_mode(cpu);
+
+    switch (mode) {
+    case LONG_MODE:
+        if (__builtin_add_overflow(logical_addr, base, linear_addr)) {
+            error_report("Address overflow");
+            return -1;
+        }
+        break;
+    case PROTECTED_MODE:
+    case REAL_MODE:
+        if (segment_type_ro(seg) &&
write) { + error_report("Cannot write to read-only segment"); + return -1; + } + + logical_addr_32b = logical_addr & 0xFFFFFFFF; + limit = segment_limit(seg); + + if (segment_expands_down(seg)) { + if (logical_addr_32b >= limit) { + error_report("Address exceeds limit (expands down)"); + return -1; + } + + limit = segment_max_limit(seg); + } + + if (logical_addr_32b > limit) { + error_report("Address exceeds limit %u", limit); + return -1; + } + *linear_addr = logical_addr_32b + base; + break; + default: + error_report("Unknown cpu mode: %d", mode); + return -1; + } + + return 0; +} + +bool x86_read_segment_descriptor(CPUState *cpu, + struct x86_segment_descriptor *desc, + x86_segment_selector sel) +{ + target_ulong base; + uint32_t limit; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + target_ulong gva; + + memset(desc, 0, sizeof(*desc)); + + /* valid gdt descriptors start from index 1 */ + if (!sel.index && GDT_SEL == sel.ti) { + return false; + } + + if (GDT_SEL == sel.ti) { + base = env->gdt.base; + limit = env->gdt.limit; + } else { + base = env->ldt.base; + limit = env->ldt.limit; + } + + if (sel.index * 8 >= limit) { + return false; + } + + gva = base + sel.index * 8; + emul_ops->read_mem(cpu, desc, gva, sizeof(*desc)); + + return true; +} + +bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc, + int gate) +{ + target_ulong base; + uint32_t limit; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + target_ulong gva; + + base = env->idt.base; + limit = env->idt.limit; + + memset(idt_desc, 0, sizeof(*idt_desc)); + if (gate * 8 >= limit) { + perror("call gate exceeds idt limit"); + return false; + } + + gva = base + gate * 8; + emul_ops->read_mem(cpu, idt_desc, gva, sizeof(*idt_desc)); + + return true; +} + +bool x86_is_protected(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr0 = env->cr[0]; + + return cr0 & CR0_PE_MASK; +} + +bool x86_is_real(CPUState *cpu) +{ + return !x86_is_protected(cpu); +} + +bool x86_is_v8086(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + return x86_is_protected(cpu) && (env->eflags & VM_MASK); +} + +bool x86_is_long_mode(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t efer = env->efer; + uint64_t lme_lma = (MSR_EFER_LME | MSR_EFER_LMA); + + return ((efer & lme_lma) == lme_lma); +} + +bool x86_is_long64_mode(CPUState *cpu) +{ + error_report("unimplemented: is_long64_mode()"); + abort(); +} + +bool x86_is_paging_mode(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr0 = env->cr[0]; + + return cr0 & CR0_PG_MASK; +} + +bool x86_is_pae_enabled(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr4 = env->cr[4]; + + return cr4 & CR4_PAE_MASK; +} + +target_ulong linear_addr(CPUState *cpu, target_ulong addr, X86Seg seg) +{ + int ret; + target_ulong linear_addr; + + ret = linearize(cpu, addr, &linear_addr, seg); + if (ret < 0) { + error_report("failed to linearize address"); + abort(); + } + + return linear_addr; +} + +target_ulong linear_addr_size(CPUState *cpu, target_ulong addr, int size, + X86Seg seg) +{ + switch (size) { + case 2: + addr = (uint16_t)addr; + break; + case 4: + addr = (uint32_t)addr; + break; + default: + break; + } + return linear_addr(cpu, addr, seg); +} + +target_ulong linear_rip(CPUState *cpu, target_ulong rip) +{ + return linear_addr(cpu, 
rip, R_CS); +} diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 993602f..86490e0 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -28,11 +28,6 @@ #ifdef CONFIG_KVM #include <linux/kvm.h> #endif -#ifdef CONFIG_TCG -#include "accel/tcg/cpu-ldst.h" -#include "accel/tcg/cpu-ops.h" -#include "tcg/tcg.h" -#endif #include "tcg/tcg_loongarch.h" const char * const regnames[32] = { @@ -49,62 +44,6 @@ const char * const fregnames[32] = { "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", }; -struct TypeExcp { - int32_t exccode; - const char * const name; -}; - -static const struct TypeExcp excp_names[] = { - {EXCCODE_INT, "Interrupt"}, - {EXCCODE_PIL, "Page invalid exception for load"}, - {EXCCODE_PIS, "Page invalid exception for store"}, - {EXCCODE_PIF, "Page invalid exception for fetch"}, - {EXCCODE_PME, "Page modified exception"}, - {EXCCODE_PNR, "Page Not Readable exception"}, - {EXCCODE_PNX, "Page Not Executable exception"}, - {EXCCODE_PPI, "Page Privilege error"}, - {EXCCODE_ADEF, "Address error for instruction fetch"}, - {EXCCODE_ADEM, "Address error for Memory access"}, - {EXCCODE_SYS, "Syscall"}, - {EXCCODE_BRK, "Break"}, - {EXCCODE_INE, "Instruction Non-Existent"}, - {EXCCODE_IPE, "Instruction privilege error"}, - {EXCCODE_FPD, "Floating Point Disabled"}, - {EXCCODE_FPE, "Floating Point Exception"}, - {EXCCODE_DBP, "Debug breakpoint"}, - {EXCCODE_BCE, "Bound Check Exception"}, - {EXCCODE_SXD, "128 bit vector instructions Disable exception"}, - {EXCCODE_ASXD, "256 bit vector instructions Disable exception"}, - {EXCP_HLT, "EXCP_HLT"}, -}; - -const char *loongarch_exception_name(int32_t exception) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(excp_names); i++) { - if (excp_names[i].exccode == exception) { - return excp_names[i].name; - } - } - return "Unknown"; -} - -void G_NORETURN do_raise_exception(CPULoongArchState *env, - uint32_t exception, - uintptr_t pc) -{ - CPUState *cs = env_cpu(env); - - qemu_log_mask(CPU_LOG_INT, "%s: exception: %d (%s)\n", - __func__, - exception, - loongarch_exception_name(exception)); - cs->exception_index = exception; - - cpu_loop_exit_restore(cs, pc); -} - static void loongarch_cpu_set_pc(CPUState *cs, vaddr value) { set_pc(cpu_env(cs), value); @@ -140,18 +79,8 @@ void loongarch_cpu_set_irq(void *opaque, int irq, int level) } } -static inline bool cpu_loongarch_hw_interrupts_enabled(CPULoongArchState *env) -{ - bool ret = 0; - - ret = (FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE) && - !(FIELD_EX64(env->CSR_DBG, CSR_DBG, DST))); - - return ret; -} - /* Check if there is pending and not masked out interrupt */ -static inline bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) +bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) { uint32_t pending; uint32_t status; @@ -163,217 +92,8 @@ static inline bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) } #endif -#ifdef CONFIG_TCG -#ifndef CONFIG_USER_ONLY -static void loongarch_cpu_do_interrupt(CPUState *cs) -{ - CPULoongArchState *env = cpu_env(cs); - bool update_badinstr = 1; - int cause = -1; - bool tlbfill = FIELD_EX64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR); - uint32_t vec_size = FIELD_EX64(env->CSR_ECFG, CSR_ECFG, VS); - - if (cs->exception_index != EXCCODE_INT) { - qemu_log_mask(CPU_LOG_INT, - "%s enter: pc " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " TLBRERA " TARGET_FMT_lx " exception: %d (%s)\n", - __func__, env->pc, env->CSR_ERA, env->CSR_TLBRERA, - cs->exception_index, - loongarch_exception_name(cs->exception_index)); - } - - switch 
(cs->exception_index) { - case EXCCODE_DBP: - env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DCL, 1); - env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, ECODE, 0xC); - goto set_DERA; - set_DERA: - env->CSR_DERA = env->pc; - env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DST, 1); - set_pc(env, env->CSR_EENTRY + 0x480); - break; - case EXCCODE_INT: - if (FIELD_EX64(env->CSR_DBG, CSR_DBG, DST)) { - env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DEI, 1); - goto set_DERA; - } - QEMU_FALLTHROUGH; - case EXCCODE_PIF: - case EXCCODE_ADEF: - cause = cs->exception_index; - update_badinstr = 0; - break; - case EXCCODE_SYS: - case EXCCODE_BRK: - case EXCCODE_INE: - case EXCCODE_IPE: - case EXCCODE_FPD: - case EXCCODE_FPE: - case EXCCODE_SXD: - case EXCCODE_ASXD: - env->CSR_BADV = env->pc; - QEMU_FALLTHROUGH; - case EXCCODE_BCE: - case EXCCODE_ADEM: - case EXCCODE_PIL: - case EXCCODE_PIS: - case EXCCODE_PME: - case EXCCODE_PNR: - case EXCCODE_PNX: - case EXCCODE_PPI: - cause = cs->exception_index; - break; - default: - qemu_log("Error: exception(%d) has not been supported\n", - cs->exception_index); - abort(); - } - - if (update_badinstr) { - env->CSR_BADI = cpu_ldl_code(env, env->pc); - } - - /* Save PLV and IE */ - if (tlbfill) { - env->CSR_TLBRPRMD = FIELD_DP64(env->CSR_TLBRPRMD, CSR_TLBRPRMD, PPLV, - FIELD_EX64(env->CSR_CRMD, - CSR_CRMD, PLV)); - env->CSR_TLBRPRMD = FIELD_DP64(env->CSR_TLBRPRMD, CSR_TLBRPRMD, PIE, - FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE)); - /* set the DA mode */ - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DA, 1); - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PG, 0); - env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, - PC, (env->pc >> 2)); - } else { - env->CSR_ESTAT = FIELD_DP64(env->CSR_ESTAT, CSR_ESTAT, ECODE, - EXCODE_MCODE(cause)); - env->CSR_ESTAT = FIELD_DP64(env->CSR_ESTAT, CSR_ESTAT, ESUBCODE, - EXCODE_SUBCODE(cause)); - env->CSR_PRMD = FIELD_DP64(env->CSR_PRMD, CSR_PRMD, PPLV, - FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PLV)); - env->CSR_PRMD = FIELD_DP64(env->CSR_PRMD, CSR_PRMD, PIE, - FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE)); - env->CSR_ERA = env->pc; - } - - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PLV, 0); - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, IE, 0); - - if (vec_size) { - vec_size = (1 << vec_size) * 4; - } - - if (cs->exception_index == EXCCODE_INT) { - /* Interrupt */ - uint32_t vector = 0; - uint32_t pending = FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS); - pending &= FIELD_EX64(env->CSR_ECFG, CSR_ECFG, LIE); - - /* Find the highest-priority interrupt. */ - vector = 31 - clz32(pending); - set_pc(env, env->CSR_EENTRY + \ - (EXCCODE_EXTERNAL_INT + vector) * vec_size); - qemu_log_mask(CPU_LOG_INT, - "%s: PC " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " cause %d\n" " A " TARGET_FMT_lx " D " - TARGET_FMT_lx " vector = %d ExC " TARGET_FMT_lx "ExS" - TARGET_FMT_lx "\n", - __func__, env->pc, env->CSR_ERA, - cause, env->CSR_BADV, env->CSR_DERA, vector, - env->CSR_ECFG, env->CSR_ESTAT); - } else { - if (tlbfill) { - set_pc(env, env->CSR_TLBRENTRY); - } else { - set_pc(env, env->CSR_EENTRY + EXCODE_MCODE(cause) * vec_size); - } - qemu_log_mask(CPU_LOG_INT, - "%s: PC " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " cause %d%s\n, ESTAT " TARGET_FMT_lx - " EXCFG " TARGET_FMT_lx " BADVA " TARGET_FMT_lx - "BADI " TARGET_FMT_lx " SYS_NUM " TARGET_FMT_lu - " cpu %d asid " TARGET_FMT_lx "\n", __func__, env->pc, - tlbfill ? env->CSR_TLBRERA : env->CSR_ERA, - cause, tlbfill ? "(refill)" : "", env->CSR_ESTAT, - env->CSR_ECFG, - tlbfill ? 
env->CSR_TLBRBADV : env->CSR_BADV, - env->CSR_BADI, env->gpr[11], cs->cpu_index, - env->CSR_ASID); - } - cs->exception_index = -1; -} - -static void loongarch_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, - vaddr addr, unsigned size, - MMUAccessType access_type, - int mmu_idx, MemTxAttrs attrs, - MemTxResult response, - uintptr_t retaddr) -{ - CPULoongArchState *env = cpu_env(cs); - - if (access_type == MMU_INST_FETCH) { - do_raise_exception(env, EXCCODE_ADEF, retaddr); - } else { - do_raise_exception(env, EXCCODE_ADEM, retaddr); - } -} - -static bool loongarch_cpu_exec_interrupt(CPUState *cs, int interrupt_request) -{ - if (interrupt_request & CPU_INTERRUPT_HARD) { - CPULoongArchState *env = cpu_env(cs); - - if (cpu_loongarch_hw_interrupts_enabled(env) && - cpu_loongarch_hw_interrupts_pending(env)) { - /* Raise it */ - cs->exception_index = EXCCODE_INT; - loongarch_cpu_do_interrupt(cs); - return true; - } - } - return false; -} - -static vaddr loongarch_pointer_wrap(CPUState *cs, int mmu_idx, - vaddr result, vaddr base) -{ - return is_va32(cpu_env(cs)) ? (uint32_t)result : result; -} -#endif - -static TCGTBCPUState loongarch_get_tb_cpu_state(CPUState *cs) -{ - CPULoongArchState *env = cpu_env(cs); - uint32_t flags; - - flags = env->CSR_CRMD & (R_CSR_CRMD_PLV_MASK | R_CSR_CRMD_PG_MASK); - flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, FPE) * HW_FLAGS_EUEN_FPE; - flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, SXE) * HW_FLAGS_EUEN_SXE; - flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, ASXE) * HW_FLAGS_EUEN_ASXE; - flags |= is_va32(env) * HW_FLAGS_VA32; - - return (TCGTBCPUState){ .pc = env->pc, .flags = flags }; -} - -static void loongarch_cpu_synchronize_from_tb(CPUState *cs, - const TranslationBlock *tb) -{ - tcg_debug_assert(!tcg_cflags_has(cs, CF_PCREL)); - set_pc(cpu_env(cs), tb->pc); -} - -static void loongarch_restore_state_to_opc(CPUState *cs, - const TranslationBlock *tb, - const uint64_t *data) -{ - set_pc(cpu_env(cs), data[0]); -} -#endif /* CONFIG_TCG */ - #ifndef CONFIG_USER_ONLY -static bool loongarch_cpu_has_work(CPUState *cs) +bool loongarch_cpu_has_work(CPUState *cs) { bool has_work = false; @@ -386,16 +106,6 @@ static bool loongarch_cpu_has_work(CPUState *cs) } #endif /* !CONFIG_USER_ONLY */ -static int loongarch_cpu_mmu_index(CPUState *cs, bool ifetch) -{ - CPULoongArchState *env = cpu_env(cs); - - if (FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG)) { - return FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PLV); - } - return MMU_DA_IDX; -} - static void loongarch_la464_init_csr(Object *obj) { #ifndef CONFIG_USER_ONLY @@ -911,30 +621,6 @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) } } -#ifdef CONFIG_TCG -static const TCGCPUOps loongarch_tcg_ops = { - .guest_default_memory_order = 0, - .mttcg_supported = true, - - .initialize = loongarch_translate_init, - .translate_code = loongarch_translate_code, - .get_tb_cpu_state = loongarch_get_tb_cpu_state, - .synchronize_from_tb = loongarch_cpu_synchronize_from_tb, - .restore_state_to_opc = loongarch_restore_state_to_opc, - .mmu_index = loongarch_cpu_mmu_index, - -#ifndef CONFIG_USER_ONLY - .tlb_fill = loongarch_cpu_tlb_fill, - .pointer_wrap = loongarch_pointer_wrap, - .cpu_exec_interrupt = loongarch_cpu_exec_interrupt, - .cpu_exec_halt = loongarch_cpu_has_work, - .cpu_exec_reset = cpu_reset, - .do_interrupt = loongarch_cpu_do_interrupt, - .do_transaction_failed = loongarch_cpu_do_transaction_failed, -#endif -}; -#endif /* CONFIG_TCG */ - #ifndef CONFIG_USER_ONLY #include "hw/core/sysemu-cpu-ops.h" diff --git 
a/target/loongarch/internals.h b/target/loongarch/internals.h index e50d109..8793bd9 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -24,8 +24,6 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, uint32_t exception, uintptr_t pc); -const char *loongarch_exception_name(int32_t exception); - #ifdef CONFIG_TCG int ieee_ex_to_loongarch(int xcpt); void restore_fp_status(CPULoongArchState *env); @@ -41,6 +39,8 @@ uint64_t cpu_loongarch_get_constant_timer_counter(LoongArchCPU *cpu); uint64_t cpu_loongarch_get_constant_timer_ticks(LoongArchCPU *cpu); void cpu_loongarch_store_constant_timer_config(LoongArchCPU *cpu, uint64_t value); +bool loongarch_cpu_has_work(CPUState *cs); +bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env); #endif /* !CONFIG_USER_ONLY */ uint64_t read_fcc(CPULoongArchState *env); diff --git a/target/loongarch/tcg/meson.build b/target/loongarch/tcg/meson.build index bdf34f9..b7adfe4 100644 --- a/target/loongarch/tcg/meson.build +++ b/target/loongarch/tcg/meson.build @@ -7,6 +7,7 @@ loongarch_ss.add([zlib, gen]) loongarch_ss.add(files( 'fpu_helper.c', 'op_helper.c', + 'tcg_cpu.c', 'translate.c', 'vec_helper.c', )) diff --git a/target/loongarch/tcg/tcg_cpu.c b/target/loongarch/tcg/tcg_cpu.c new file mode 100644 index 0000000..82b54e6 --- /dev/null +++ b/target/loongarch/tcg/tcg_cpu.c @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch CPU parameters for QEMU. + * + * Copyright (c) 2025 Loongson Technology Corporation Limited + */ +#include "qemu/osdep.h" +#include "qemu/accel.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "accel/accel-cpu-target.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/cpu-ops.h" +#include "exec/translation-block.h" +#include "exec/target_page.h" +#include "tcg_loongarch.h" +#include "internals.h" + +struct TypeExcp { + int32_t exccode; + const char * const name; +}; + +static const struct TypeExcp excp_names[] = { + {EXCCODE_INT, "Interrupt"}, + {EXCCODE_PIL, "Page invalid exception for load"}, + {EXCCODE_PIS, "Page invalid exception for store"}, + {EXCCODE_PIF, "Page invalid exception for fetch"}, + {EXCCODE_PME, "Page modified exception"}, + {EXCCODE_PNR, "Page Not Readable exception"}, + {EXCCODE_PNX, "Page Not Executable exception"}, + {EXCCODE_PPI, "Page Privilege error"}, + {EXCCODE_ADEF, "Address error for instruction fetch"}, + {EXCCODE_ADEM, "Address error for Memory access"}, + {EXCCODE_SYS, "Syscall"}, + {EXCCODE_BRK, "Break"}, + {EXCCODE_INE, "Instruction Non-Existent"}, + {EXCCODE_IPE, "Instruction privilege error"}, + {EXCCODE_FPD, "Floating Point Disabled"}, + {EXCCODE_FPE, "Floating Point Exception"}, + {EXCCODE_DBP, "Debug breakpoint"}, + {EXCCODE_BCE, "Bound Check Exception"}, + {EXCCODE_SXD, "128 bit vector instructions Disable exception"}, + {EXCCODE_ASXD, "256 bit vector instructions Disable exception"}, + {EXCP_HLT, "EXCP_HLT"}, +}; + +static const char *loongarch_exception_name(int32_t exception) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(excp_names); i++) { + if (excp_names[i].exccode == exception) { + return excp_names[i].name; + } + } + return "Unknown"; +} + +void G_NORETURN do_raise_exception(CPULoongArchState *env, + uint32_t exception, + uintptr_t pc) +{ + CPUState *cs = env_cpu(env); + + qemu_log_mask(CPU_LOG_INT, "%s: exception: %d (%s)\n", + __func__, + exception, + loongarch_exception_name(exception)); + cs->exception_index = exception; + + cpu_loop_exit_restore(cs, pc); +} + +#ifndef CONFIG_USER_ONLY 
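+/* + * Common exception/interrupt entry: the old PLV and IE bits are saved + * (to CSR_PRMD, or CSR_TLBRPRMD for a TLB refill), CRMD is forced to + * PLV0 with interrupts off, and execution is redirected to CSR_EENTRY + * (or CSR_TLBRENTRY for refills), spaced by the vector size in + * CSR_ECFG.VS. + */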
+static void loongarch_cpu_do_interrupt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + bool update_badinstr = 1; + int cause = -1; + bool tlbfill = FIELD_EX64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR); + uint32_t vec_size = FIELD_EX64(env->CSR_ECFG, CSR_ECFG, VS); + + if (cs->exception_index != EXCCODE_INT) { + qemu_log_mask(CPU_LOG_INT, + "%s enter: pc " TARGET_FMT_lx " ERA " TARGET_FMT_lx + " TLBRERA " TARGET_FMT_lx " exception: %d (%s)\n", + __func__, env->pc, env->CSR_ERA, env->CSR_TLBRERA, + cs->exception_index, + loongarch_exception_name(cs->exception_index)); + } + + switch (cs->exception_index) { + case EXCCODE_DBP: + env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DCL, 1); + env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, ECODE, 0xC); + goto set_DERA; + set_DERA: + env->CSR_DERA = env->pc; + env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DST, 1); + set_pc(env, env->CSR_EENTRY + 0x480); + break; + case EXCCODE_INT: + if (FIELD_EX64(env->CSR_DBG, CSR_DBG, DST)) { + env->CSR_DBG = FIELD_DP64(env->CSR_DBG, CSR_DBG, DEI, 1); + goto set_DERA; + } + QEMU_FALLTHROUGH; + case EXCCODE_PIF: + case EXCCODE_ADEF: + cause = cs->exception_index; + update_badinstr = 0; + break; + case EXCCODE_SYS: + case EXCCODE_BRK: + case EXCCODE_INE: + case EXCCODE_IPE: + case EXCCODE_FPD: + case EXCCODE_FPE: + case EXCCODE_SXD: + case EXCCODE_ASXD: + env->CSR_BADV = env->pc; + QEMU_FALLTHROUGH; + case EXCCODE_BCE: + case EXCCODE_ADEM: + case EXCCODE_PIL: + case EXCCODE_PIS: + case EXCCODE_PME: + case EXCCODE_PNR: + case EXCCODE_PNX: + case EXCCODE_PPI: + cause = cs->exception_index; + break; + default: + qemu_log("Error: exception(%d) is not supported\n", + cs->exception_index); + abort(); + } + + if (update_badinstr) { + env->CSR_BADI = cpu_ldl_code(env, env->pc); + } + + /* Save PLV and IE */ + if (tlbfill) { + env->CSR_TLBRPRMD = FIELD_DP64(env->CSR_TLBRPRMD, CSR_TLBRPRMD, PPLV, + FIELD_EX64(env->CSR_CRMD, + CSR_CRMD, PLV)); + env->CSR_TLBRPRMD = FIELD_DP64(env->CSR_TLBRPRMD, CSR_TLBRPRMD, PIE, + FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE)); + /* set the DA mode */ + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DA, 1); + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PG, 0); + env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, + PC, (env->pc >> 2)); + } else { + env->CSR_ESTAT = FIELD_DP64(env->CSR_ESTAT, CSR_ESTAT, ECODE, + EXCODE_MCODE(cause)); + env->CSR_ESTAT = FIELD_DP64(env->CSR_ESTAT, CSR_ESTAT, ESUBCODE, + EXCODE_SUBCODE(cause)); + env->CSR_PRMD = FIELD_DP64(env->CSR_PRMD, CSR_PRMD, PPLV, + FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PLV)); + env->CSR_PRMD = FIELD_DP64(env->CSR_PRMD, CSR_PRMD, PIE, + FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE)); + env->CSR_ERA = env->pc; + } + + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PLV, 0); + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, IE, 0); + + if (vec_size) { + vec_size = (1 << vec_size) * 4; + } + + if (cs->exception_index == EXCCODE_INT) { + /* Interrupt */ + uint32_t vector = 0; + uint32_t pending = FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS); + pending &= FIELD_EX64(env->CSR_ECFG, CSR_ECFG, LIE); + + /* Find the highest-priority interrupt.
*/ + vector = 31 - clz32(pending); + set_pc(env, env->CSR_EENTRY + \ + (EXCCODE_EXTERNAL_INT + vector) * vec_size); + qemu_log_mask(CPU_LOG_INT, + "%s: PC " TARGET_FMT_lx " ERA " TARGET_FMT_lx + " cause %d\n" " A " TARGET_FMT_lx " D " + TARGET_FMT_lx " vector = %d ExC " TARGET_FMT_lx " ExS " + TARGET_FMT_lx "\n", + __func__, env->pc, env->CSR_ERA, + cause, env->CSR_BADV, env->CSR_DERA, vector, + env->CSR_ECFG, env->CSR_ESTAT); + } else { + if (tlbfill) { + set_pc(env, env->CSR_TLBRENTRY); + } else { + set_pc(env, env->CSR_EENTRY + EXCODE_MCODE(cause) * vec_size); + } + qemu_log_mask(CPU_LOG_INT, + "%s: PC " TARGET_FMT_lx " ERA " TARGET_FMT_lx + " cause %d%s,\n ESTAT " TARGET_FMT_lx + " EXCFG " TARGET_FMT_lx " BADVA " TARGET_FMT_lx + " BADI " TARGET_FMT_lx " SYS_NUM " TARGET_FMT_lu + " cpu %d asid " TARGET_FMT_lx "\n", __func__, env->pc, + tlbfill ? env->CSR_TLBRERA : env->CSR_ERA, + cause, tlbfill ? "(refill)" : "", env->CSR_ESTAT, + env->CSR_ECFG, + tlbfill ? env->CSR_TLBRBADV : env->CSR_BADV, + env->CSR_BADI, env->gpr[11], cs->cpu_index, + env->CSR_ASID); + } + cs->exception_index = -1; +} + +static void loongarch_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, + vaddr addr, unsigned size, + MMUAccessType access_type, + int mmu_idx, MemTxAttrs attrs, + MemTxResult response, + uintptr_t retaddr) +{ + CPULoongArchState *env = cpu_env(cs); + + if (access_type == MMU_INST_FETCH) { + do_raise_exception(env, EXCCODE_ADEF, retaddr); + } else { + do_raise_exception(env, EXCCODE_ADEM, retaddr); + } +} + +static inline bool cpu_loongarch_hw_interrupts_enabled(CPULoongArchState *env) +{ + bool ret = 0; + + ret = (FIELD_EX64(env->CSR_CRMD, CSR_CRMD, IE) && + !(FIELD_EX64(env->CSR_DBG, CSR_DBG, DST))); + + return ret; +} + +static bool loongarch_cpu_exec_interrupt(CPUState *cs, int interrupt_request) +{ + if (interrupt_request & CPU_INTERRUPT_HARD) { + CPULoongArchState *env = cpu_env(cs); + + if (cpu_loongarch_hw_interrupts_enabled(env) && + cpu_loongarch_hw_interrupts_pending(env)) { + /* Raise it */ + cs->exception_index = EXCCODE_INT; + loongarch_cpu_do_interrupt(cs); + return true; + } + } + return false; +} + +static vaddr loongarch_pointer_wrap(CPUState *cs, int mmu_idx, + vaddr result, vaddr base) +{ + return is_va32(cpu_env(cs)) ?
(uint32_t)result : result; +} +#endif + +static TCGTBCPUState loongarch_get_tb_cpu_state(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint32_t flags; + + flags = env->CSR_CRMD & (R_CSR_CRMD_PLV_MASK | R_CSR_CRMD_PG_MASK); + flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, FPE) * HW_FLAGS_EUEN_FPE; + flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, SXE) * HW_FLAGS_EUEN_SXE; + flags |= FIELD_EX64(env->CSR_EUEN, CSR_EUEN, ASXE) * HW_FLAGS_EUEN_ASXE; + flags |= is_va32(env) * HW_FLAGS_VA32; + + return (TCGTBCPUState){ .pc = env->pc, .flags = flags }; +} + +static void loongarch_cpu_synchronize_from_tb(CPUState *cs, + const TranslationBlock *tb) +{ + tcg_debug_assert(!tcg_cflags_has(cs, CF_PCREL)); + set_pc(cpu_env(cs), tb->pc); +} + +static void loongarch_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + set_pc(cpu_env(cs), data[0]); +} + +static int loongarch_cpu_mmu_index(CPUState *cs, bool ifetch) +{ + CPULoongArchState *env = cpu_env(cs); + + if (FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG)) { + return FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PLV); + } + return MMU_DA_IDX; +} + +const TCGCPUOps loongarch_tcg_ops = { + .guest_default_memory_order = 0, + .mttcg_supported = true, + + .initialize = loongarch_translate_init, + .translate_code = loongarch_translate_code, + .get_tb_cpu_state = loongarch_get_tb_cpu_state, + .synchronize_from_tb = loongarch_cpu_synchronize_from_tb, + .restore_state_to_opc = loongarch_restore_state_to_opc, + .mmu_index = loongarch_cpu_mmu_index, + +#ifndef CONFIG_USER_ONLY + .tlb_fill = loongarch_cpu_tlb_fill, + .pointer_wrap = loongarch_pointer_wrap, + .cpu_exec_interrupt = loongarch_cpu_exec_interrupt, + .cpu_exec_halt = loongarch_cpu_has_work, + .cpu_exec_reset = cpu_reset, + .do_interrupt = loongarch_cpu_do_interrupt, + .do_transaction_failed = loongarch_cpu_do_transaction_failed, +#endif +}; diff --git a/target/loongarch/tcg/tcg_loongarch.h b/target/loongarch/tcg/tcg_loongarch.h index 4770289..7fb627f 100644 --- a/target/loongarch/tcg/tcg_loongarch.h +++ b/target/loongarch/tcg/tcg_loongarch.h @@ -9,6 +9,7 @@ #include "cpu.h" #include "cpu-mmu.h" +extern const TCGCPUOps loongarch_tcg_ops; void loongarch_csr_translate_init(void); bool loongarch_cpu_tlb_fill(CPUState *cs, vaddr address, int size, diff --git a/target/s390x/helper.c b/target/s390x/helper.c index 5c127da..184428c 100644 --- a/target/s390x/helper.c +++ b/target/s390x/helper.c @@ -24,8 +24,8 @@ #include "gdbstub/helpers.h" #include "qemu/timer.h" #include "hw/s390x/ioinst.h" -#include "target/s390x/kvm/pv.h" #include "system/hw_accel.h" +#include "system/memory.h" #include "system/runstate.h" #include "exec/target_page.h" #include "exec/watchpoint.h" @@ -107,19 +107,23 @@ LowCore *cpu_map_lowcore(CPUS390XState *env) { LowCore *lowcore; hwaddr len = sizeof(LowCore); + CPUState *cs = env_cpu(env); + const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; - lowcore = cpu_physical_memory_map(env->psa, &len, true); + lowcore = address_space_map(cs->as, env->psa, &len, true, attrs); if (len < sizeof(LowCore)) { - cpu_abort(env_cpu(env), "Could not map lowcore\n"); + cpu_abort(cs, "Could not map lowcore\n"); } return lowcore; } -void cpu_unmap_lowcore(LowCore *lowcore) +void cpu_unmap_lowcore(CPUS390XState *env, LowCore *lowcore) { - cpu_physical_memory_unmap(lowcore, sizeof(LowCore), 1, sizeof(LowCore)); + AddressSpace *as = env_cpu(env)->as; + + address_space_unmap(as, lowcore, sizeof(LowCore), true, sizeof(LowCore)); } void do_restart_interrupt(CPUS390XState *env) @@ -134,7 
+138,7 @@ void do_restart_interrupt(CPUS390XState *env) mask = be64_to_cpu(lowcore->restart_new_psw.mask); addr = be64_to_cpu(lowcore->restart_new_psw.addr); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); env->pending_int &= ~INTERRUPT_RESTART; s390_cpu_set_psw(env, mask, addr); @@ -177,109 +181,3 @@ void s390_cpu_recompute_watchpoints(CPUState *cs) wp_flags, NULL); } } - -typedef struct SigpSaveArea { - uint64_t fprs[16]; /* 0x0000 */ - uint64_t grs[16]; /* 0x0080 */ - PSW psw; /* 0x0100 */ - uint8_t pad_0x0110[0x0118 - 0x0110]; /* 0x0110 */ - uint32_t prefix; /* 0x0118 */ - uint32_t fpc; /* 0x011c */ - uint8_t pad_0x0120[0x0124 - 0x0120]; /* 0x0120 */ - uint32_t todpr; /* 0x0124 */ - uint64_t cputm; /* 0x0128 */ - uint64_t ckc; /* 0x0130 */ - uint8_t pad_0x0138[0x0140 - 0x0138]; /* 0x0138 */ - uint32_t ars[16]; /* 0x0140 */ - uint64_t crs[16]; /* 0x0384 */ -} SigpSaveArea; -QEMU_BUILD_BUG_ON(sizeof(SigpSaveArea) != 512); - -int s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) -{ - static const uint8_t ar_id = 1; - SigpSaveArea *sa; - hwaddr len = sizeof(*sa); - int i; - - /* For PVMs storing will occur when this cpu enters SIE again */ - if (s390_is_pv()) { - return 0; - } - - sa = cpu_physical_memory_map(addr, &len, true); - if (!sa) { - return -EFAULT; - } - if (len != sizeof(*sa)) { - cpu_physical_memory_unmap(sa, len, 1, 0); - return -EFAULT; - } - - if (store_arch) { - cpu_physical_memory_write(offsetof(LowCore, ar_access_id), &ar_id, 1); - } - for (i = 0; i < 16; ++i) { - sa->fprs[i] = cpu_to_be64(*get_freg(&cpu->env, i)); - } - for (i = 0; i < 16; ++i) { - sa->grs[i] = cpu_to_be64(cpu->env.regs[i]); - } - sa->psw.addr = cpu_to_be64(cpu->env.psw.addr); - sa->psw.mask = cpu_to_be64(s390_cpu_get_psw_mask(&cpu->env)); - sa->prefix = cpu_to_be32(cpu->env.psa); - sa->fpc = cpu_to_be32(cpu->env.fpc); - sa->todpr = cpu_to_be32(cpu->env.todpr); - sa->cputm = cpu_to_be64(cpu->env.cputm); - sa->ckc = cpu_to_be64(cpu->env.ckc >> 8); - for (i = 0; i < 16; ++i) { - sa->ars[i] = cpu_to_be32(cpu->env.aregs[i]); - } - for (i = 0; i < 16; ++i) { - sa->crs[i] = cpu_to_be64(cpu->env.cregs[i]); - } - - cpu_physical_memory_unmap(sa, len, 1, len); - - return 0; -} - -typedef struct SigpAdtlSaveArea { - uint64_t vregs[32][2]; /* 0x0000 */ - uint8_t pad_0x0200[0x0400 - 0x0200]; /* 0x0200 */ - uint64_t gscb[4]; /* 0x0400 */ - uint8_t pad_0x0420[0x1000 - 0x0420]; /* 0x0420 */ -} SigpAdtlSaveArea; -QEMU_BUILD_BUG_ON(sizeof(SigpAdtlSaveArea) != 4096); - -#define ADTL_GS_MIN_SIZE 2048 /* minimal size of adtl save area for GS */ -int s390_store_adtl_status(S390CPU *cpu, hwaddr addr, hwaddr len) -{ - SigpAdtlSaveArea *sa; - hwaddr save = len; - int i; - - sa = cpu_physical_memory_map(addr, &save, true); - if (!sa) { - return -EFAULT; - } - if (save != len) { - cpu_physical_memory_unmap(sa, len, 1, 0); - return -EFAULT; - } - - if (s390_has_feat(S390_FEAT_VECTOR)) { - for (i = 0; i < 32; i++) { - sa->vregs[i][0] = cpu_to_be64(cpu->env.vregs[i][0]); - sa->vregs[i][1] = cpu_to_be64(cpu->env.vregs[i][1]); - } - } - if (s390_has_feat(S390_FEAT_GUARDED_STORAGE) && len >= ADTL_GS_MIN_SIZE) { - for (i = 0; i < 4; i++) { - sa->gscb[i] = cpu_to_be64(cpu->env.gscb[i]); - } - } - - cpu_physical_memory_unmap(sa, len, 1, len); - return 0; -} diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h index 56cce2e..9691366 100644 --- a/target/s390x/s390x-internal.h +++ b/target/s390x/s390x-internal.h @@ -323,11 +323,8 @@ void s390x_cpu_timer(void *opaque); void 
s390_handle_wait(S390CPU *cpu); hwaddr s390_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); hwaddr s390_cpu_get_phys_addr_debug(CPUState *cpu, vaddr addr); -#define S390_STORE_STATUS_DEF_ADDR offsetof(LowCore, floating_pt_save_area) -int s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch); -int s390_store_adtl_status(S390CPU *cpu, hwaddr addr, hwaddr len); LowCore *cpu_map_lowcore(CPUS390XState *env); -void cpu_unmap_lowcore(LowCore *lowcore); +void cpu_unmap_lowcore(CPUS390XState *env, LowCore *lowcore); #endif /* CONFIG_USER_ONLY */ diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c index 5e95c497..f5d7bc0 100644 --- a/target/s390x/sigp.c +++ b/target/s390x/sigp.c @@ -13,12 +13,14 @@ #include "s390x-internal.h" #include "hw/boards.h" #include "system/hw_accel.h" +#include "system/memory.h" #include "system/runstate.h" #include "system/address-spaces.h" #include "exec/cputlb.h" #include "system/tcg.h" #include "trace.h" #include "qapi/qapi-types-machine.h" +#include "target/s390x/kvm/pv.h" QemuMutex qemu_sigp_mutex; @@ -126,6 +128,78 @@ static void sigp_stop(CPUState *cs, run_on_cpu_data arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } +typedef struct SigpSaveArea { + uint64_t fprs[16]; /* 0x0000 */ + uint64_t grs[16]; /* 0x0080 */ + PSW psw; /* 0x0100 */ + uint8_t pad_0x0110[0x0118 - 0x0110]; /* 0x0110 */ + uint32_t prefix; /* 0x0118 */ + uint32_t fpc; /* 0x011c */ + uint8_t pad_0x0120[0x0124 - 0x0120]; /* 0x0120 */ + uint32_t todpr; /* 0x0124 */ + uint64_t cputm; /* 0x0128 */ + uint64_t ckc; /* 0x0130 */ + uint8_t pad_0x0138[0x0140 - 0x0138]; /* 0x0138 */ + uint32_t ars[16]; /* 0x0140 */ + uint64_t crs[16]; /* 0x0384 */ +} SigpSaveArea; +QEMU_BUILD_BUG_ON(sizeof(SigpSaveArea) != 512); + +#define S390_STORE_STATUS_DEF_ADDR offsetof(LowCore, floating_pt_save_area) +static int s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) +{ + const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; + AddressSpace *as = CPU(cpu)->as; + SigpSaveArea *sa; + hwaddr len = sizeof(*sa); + int i; + + /* For PVMs storing will occur when this cpu enters SIE again */ + if (s390_is_pv()) { + return 0; + } + + sa = address_space_map(as, addr, &len, true, attrs); + if (!sa) { + return -EFAULT; + } + if (len != sizeof(*sa)) { + address_space_unmap(as, sa, len, true, 0); + return -EFAULT; + } + + if (store_arch) { + static const uint8_t ar_id = 1; + + address_space_stb(as, offsetof(LowCore, ar_access_id), + ar_id, attrs, NULL); + + } + for (i = 0; i < 16; ++i) { + sa->fprs[i] = cpu_to_be64(*get_freg(&cpu->env, i)); + } + for (i = 0; i < 16; ++i) { + sa->grs[i] = cpu_to_be64(cpu->env.regs[i]); + } + sa->psw.addr = cpu_to_be64(cpu->env.psw.addr); + sa->psw.mask = cpu_to_be64(s390_cpu_get_psw_mask(&cpu->env)); + sa->prefix = cpu_to_be32(cpu->env.psa); + sa->fpc = cpu_to_be32(cpu->env.fpc); + sa->todpr = cpu_to_be32(cpu->env.todpr); + sa->cputm = cpu_to_be64(cpu->env.cputm); + sa->ckc = cpu_to_be64(cpu->env.ckc >> 8); + for (i = 0; i < 16; ++i) { + sa->ars[i] = cpu_to_be32(cpu->env.aregs[i]); + } + for (i = 0; i < 16; ++i) { + sa->crs[i] = cpu_to_be64(cpu->env.cregs[i]); + } + + address_space_unmap(as, sa, len, true, len); + + return 0; +} + static void sigp_stop_and_store_status(CPUState *cs, run_on_cpu_data arg) { S390CPU *cpu = S390_CPU(cs); @@ -172,6 +246,49 @@ static void sigp_store_status_at_address(CPUState *cs, run_on_cpu_data arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } +typedef struct SigpAdtlSaveArea { + uint64_t vregs[32][2]; /* 0x0000 */ + uint8_t pad_0x0200[0x0400 - 0x0200]; /* 0x0200 */ + 
uint64_t gscb[4]; /* 0x0400 */ + uint8_t pad_0x0420[0x1000 - 0x0420]; /* 0x0420 */ +} SigpAdtlSaveArea; +QEMU_BUILD_BUG_ON(sizeof(SigpAdtlSaveArea) != 4096); + +#define ADTL_GS_MIN_SIZE 2048 /* minimal size of adtl save area for GS */ +static int s390_store_adtl_status(S390CPU *cpu, hwaddr addr, hwaddr len) +{ + const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; + AddressSpace *as = CPU(cpu)->as; + SigpAdtlSaveArea *sa; + hwaddr save = len; + int i; + + sa = address_space_map(as, addr, &save, true, attrs); + if (!sa) { + return -EFAULT; + } + if (save != len) { + address_space_unmap(as, sa, len, true, 0); + return -EFAULT; + } + + if (s390_has_feat(S390_FEAT_VECTOR)) { + for (i = 0; i < 32; i++) { + sa->vregs[i][0] = cpu_to_be64(cpu->env.vregs[i][0]); + sa->vregs[i][1] = cpu_to_be64(cpu->env.vregs[i][1]); + } + } + if (s390_has_feat(S390_FEAT_GUARDED_STORAGE) && len >= ADTL_GS_MIN_SIZE) { + for (i = 0; i < 4; i++) { + sa->gscb[i] = cpu_to_be64(cpu->env.gscb[i]); + } + } + + address_space_unmap(as, sa, len, true, len); + + return 0; +} + #define ADTL_SAVE_LC_MASK 0xfUL static void sigp_store_adtl_status(CPUState *cs, run_on_cpu_data arg) { diff --git a/target/s390x/tcg/excp_helper.c b/target/s390x/tcg/excp_helper.c index 4c7faee..0ae4e26 100644 --- a/target/s390x/tcg/excp_helper.c +++ b/target/s390x/tcg/excp_helper.c @@ -30,6 +30,7 @@ #ifndef CONFIG_USER_ONLY #include "qemu/timer.h" #include "system/address-spaces.h" +#include "system/memory.h" #include "hw/s390x/ioinst.h" #include "hw/s390x/s390_flic.h" #include "hw/boards.h" @@ -284,7 +285,7 @@ static void do_program_interrupt(CPUS390XState *env) addr = be64_to_cpu(lowcore->program_new_psw.addr); lowcore->per_breaking_event_addr = cpu_to_be64(env->gbea); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); s390_cpu_set_psw(env, mask, addr); } @@ -303,7 +304,7 @@ static void do_svc_interrupt(CPUS390XState *env) mask = be64_to_cpu(lowcore->svc_new_psw.mask); addr = be64_to_cpu(lowcore->svc_new_psw.addr); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); s390_cpu_set_psw(env, mask, addr); @@ -377,7 +378,7 @@ static void do_ext_interrupt(CPUS390XState *env) lowcore->external_old_psw.mask = cpu_to_be64(s390_cpu_get_psw_mask(env)); lowcore->external_old_psw.addr = cpu_to_be64(env->psw.addr); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); s390_cpu_set_psw(env, mask, addr); } @@ -404,7 +405,7 @@ static void do_io_interrupt(CPUS390XState *env) mask = be64_to_cpu(lowcore->io_new_psw.mask); addr = be64_to_cpu(lowcore->io_new_psw.addr); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); g_free(io); s390_cpu_set_psw(env, mask, addr); @@ -418,16 +419,18 @@ QEMU_BUILD_BUG_ON(sizeof(MchkExtSaveArea) != 1024); static int mchk_store_vregs(CPUS390XState *env, uint64_t mcesao) { + const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; + AddressSpace *as = env_cpu(env)->as; hwaddr len = sizeof(MchkExtSaveArea); MchkExtSaveArea *sa; int i; - sa = cpu_physical_memory_map(mcesao, &len, true); + sa = address_space_map(as, mcesao, &len, true, attrs); if (!sa) { return -EFAULT; } if (len != sizeof(MchkExtSaveArea)) { - cpu_physical_memory_unmap(sa, len, 1, 0); + address_space_unmap(as, sa, len, true, 0); return -EFAULT; } @@ -436,7 +439,7 @@ static int mchk_store_vregs(CPUS390XState *env, uint64_t mcesao) sa->vregs[i][1] = cpu_to_be64(env->vregs[i][1]); } - cpu_physical_memory_unmap(sa, len, 1, len); + address_space_unmap(as, sa, len, true, len); return 0; } @@ -488,7 +491,7 @@ static void 
do_mchk_interrupt(CPUS390XState *env) mask = be64_to_cpu(lowcore->mcck_new_psw.mask); addr = be64_to_cpu(lowcore->mcck_new_psw.addr); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); s390_cpu_set_psw(env, mask, addr); } diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c index f7101be..6d9d601 100644 --- a/target/s390x/tcg/misc_helper.c +++ b/target/s390x/tcg/misc_helper.c @@ -570,7 +570,7 @@ uint32_t HELPER(tpi)(CPUS390XState *env, uint64_t addr) lowcore->subchannel_nr = cpu_to_be16(io->nr); lowcore->io_int_parm = cpu_to_be32(io->parm); lowcore->io_int_word = cpu_to_be32(io->word); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); } g_free(io); @@ -700,7 +700,7 @@ void HELPER(stfl)(CPUS390XState *env) lowcore = cpu_map_lowcore(env); prepare_stfl(); memcpy(&lowcore->stfl_fac_list, stfl_bytes, sizeof(lowcore->stfl_fac_list)); - cpu_unmap_lowcore(lowcore); + cpu_unmap_lowcore(env, lowcore); } #endif diff --git a/tests/Makefile.include b/tests/Makefile.include index 62a4fc8..e47ef4d 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -105,11 +105,11 @@ check-venv: $(TESTS_VENV_TOKEN) FUNCTIONAL_TARGETS=$(patsubst %-softmmu,check-functional-%, $(filter %-softmmu,$(TARGETS))) .PHONY: $(FUNCTIONAL_TARGETS) -$(FUNCTIONAL_TARGETS): +$(FUNCTIONAL_TARGETS): check-venv @$(MAKE) SPEED=thorough $(subst -functional,-func,$@) .PHONY: check-functional -check-functional: +check-functional: check-venv @$(NINJA) precache-functional @QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func check-func-quick diff --git a/tests/data/acpi/loongarch64/virt/DSDT b/tests/data/acpi/loongarch64/virt/DSDT Binary files differ index b31841a..55aa34f 100644 --- a/tests/data/acpi/loongarch64/virt/DSDT +++ b/tests/data/acpi/loongarch64/virt/DSDT diff --git a/tests/data/acpi/loongarch64/virt/DSDT.memhp b/tests/data/acpi/loongarch64/virt/DSDT.memhp Binary files differ index e291200..c0955eb 100644 --- a/tests/data/acpi/loongarch64/virt/DSDT.memhp +++ b/tests/data/acpi/loongarch64/virt/DSDT.memhp diff --git a/tests/data/acpi/loongarch64/virt/DSDT.numamem b/tests/data/acpi/loongarch64/virt/DSDT.numamem Binary files differ index 07923ac..61e47e7 100644 --- a/tests/data/acpi/loongarch64/virt/DSDT.numamem +++ b/tests/data/acpi/loongarch64/virt/DSDT.numamem diff --git a/tests/data/acpi/loongarch64/virt/DSDT.topology b/tests/data/acpi/loongarch64/virt/DSDT.topology Binary files differ index 6dfbb49..b2afebc 100644 --- a/tests/data/acpi/loongarch64/virt/DSDT.topology +++ b/tests/data/acpi/loongarch64/virt/DSDT.topology diff --git a/tests/docker/common.rc b/tests/docker/common.rc index 752f4f3..79d533a 100755 --- a/tests/docker/common.rc +++ b/tests/docker/common.rc @@ -53,8 +53,8 @@ configure_qemu() config_opts="--enable-werror \ ${TARGET_LIST:+--target-list=${TARGET_LIST}} \ --prefix=$INSTALL_DIR \ - $QEMU_CONFIGURE_OPTS $EXTRA_CONFIGURE_OPTS \ $enable_rust \ + $QEMU_CONFIGURE_OPTS $EXTRA_CONFIGURE_OPTS \ $@" echo "Configure options:" echo $config_opts diff --git a/tests/docker/dockerfiles/debian-i686-cross.docker b/tests/docker/dockerfiles/debian-i686-cross.docker index 4e8b3a8..2998764 100644 --- a/tests/docker/dockerfiles/debian-i686-cross.docker +++ b/tests/docker/dockerfiles/debian-i686-cross.docker @@ -178,7 +178,7 @@ ENV ABI "i686-linux-gnu" ENV MESON_OPTS "--cross-file=i686-linux-gnu" ENV RUST_TARGET "i686-unknown-linux-gnu" ENV QEMU_CONFIGURE_OPTS --cross-prefix=i686-linux-gnu- -ENV DEF_TARGET_LIST
x86_64-softmmu,x86_64-linux-user,i386-softmmu,i386-linux-user +ENV DEF_TARGET_LIST i386-softmmu,i386-linux-user # As a final step configure the user (if env is defined) ARG USER ARG UID diff --git a/tests/functional/aarch64/test_device_passthrough.py b/tests/functional/aarch64/test_device_passthrough.py index 1743778..05a3f52 100755 --- a/tests/functional/aarch64/test_device_passthrough.py +++ b/tests/functional/aarch64/test_device_passthrough.py @@ -85,8 +85,8 @@ class Aarch64DevicePassthrough(QemuSystemTest): # https://docs.kernel.org/driver-api/vfio.html#vfio-device-cde ASSET_DEVICE_PASSTHROUGH_STACK = Asset( ('https://github.com/pbo-linaro/qemu-linux-stack/' - 'releases/download/build/device_passthrough-c3fb84a.tar.xz'), - '15ac2b02bed0c0ea8e3e007de0bcfdaf6fd51c1ba98213f841dc7d01d6f72f04') + 'releases/download/build/device_passthrough-a9612a2.tar.xz'), + 'f7d2f70912e7231986e6e293e1a2c4786dd02bec113a7acb6bfc619e96155455') # This tests the device passthrough implementation, by booting a VM # supporting it with two nvme disks attached, and launching a nested VM diff --git a/tests/functional/aarch64/test_reverse_debug.py b/tests/functional/aarch64/test_reverse_debug.py index 8bc91cc..ec3348c 100755 --- a/tests/functional/aarch64/test_reverse_debug.py +++ b/tests/functional/aarch64/test_reverse_debug.py @@ -2,36 +2,34 @@ # # SPDX-License-Identifier: GPL-2.0-or-later # -# Reverse debugging test +# Reverse debugging test for aarch64 # # Copyright (c) 2020 ISP RAS +# Copyright (c) 2025 Linaro Limited # # Author: # Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> +# Gustavo Romero <gustavo.romero@linaro.org> (Run without Avocado) # # This work is licensed under the terms of the GNU GPL, version 2 or # later. See the COPYING file in the top-level directory. -from qemu_test import Asset, skipIfMissingImports, skipFlakyTest +from qemu_test import Asset, skipFlakyTest from reverse_debugging import ReverseDebugging -@skipIfMissingImports('avocado.utils') class ReverseDebugging_AArch64(ReverseDebugging): - REG_PC = 32 - ASSET_KERNEL = Asset( ('https://archives.fedoraproject.org/pub/archive/fedora/linux/' 'releases/29/Everything/aarch64/os/images/pxeboot/vmlinuz'), '7e1430b81c26bdd0da025eeb8fbd77b5dc961da4364af26e771bd39f379cbbf7') - @skipFlakyTest("https://gitlab.com/qemu-project/qemu/-/issues/2921") def test_aarch64_virt(self): self.set_machine('virt') self.cpu = 'cortex-a53' kernel_path = self.ASSET_KERNEL.fetch() - self.reverse_debugging(args=('-kernel', kernel_path)) + self.reverse_debugging(gdb_arch='aarch64', args=('-kernel', kernel_path)) if __name__ == '__main__': diff --git a/tests/functional/aarch64/test_rme_sbsaref.py b/tests/functional/aarch64/test_rme_sbsaref.py index ca892e0..6f92858 100755 --- a/tests/functional/aarch64/test_rme_sbsaref.py +++ b/tests/functional/aarch64/test_rme_sbsaref.py @@ -25,8 +25,8 @@ class Aarch64RMESbsaRefMachine(QemuSystemTest): # ./build.sh && ./archive_artifacts.sh out.tar.xz ASSET_RME_STACK_SBSA = Asset( ('https://github.com/pbo-linaro/qemu-linux-stack/' - 'releases/download/build/rme_sbsa_release-a7f02cf.tar.xz'), - '27d8400b11befb828d6db0cab97e7ae102d0992c928d3dfbf38b24b6cf6c324c') + 'releases/download/build/rme_sbsa_release-6a2dfc5.tar.xz'), + '5adba482aa069912292a8da746c6b21268224d9d81c97fe7c0bed690579ebdcb') # This tests the FEAT_RME cpu implementation, by booting a VM supporting it, # and launching a nested VM using it. 
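The aarch64 conversion above is the whole of the new per-target interface: pick a machine, name the GDB architecture, and pass the boot arguments. A minimal sketch of a port to a further target (the target, machine type, kernel URL and hash below are hypothetical placeholders, not part of this series):

    from qemu_test import Asset
    from reverse_debugging import ReverseDebugging

    class ReverseDebugging_Example(ReverseDebugging):
        # Hypothetical kernel asset; any image the machine boots will do.
        ASSET_KERNEL = Asset(
            'https://example.org/vmlinuz',
            '0000000000000000000000000000000000000000000000000000000000000000')

        def test_example_virt(self):
            self.set_machine('virt')  # hypothetical machine type
            kernel_path = self.ASSET_KERNEL.fetch()
            # gdb_arch must match a name gdb's "set architecture" accepts.
            self.reverse_debugging(gdb_arch='riscv:rv64',
                                   args=('-kernel', kernel_path))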
diff --git a/tests/functional/aarch64/test_rme_virt.py b/tests/functional/aarch64/test_rme_virt.py index bb603aa..5e23773 100755 --- a/tests/functional/aarch64/test_rme_virt.py +++ b/tests/functional/aarch64/test_rme_virt.py @@ -23,8 +23,8 @@ class Aarch64RMEVirtMachine(QemuSystemTest): # ./build.sh && ./archive_artifacts.sh out.tar.xz ASSET_RME_STACK_VIRT = Asset( ('https://github.com/pbo-linaro/qemu-linux-stack/' - 'releases/download/build/rme_release-86101e5.tar.xz'), - 'e42fef8439badb52a071ac446fc33cff4cb7d61314c7a28fdbe61a11e1faad3a') + 'releases/download/build/rme_release-56bc99e.tar.xz'), + '0e3dc6b8a4b828dbae09c951a40dcb710eded084b32432b50c69cf4173ffa4be') # This tests the FEAT_RME cpu implementation, by booting a VM supporting it, # and launching a nested VM using it. diff --git a/tests/functional/meson.build b/tests/functional/meson.build index 2a0c5aa..725630d 100644 --- a/tests/functional/meson.build +++ b/tests/functional/meson.build @@ -77,6 +77,12 @@ foreach speed : ['quick', 'thorough'] test_env.set('PYTHONPATH', meson.project_source_root() / 'python:' + meson.current_source_dir()) + # Define the GDB environment variable if gdb is available. + gdb = get_option('gdb') + if gdb != '' + test_env.set('QEMU_TEST_GDB', gdb) + endif + foreach test : target_tests testname = '@0@-@1@'.format(target_base, test) if fs.exists('generic' / 'test_' + test + '.py') diff --git a/tests/functional/ppc64/test_reverse_debug.py b/tests/functional/ppc64/test_reverse_debug.py index 5931ade..69551fb 100755 --- a/tests/functional/ppc64/test_reverse_debug.py +++ b/tests/functional/ppc64/test_reverse_debug.py @@ -2,39 +2,36 @@ # # SPDX-License-Identifier: GPL-2.0-or-later # -# Reverse debugging test +# Reverse debugging test for ppc64 # # Copyright (c) 2020 ISP RAS +# Copyright (c) 2025 Linaro Limited # # Author: # Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> +# Gustavo Romero <gustavo.romero@linaro.org> (Run without Avocado) # # This work is licensed under the terms of the GNU GPL, version 2 or # later. See the COPYING file in the top-level directory. -from qemu_test import skipIfMissingImports, skipFlakyTest +from qemu_test import skipFlakyTest from reverse_debugging import ReverseDebugging -@skipIfMissingImports('avocado.utils') class ReverseDebugging_ppc64(ReverseDebugging): - REG_PC = 0x40 - @skipFlakyTest("https://gitlab.com/qemu-project/qemu/-/issues/1992") def test_ppc64_pseries(self): self.set_machine('pseries') # SLOF branches back to its entry point, which causes this test # to take the 'hit a breakpoint again' path. That's not a problem, # just slightly different than the other machines. 
- self.endian_is_le = False - self.reverse_debugging() + self.reverse_debugging(gdb_arch='powerpc:common64') @skipFlakyTest("https://gitlab.com/qemu-project/qemu/-/issues/1992") def test_ppc64_powernv(self): self.set_machine('powernv') - self.endian_is_le = False - self.reverse_debugging() + self.reverse_debugging(gdb_arch='powerpc:common64') if __name__ == '__main__': diff --git a/tests/functional/qemu_test/__init__.py b/tests/functional/qemu_test/__init__.py index 6e666a0..3201935 100644 --- a/tests/functional/qemu_test/__init__.py +++ b/tests/functional/qemu_test/__init__.py @@ -15,6 +15,8 @@ from .testcase import QemuBaseTest, QemuUserTest, QemuSystemTest from .linuxkernel import LinuxKernelTest from .decorators import skipIfMissingCommands, skipIfNotMachine, \ skipFlakyTest, skipUntrustedTest, skipBigDataTest, skipSlowTest, \ - skipIfMissingImports, skipIfOperatingSystem, skipLockedMemoryTest + skipIfMissingImports, skipIfOperatingSystem, skipLockedMemoryTest, \ + skipIfMissingEnv from .archive import archive_extract from .uncompress import uncompress +from .gdb import GDB diff --git a/tests/functional/qemu_test/asset.py b/tests/functional/qemu_test/asset.py index 2971a98..f666125 100644 --- a/tests/functional/qemu_test/asset.py +++ b/tests/functional/qemu_test/asset.py @@ -225,7 +225,6 @@ class Asset: log.addHandler(handler) for name, asset in vars(test.__class__).items(): if name.startswith("ASSET_") and type(asset) == Asset: - log.info("Attempting to cache '%s'" % asset) try: asset.fetch() except AssetError as e: diff --git a/tests/functional/qemu_test/decorators.py b/tests/functional/qemu_test/decorators.py index c0d1567..b239295 100644 --- a/tests/functional/qemu_test/decorators.py +++ b/tests/functional/qemu_test/decorators.py @@ -11,6 +11,24 @@ from unittest import skipIf, skipUnless from .cmd import which ''' +Decorator to skip execution of a test if the provided +environment variables are not set. +Example: + + @skipIfMissingEnv("QEMU_ENV_VAR0", "QEMU_ENV_VAR1") ''' +def skipIfMissingEnv(*vars_): + missing_vars = [] + for var in vars_: + if os.getenv(var) is None: + missing_vars.append(var) + + has_vars = not missing_vars + + return skipUnless(has_vars, f"Missing env var(s): {', '.join(missing_vars)}") + +''' + Decorator to skip execution of a test if the list of command binaries is not available in $PATH. Example: diff --git a/tests/functional/qemu_test/gdb.py b/tests/functional/qemu_test/gdb.py new file mode 100644 index 0000000..558d476 --- /dev/null +++ b/tests/functional/qemu_test/gdb.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# A simple interface module built around pygdbmi for handling GDB commands. +# +# Copyright (c) 2025 Linaro Limited +# +# Author: +# Gustavo Romero <gustavo.romero@linaro.org> +# + +import re + + +class GDB: + """Provides methods to run and capture GDB command output.""" + + + def __init__(self, gdb_path, echo=True, suffix='# ', prompt="$ "): + from pygdbmi.gdbcontroller import GdbController + from pygdbmi.constants import GdbTimeoutError + type(self).TimeoutError = GdbTimeoutError + + gdb_cmd = [gdb_path, "-q", "--interpreter=mi2"] + self.gdbmi = GdbController(gdb_cmd) + self.echo = echo + self.suffix = suffix + self.prompt = prompt + self.response = None + self.cmd_output = None + + + def get_payload(self, response, kind): + output = [] + for o in response: + # Unpack payloads of the same type.
+ _type, _, payload, *_ = o.values() + if _type == kind: + output += [payload] + + # Some output lines do not end with \n but begin with it, + # so remove the leading \n and merge them with the next line + # that ends with \n. + lines = [line.lstrip('\n') for line in output] + lines = "".join(lines) + lines = lines.splitlines(keepends=True) + + return lines + + + def cli(self, cmd, timeout=32.0): + self.response = self.gdbmi.write(cmd, timeout_sec=timeout) + self.cmd_output = self.get_payload(self.response, kind="console") + if self.echo: + print(self.suffix + self.prompt + cmd) + + if len(self.cmd_output) > 0: + cmd_output = self.suffix.join(self.cmd_output) + print(self.suffix + cmd_output, end="") + + return self + + + def get_addr(self): + address_pattern = r"0x[0-9A-Fa-f]+" + cmd_output = "".join(self.cmd_output) # Concat output lines. + + match = re.search(address_pattern, cmd_output) + + return int(match[0], 16) if match else None + + + def get_log(self): + r = self.get_payload(self.response, kind="log") + r = "".join(r) + + return r + + + def get_console(self): + r = "".join(self.cmd_output) + + return r + + + def exit(self): + self.gdbmi.exit() diff --git a/tests/functional/reverse_debugging.py b/tests/functional/reverse_debugging.py index f9a1d39..68cfcb3 100644 --- a/tests/functional/reverse_debugging.py +++ b/tests/functional/reverse_debugging.py @@ -1,18 +1,23 @@ -# Reverse debugging test -# # SPDX-License-Identifier: GPL-2.0-or-later # +# Reverse debugging test +# # Copyright (c) 2020 ISP RAS +# Copyright (c) 2025 Linaro Limited # # Author: # Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> +# Gustavo Romero <gustavo.romero@linaro.org> (Run without Avocado) # # This work is licensed under the terms of the GNU GPL, version 2 or # later. See the COPYING file in the top-level directory. -import os + import logging +import os +from subprocess import check_output -from qemu_test import LinuxKernelTest, get_qemu_img +from qemu_test import LinuxKernelTest, get_qemu_img, GDB, \ + skipIfMissingEnv, skipIfMissingImports from qemu_test.ports import Ports @@ -28,13 +33,9 @@ class ReverseDebugging(LinuxKernelTest): that the execution is stopped at the last of them. 
""" - timeout = 10 STEPS = 10 - endian_is_le = True def run_vm(self, record, shift, args, replay_path, image_path, port): - from avocado.utils import datadrainer - logger = logging.getLogger('replay') vm = self.get_vm(name='record' if record else 'replay') vm.set_console() @@ -52,55 +53,20 @@ class ReverseDebugging(LinuxKernelTest): if args: vm.add_args(*args) vm.launch() - console_drainer = datadrainer.LineLogger(vm.console_socket.fileno(), - logger=self.log.getChild('console'), - stop_check=(lambda : not vm.is_running())) - console_drainer.start() return vm @staticmethod - def get_reg_le(g, reg): - res = g.cmd(b'p%x' % reg) - num = 0 - for i in range(len(res))[-2::-2]: - num = 0x100 * num + int(res[i:i + 2], 16) - return num - - @staticmethod - def get_reg_be(g, reg): - res = g.cmd(b'p%x' % reg) - return int(res, 16) - - def get_reg(self, g, reg): - # value may be encoded in BE or LE order - if self.endian_is_le: - return self.get_reg_le(g, reg) - else: - return self.get_reg_be(g, reg) - - def get_pc(self, g): - return self.get_reg(g, self.REG_PC) - - def check_pc(self, g, addr): - pc = self.get_pc(g) - if pc != addr: - self.fail('Invalid PC (read %x instead of %x)' % (pc, addr)) - - @staticmethod - def gdb_step(g): - g.cmd(b's', b'T05thread:01;') - - @staticmethod - def gdb_bstep(g): - g.cmd(b'bs', b'T05thread:01;') + def get_pc(gdb: GDB): + return gdb.cli("print $pc").get_addr() @staticmethod def vm_get_icount(vm): return vm.qmp('query-replay')['return']['icount'] - def reverse_debugging(self, shift=7, args=None): - from avocado.utils import gdb - from avocado.utils import process + @skipIfMissingImports("pygdbmi") # Required by GDB class + @skipIfMissingEnv("QEMU_TEST_GDB") + def reverse_debugging(self, gdb_arch, shift=7, args=None): + from qemu_test import GDB logger = logging.getLogger('replay') @@ -111,8 +77,9 @@ class ReverseDebugging(LinuxKernelTest): if qemu_img is None: self.skipTest('Could not find "qemu-img", which is required to ' 'create the temporary qcow2 image') - cmd = '%s create -f qcow2 %s 128M' % (qemu_img, image_path) - process.run(cmd) + out = check_output([qemu_img, 'create', '-f', 'qcow2', image_path, '128M'], + encoding='utf8') + logger.info("qemu-img: %s" % out) replay_path = os.path.join(self.workdir, 'replay.bin') @@ -129,68 +96,107 @@ class ReverseDebugging(LinuxKernelTest): with Ports() as ports: port = ports.find_free_port() vm = self.run_vm(False, shift, args, replay_path, image_path, port) - logger.info('connecting to gdbstub') - g = gdb.GDBRemote('127.0.0.1', port, False, False) - g.connect() - r = g.cmd(b'qSupported') - if b'qXfer:features:read+' in r: - g.cmd(b'qXfer:features:read:target.xml:0,ffb') - if b'ReverseStep+' not in r: + + try: + logger.info('Connecting to gdbstub...') + self.reverse_debugging_run(vm, port, gdb_arch, last_icount) + logger.info('Test passed.') + except GDB.TimeoutError: + # Convert a GDB timeout exception into a unittest failure exception. + raise self.failureException("Timeout while connecting to or " + "communicating with gdbstub...") from None + except Exception: + # Re-throw exceptions from unittest, like the ones caused by fail(), + # skipTest(), etc. 
+ raise + + def reverse_debugging_run(self, vm, port, gdb_arch, last_icount): + logger = logging.getLogger('replay') + + gdb_cmd = os.getenv('QEMU_TEST_GDB') + gdb = GDB(gdb_cmd) + + r = gdb.cli("set architecture").get_log() + if gdb_arch not in r: + self.skipTest(f"GDB does not support arch '{gdb_arch}'") + + gdb.cli("set debug remote 1") + + c = gdb.cli(f"target remote localhost:{port}").get_console() + if f"Remote debugging using localhost:{port}" not in c: + self.fail("Could not connect to gdbstub!") + + # Remote debug messages are in 'log' payloads. + r = gdb.get_log() + if 'ReverseStep+' not in r: self.fail('Reverse step is not supported by QEMU') - if b'ReverseContinue+' not in r: + if 'ReverseContinue+' not in r: self.fail('Reverse continue is not supported by QEMU') + gdb.cli("set debug remote 0") + logger.info('stepping forward') steps = [] # record first instruction addresses for _ in range(self.STEPS): - pc = self.get_pc(g) + pc = self.get_pc(gdb) logger.info('saving position %x' % pc) steps.append(pc) - self.gdb_step(g) + gdb.cli("stepi") # visit the recorded instruction in reverse order logger.info('stepping backward') for addr in steps[::-1]: - self.gdb_bstep(g) - self.check_pc(g, addr) logger.info('found position %x' % addr) + gdb.cli("reverse-stepi") + pc = self.get_pc(gdb) + if pc != addr: + logger.info('Invalid PC (read %x instead of %x)' % (pc, addr)) + self.fail('Reverse stepping failed!') # visit the recorded instruction in forward order logger.info('stepping forward') for addr in steps: - self.check_pc(g, addr) - self.gdb_step(g) logger.info('found position %x' % addr) + pc = self.get_pc(gdb) + if pc != addr: + logger.info('Invalid PC (read %x instead of %x)' % (pc, addr)) + self.fail('Forward stepping failed!') + gdb.cli("stepi") # set breakpoints for the instructions just stepped over logger.info('setting breakpoints') for addr in steps: - # hardware breakpoint at addr with len=1 - g.cmd(b'Z1,%x,1' % addr, b'OK') + gdb.cli(f"break *{hex(addr)}") # this may hit a breakpoint if first instructions are executed # again logger.info('continuing execution') vm.qmp('replay-break', icount=last_icount - 1) # continue - will return after pausing - # This could stop at the end and get a T02 return, or by - # re-executing one of the breakpoints and get a T05 return. - g.cmd(b'c') + # This can stop at the replay-break icount set above, in which case + # gdb gets a SIGINT, or by re-executing one of the breakpoints, in + # which case gdb stops at that breakpoint.
+ gdb.cli("continue") + if self.vm_get_icount(vm) == last_icount - 1: logger.info('reached the end (icount %s)' % (last_icount - 1)) else: logger.info('hit a breakpoint again at %x (icount %s)' % - (self.get_pc(g), self.vm_get_icount(vm))) + (self.get_pc(gdb), self.vm_get_icount(vm))) logger.info('running reverse continue to reach %x' % steps[-1]) # reverse continue - will return after stopping at the breakpoint - g.cmd(b'bc', b'T05thread:01;') + gdb.cli("reverse-continue") # assume that none of the first instructions is executed again # breaking the order of the breakpoints - self.check_pc(g, steps[-1]) + pc = self.get_pc(gdb) + if pc != steps[-1]: + self.fail("'reverse-continue' did not hit the first PC in reverse order!") + logger.info('successfully reached %x' % steps[-1]) logger.info('exiting gdb and qemu') + gdb.exit() vm.shutdown() diff --git a/tests/functional/x86_64/test_reverse_debug.py b/tests/functional/x86_64/test_reverse_debug.py index d713e91..2b31ae8 100755 --- a/tests/functional/x86_64/test_reverse_debug.py +++ b/tests/functional/x86_64/test_reverse_debug.py @@ -2,34 +2,29 @@ # # SPDX-License-Identifier: GPL-2.0-or-later # -# Reverse debugging test +# Reverse debugging test for x86_64 # # Copyright (c) 2020 ISP RAS +# Copyright (c) 2025 Linaro Limited # # Author: # Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru> +# Gustavo Romero <gustavo.romero@linaro.org> (Run without Avocado) # # This work is licensed under the terms of the GNU GPL, version 2 or # later. See the COPYING file in the top-level directory. -from qemu_test import skipIfMissingImports, skipFlakyTest +from qemu_test import skipFlakyTest from reverse_debugging import ReverseDebugging -@skipIfMissingImports('avocado.utils') class ReverseDebugging_X86_64(ReverseDebugging): - REG_PC = 0x10 - REG_CS = 0x12 - def get_pc(self, g): - return self.get_reg_le(g, self.REG_PC) \ - + self.get_reg_le(g, self.REG_CS) * 0x10 - @skipFlakyTest("https://gitlab.com/qemu-project/qemu/-/issues/2922") def test_x86_64_pc(self): self.set_machine('pc') # start with BIOS only - self.reverse_debugging() + self.reverse_debugging(gdb_arch='x86-64') if __name__ == '__main__': diff --git a/tests/lcitool/refresh b/tests/lcitool/refresh index 6459593..056cfb6 100755 --- a/tests/lcitool/refresh +++ b/tests/lcitool/refresh @@ -216,8 +216,6 @@ try: generate_dockerfile("debian-i686-cross", "debian-13", cross="i686", trailer=cross_build("i686-linux-gnu-", - "x86_64-softmmu," - "x86_64-linux-user," "i386-softmmu,i386-linux-user")) # mips no longer supported in debian-13 @@ -272,8 +270,8 @@ try: # # Ansible package lists # - generate_yaml("ubuntu", "ubuntu-2204", "aarch64") - generate_yaml("ubuntu", "ubuntu-2204", "s390x") + generate_yaml("ubuntu", "ubuntu-2404", "aarch64") + generate_yaml("ubuntu", "ubuntu-2404", "s390x") sys.exit(0) diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index 1755874..55ce34e 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -75,6 +75,11 @@ AARCH64_TESTS += $(SME_TESTS) $(SME_TESTS): CFLAGS += $(CROSS_AS_HAS_ARMV9_SME) endif +# GCS Tests +GCS_TESTS += gcsstr gcspushm gcsss +AARCH64_TESTS += $(GCS_TESTS) +$(GCS_TESTS): gcs.h + # System Registers Tests AARCH64_TESTS += sysregs diff --git a/tests/tcg/aarch64/gcs.h b/tests/tcg/aarch64/gcs.h new file mode 100644 index 0000000..6f013d0 --- /dev/null +++ b/tests/tcg/aarch64/gcs.h @@ -0,0 +1,80 @@ +/* + * Linux kernel fallback API definitions for GCS and test helpers. 
+ * + * Copyright (c) 2025 Linaro Ltd + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <assert.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/syscall.h> + +#ifndef PR_GET_SHADOW_STACK_STATUS +#define PR_GET_SHADOW_STACK_STATUS 74 +#endif +#ifndef PR_SET_SHADOW_STACK_STATUS +#define PR_SET_SHADOW_STACK_STATUS 75 +#endif +#ifndef PR_LOCK_SHADOW_STACK_STATUS +#define PR_LOCK_SHADOW_STACK_STATUS 76 +#endif +#ifndef PR_SHADOW_STACK_ENABLE +# define PR_SHADOW_STACK_ENABLE (1 << 0) +# define PR_SHADOW_STACK_WRITE (1 << 1) +# define PR_SHADOW_STACK_PUSH (1 << 2) +#endif +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN (1 << 0) +#endif +#ifndef SHADOW_STACK_SET_MARKER +#define SHADOW_STACK_SET_MARKER (1 << 1) +#endif +#ifndef SEGV_CPERR +#define SEGV_CPERR 10 +#endif +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +/* + * These helpers are macros, with the syscall implemented inline, lest + * enabling GCS break the checked return from a real function call. + */ +#define enable_gcs(flags) \ + do { \ + register long num __asm__ ("x8") = __NR_prctl; \ + register long arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \ + register long arg2 __asm__ ("x1") = PR_SHADOW_STACK_ENABLE | flags; \ + register long arg3 __asm__ ("x2") = 0; \ + register long arg4 __asm__ ("x3") = 0; \ + register long arg5 __asm__ ("x4") = 0; \ + asm volatile("svc #0" \ + : "+r"(arg1) \ + : "r"(arg2), "r"(arg3), "r"(arg4), "r"(arg5), "r"(num) \ + : "memory", "cc"); \ + if (arg1) { \ + errno = -arg1; \ + perror("PR_SET_SHADOW_STACK_STATUS"); \ + exit(2); \ + } \ + } while (0) + +#define gcspr() \ + ({ uint64_t *r; asm volatile("mrs %0, s3_3_c2_c5_1" : "=r"(r)); r; }) + +#define gcsss1(val) \ + do { \ + asm volatile("sys #3, c7, c7, #2, %0" : : "r"(val) : "memory"); \ + } while (0) + +#define gcsss2() \ + ({ uint64_t *r; \ + asm volatile("sysl %0, #3, c7, c7, #3" : "=r"(r) : : "memory"); r; }) diff --git a/tests/tcg/aarch64/gcspushm.c b/tests/tcg/aarch64/gcspushm.c new file mode 100644 index 0000000..c330417 --- /dev/null +++ b/tests/tcg/aarch64/gcspushm.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "gcs.h" + + +#define GCSPUSHM "sys #3, c7, c7, #0, %[push]" +#define GCSPOPM "sysl %[pop], #3, c7, c7, #1" + +static void test_sigsegv(int sig, siginfo_t *info, void *vuc) +{ + ucontext_t *uc = vuc; + uint64_t inst_sigsegv; + + __asm__("adr %0, inst_sigsegv" : "=r"(inst_sigsegv)); + assert(uc->uc_mcontext.pc == inst_sigsegv); + assert(info->si_code == SEGV_CPERR); + /* TODO: Dig for ESR and verify syndrome.
*/ + uc->uc_mcontext.pc += 4; +} + +static void test_sigill(int sig, siginfo_t *info, void *vuc) +{ + ucontext_t *uc = vuc; + uint64_t inst_sigill; + + __asm__("adr %0, inst_sigill" : "=r"(inst_sigill)); + assert(uc->uc_mcontext.pc == inst_sigill); + assert(info->si_code == ILL_ILLOPC); + uc->uc_mcontext.pc += 4; +} + +int main() +{ + struct sigaction sa = { .sa_flags = SA_SIGINFO }; + uint64_t old, new; + + sa.sa_sigaction = test_sigsegv; + if (sigaction(SIGSEGV, &sa, NULL) < 0) { + perror("sigaction"); + exit(1); + } + + sa.sa_sigaction = test_sigill; + if (sigaction(SIGILL, &sa, NULL) < 0) { + perror("sigaction"); + exit(1); + } + + /* Pushm is disabled -- SIGILL via EC_SYSTEMREGISTERTRAP */ + asm volatile("inst_sigill:\t" GCSPUSHM + : : [push] "r" (1)); + + enable_gcs(PR_SHADOW_STACK_PUSH); + + /* Valid value -- low 2 bits clear */ + old = 0xdeadbeeffeedcaec; + asm volatile(GCSPUSHM "\n\t" GCSPOPM + : [pop] "=r" (new) + : [push] "r" (old) + : "memory"); + assert(old == new); + + /* Invalid value -- SIGSEGV via EC_GCS */ + asm volatile(GCSPUSHM "\n" + "inst_sigsegv:\t" GCSPOPM + : [pop] "=r" (new) + : [push] "r" (1) + : "memory"); + + exit(0); +} diff --git a/tests/tcg/aarch64/gcsss.c b/tests/tcg/aarch64/gcsss.c new file mode 100644 index 0000000..9550c68 --- /dev/null +++ b/tests/tcg/aarch64/gcsss.c @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "gcs.h" + +#define IN_PROGRESS(X) ((uint64_t)(X) | 5) +#define CAP(X) (((uint64_t)(X) & ~0xfff) + 1) + +static uint64_t * __attribute__((noinline)) recurse(size_t index) +{ + if (index == 0) { + return gcspr(); + } + return recurse(index - 1); +} + +int main() +{ + void *tmp; + uint64_t *alt_stack, *alt_cap; + uint64_t *orig_pr, *orig_cap; + uint64_t *bottom; + size_t pagesize = getpagesize(); + size_t words; + + enable_gcs(0); + orig_pr = gcspr(); + + /* Allocate a guard page before and after. */ + tmp = mmap(0, 3 * pagesize, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); + assert(tmp != MAP_FAILED); + + /* map_shadow_stack won't replace existing mappings */ + munmap(tmp + pagesize, pagesize); + + /* Allocate a new stack between the guards. */ + alt_stack = (uint64_t *) + syscall(__NR_map_shadow_stack, tmp + pagesize, pagesize, + SHADOW_STACK_SET_TOKEN); + assert(alt_stack == tmp + pagesize); + + words = pagesize / 8; + alt_cap = alt_stack + words - 1; + + /* SHADOW_STACK_SET_TOKEN set the cap. */ + assert(*alt_cap == CAP(alt_cap)); + + /* Swap to the alt stack, one step at a time. */ + gcsss1(alt_cap); + + assert(gcspr() == alt_cap); + assert(*alt_cap == IN_PROGRESS(orig_pr)); + + orig_cap = gcsss2(); + + assert(orig_cap == orig_pr - 1); + assert(*orig_cap == CAP(orig_cap)); + assert(gcspr() == alt_stack + words); + + /* We should be able to use the whole stack. */ + bottom = recurse(words - 1); + assert(bottom == alt_stack); + + /* We should be back where we started. */ + assert(gcspr() == alt_stack + words); + + /* Swap back to the original stack. */ + gcsss1(orig_cap); + tmp = gcsss2(); + + assert(gcspr() == orig_pr); + assert(tmp == alt_cap); + + exit(0); +} diff --git a/tests/tcg/aarch64/gcsstr.c b/tests/tcg/aarch64/gcsstr.c new file mode 100644 index 0000000..b045aee --- /dev/null +++ b/tests/tcg/aarch64/gcsstr.c @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "gcs.h" + +/* + * A single garbage store to the gcs stack. + * The asm inside must be unique, so disallow inlining. 
+ */ +void __attribute__((noinline)) +test_gcsstr(void) +{ + register uint64_t *ptr __asm__("x0") = gcspr(); + /* GCSSTR x1, x0 */ + __asm__("inst_gcsstr: .inst 0xd91f1c01" : : "r"(--ptr)); +} + +static void test_sigsegv(int sig, siginfo_t *info, void *vuc) +{ + ucontext_t *uc = vuc; + uint64_t inst_gcsstr; + + __asm__("adr %0, inst_gcsstr" : "=r"(inst_gcsstr)); + assert(uc->uc_mcontext.pc == inst_gcsstr); + assert(info->si_code == SEGV_CPERR); + /* TODO: Dig for ESR and verify syndrome. */ + exit(0); +} + +int main() +{ + struct sigaction sa = { + .sa_sigaction = test_sigsegv, + .sa_flags = SA_SIGINFO, + }; + + /* Enable GCSSTR and test the store succeeds. */ + enable_gcs(PR_SHADOW_STACK_WRITE); + test_gcsstr(); + + /* Disable GCSSTR and test the resulting sigsegv. */ + enable_gcs(0); + if (sigaction(SIGSEGV, &sa, NULL) < 0) { + perror("sigaction"); + exit(1); + } + test_gcsstr(); + abort(); +}
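The new qemu_test.GDB wrapper can also be driven directly, which is how reverse_debugging_run() uses it. A minimal usage sketch (the gdb binary path and port are hypothetical; the tests take them from QEMU_TEST_GDB and a free port found via Ports()):

    from qemu_test import GDB

    gdb = GDB('/usr/bin/gdb')                # spawns: gdb -q --interpreter=mi2
    gdb.cli('target remote localhost:1234')  # attach to a waiting gdbstub
    pc = gdb.cli('print $pc').get_addr()     # cli() returns self; get_addr()
                                             # parses the first hex number
    gdb.cli('stepi')                         # any console command works
    gdb.exit()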