diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-18 11:14:31 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-18 11:14:31 -0400 |
commit | 214a8da23651f2472b296b3293e619fd58d9e212 (patch) | |
tree | 467317b595f2d19b68121d4ef5f843817c7783bb | |
parent | 2c65091fd9d387b8dca8115dbdd9c3c61f658a9e (diff) | |
parent | 653fad2497bed71d938827299cb9ac38ac333f9b (diff) | |
download | qemu-214a8da23651f2472b296b3293e619fd58d9e212.zip qemu-214a8da23651f2472b296b3293e619fd58d9e212.tar.gz qemu-214a8da23651f2472b296b3293e619fd58d9e212.tar.bz2 |
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging
* configure: don't enable firmware for targets that are not built
* configure: don't use strings(1)
* scsi, target/i386: switch from device_legacy_reset() to device_cold_reset()
* target/i386: AVX support for TCG
* target/i386: fix SynIC SINT assertion failure on guest reset
* target/i386: Use atomic operations for pte updates and other cleanups
* tests/tcg: extend SSE tests to AVX
* virtio-scsi: send "REPORTED LUNS CHANGED" sense data upon disk hotplug events
# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmNOlOcUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroNuvwgAj/Z5pI9KU33XiWKFR3bZf2lHh21P
# xmTzNtPmnP1WHDY1DNug/UB+BLg3c+carpTf5n3B8aKI4X3FfxGSJvYlXy4BONFD
# XqYMH3OZB5GaR8Wza9trNYjDs/9hOZus/0R6Hqdl/T38PlMjf8mmayULJIGdcFcJ
# WJvITVntbcCwwbpyJbRC5BNigG8ZXTNRoKBgtFVGz6Ox+n0YydwKX5qU5J7xRfCU
# lW41LjZ0Fk5lonH16+xuS4WD5EyrNt8cMKCGsxnyxhI7nehe/OGnYr9l+xZJclrh
# inQlSwJv0IpUJcrGCI4Xugwux4Z7ZXv3JQ37FzsdZcv/ZXpGonXMeXNJ9A==
# =o6x7
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 18 Oct 2022 07:58:31 EDT
# gpg: using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg: issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83
* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (53 commits)
target/i386: remove old SSE decoder
target/i386: move 3DNow to the new decoder
tests/tcg: extend SSE tests to AVX
target/i386: Enable AVX cpuid bits when using TCG
target/i386: implement VLDMXCSR/VSTMXCSR
target/i386: implement XSAVE and XRSTOR of AVX registers
target/i386: reimplement 0x0f 0x28-0x2f, add AVX
target/i386: reimplement 0x0f 0x10-0x17, add AVX
target/i386: reimplement 0x0f 0xc2, 0xc4-0xc6, add AVX
target/i386: reimplement 0x0f 0x38, add AVX
target/i386: Use tcg gvec ops for pmovmskb
target/i386: reimplement 0x0f 0x3a, add AVX
target/i386: clarify (un)signedness of immediates from 0F3Ah opcodes
target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, add AVX
target/i386: reimplement 0x0f 0x70-0x77, add AVX
target/i386: reimplement 0x0f 0x78-0x7f, add AVX
target/i386: reimplement 0x0f 0x50-0x5f, add AVX
target/i386: reimplement 0x0f 0xd8-0xdf, 0xe8-0xef, 0xf8-0xff, add AVX
target/i386: reimplement 0x0f 0x60-0x6f, add AVX
target/i386: Introduce 256-bit vector helpers
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
32 files changed, 5980 insertions, 2931 deletions
@@ -1423,30 +1423,31 @@ if test "$tcg" = "enabled"; then git_submodules="$git_submodules tests/fp/berkeley-softfloat-3" fi -# --- +########################################## # big/little endian test cat > $TMPC << EOF -#include <stdio.h> -short big_endian[] = { 0x4269, 0x4765, 0x4e64, 0x4961, 0x4e00, 0, }; -short little_endian[] = { 0x694c, 0x7454, 0x654c, 0x6e45, 0x6944, 0x6e41, 0, }; -int main(int argc, char *argv[]) -{ - return printf("%s %s\n", (char *)big_endian, (char *)little_endian); -} +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# error LITTLE +#endif +int main(void) { return 0; } EOF -if compile_prog ; then - if strings -a $TMPE | grep -q BiGeNdIaN ; then - bigendian="yes" - elif strings -a $TMPE | grep -q LiTtLeEnDiAn ; then - bigendian="no" - else - echo big/little test failed - exit 1 - fi +if ! compile_prog ; then + bigendian="no" else + cat > $TMPC << EOF +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# error BIG +#endif +int main(void) { return 0; } +EOF + + if ! compile_prog ; then + bigendian="yes" + else echo big/little test failed exit 1 + fi fi ########################################## @@ -1841,6 +1842,16 @@ compute_target_variable() { fi } +have_target() { + for i; do + case " $target_list " in + *" $i "*) return 0;; + *) ;; + esac + done + return 1 +} + # probe_target_compiler TARGET # # Look for a compiler for the given target, either native or cross. @@ -2261,8 +2272,9 @@ echo "# Automatically generated by configure - do not modify" > Makefile.prereqs # Mac OS X ships with a broken assembler roms= -if test "$targetos" != "darwin" && test "$targetos" != "sunos" && \ - test "$targetos" != "haiku" && test "$softmmu" = yes && \ +if have_target i386-softmmu x86_64-softmmu && \ + test "$targetos" != "darwin" && test "$targetos" != "sunos" && \ + test "$targetos" != "haiku" && \ probe_target_compiler i386-softmmu; then roms="pc-bios/optionrom" config_mak=pc-bios/optionrom/config.mak @@ -2271,7 +2283,8 @@ if test "$targetos" != "darwin" && test "$targetos" != "sunos" && \ write_target_makefile >> $config_mak fi -if test "$softmmu" = yes && probe_target_compiler ppc-softmmu; then +if have_target ppc-softmmu ppc64-softmmu && \ + probe_target_compiler ppc-softmmu; then roms="$roms pc-bios/vof" config_mak=pc-bios/vof/config.mak echo "# Automatically generated by configure - do not modify" > $config_mak @@ -2281,7 +2294,7 @@ fi # Only build s390-ccw bios if the compiler has -march=z900 or -march=z10 # (which is the lowest architecture level that Clang supports) -if test "$softmmu" = yes && probe_target_compiler s390x-softmmu; then +if have_target s390x-softmmu && probe_target_compiler s390x-softmmu; then write_c_skeleton do_compiler "$target_cc" $target_cc_cflags -march=z900 -o $TMPO -c $TMPC has_z900=$? diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index 7fe8cce..52f9aa9 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -485,9 +485,7 @@ static void microvm_machine_reset(MachineState *machine) CPU_FOREACH(cs) { cpu = X86_CPU(cs); - if (cpu->apic_state) { - device_legacy_reset(cpu->apic_state); - } + x86_cpu_after_reset(cpu); } } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 566accf..768982a 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -92,6 +92,7 @@ #include "hw/virtio/virtio-mem-pci.h" #include "hw/mem/memory-device.h" #include "sysemu/replay.h" +#include "target/i386/cpu.h" #include "qapi/qmp/qerror.h" #include "e820_memory_layout.h" #include "fw_cfg.h" @@ -1859,9 +1860,7 @@ static void pc_machine_reset(MachineState *machine) CPU_FOREACH(cs) { cpu = X86_CPU(cs); - if (cpu->apic_state) { - device_legacy_reset(cpu->apic_state); - } + x86_cpu_after_reset(cpu); } } diff --git a/hw/scsi/esp.c b/hw/scsi/esp.c index 2ff18ce..e5b281e 100644 --- a/hw/scsi/esp.c +++ b/hw/scsi/esp.c @@ -941,7 +941,7 @@ static void esp_soft_reset(ESPState *s) static void esp_bus_reset(ESPState *s) { - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); } static void parent_esp_reset(ESPState *s, int irq, int level) diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c index 05a43ec..5097964 100644 --- a/hw/scsi/lsi53c895a.c +++ b/hw/scsi/lsi53c895a.c @@ -1868,7 +1868,7 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val) } if (val & LSI_SCNTL1_RST) { if (!(s->sstat0 & LSI_SSTAT0_RST)) { - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); s->sstat0 |= LSI_SSTAT0_RST; lsi_script_scsi_interrupt(s, LSI_SIST0_RST, 0); } @@ -1926,7 +1926,7 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val) lsi_execute_script(s); } if (val & LSI_ISTAT0_SRST) { - qdev_reset_all(DEVICE(s)); + device_cold_reset(DEVICE(s)); } break; case 0x16: /* MBOX0 */ diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index 7082456..9cbbb16 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -1484,7 +1484,7 @@ static int megasas_cluster_reset_ld(MegasasState *s, MegasasCmd *cmd) MegasasCmd *tmp_cmd = &s->frames[i]; if (tmp_cmd->req && tmp_cmd->req->dev->id == target_id) { SCSIDevice *d = tmp_cmd->req->dev; - qdev_reset_all(&d->qdev); + device_cold_reset(&d->qdev); } } return MFI_STAT_OK; diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c index a90c254..c485da7 100644 --- a/hw/scsi/mptsas.c +++ b/hw/scsi/mptsas.c @@ -522,7 +522,7 @@ reply_maybe_async: reply.ResponseCode = MPI_SCSITASKMGMT_RSP_TM_INVALID_LUN; goto out; } - qdev_reset_all(&sdev->qdev); + device_cold_reset(&sdev->qdev); break; case MPI_SCSITASKMGMT_TASKTYPE_TARGET_RESET: @@ -538,13 +538,13 @@ reply_maybe_async: QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) { sdev = SCSI_DEVICE(kid->child); if (sdev->channel == 0 && sdev->id == req->TargetID) { - qdev_reset_all(kid->child); + device_cold_reset(kid->child); } } break; case MPI_SCSITASKMGMT_TASKTYPE_RESET_BUS: - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); break; default: @@ -807,7 +807,7 @@ static void mptsas_soft_reset(MPTSASState *s) s->intr_mask = MPI_HIM_DIM | MPI_HIM_RIM; mptsas_update_interrupt(s); - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); s->intr_status = 0; s->intr_mask = save_mask; diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index 4403717..ceceafb 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -1616,6 +1616,24 @@ static int scsi_ua_precedence(SCSISense sense) return (sense.asc << 8) | sense.ascq; } +void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense) +{ + int prec1, prec2; + if (sense.key != UNIT_ATTENTION) { + return; + } + + /* + * Override a pre-existing unit attention condition, except for a more + * important reset condition. + */ + prec1 = scsi_ua_precedence(bus->unit_attention); + prec2 = scsi_ua_precedence(sense); + if (prec2 < prec1) { + bus->unit_attention = sense; + } +} + void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense) { int prec1, prec2; diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c index 0a8cbf5..5bbbef6 100644 --- a/hw/scsi/spapr_vscsi.c +++ b/hw/scsi/spapr_vscsi.c @@ -865,7 +865,7 @@ static int vscsi_process_tsk_mgmt(VSCSIState *s, vscsi_req *req) break; } - qdev_reset_all(&d->qdev); + device_cold_reset(&d->qdev); break; case SRP_TSK_ABORT_TASK_SET: diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 41f2a56..6f6e2e3 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -365,7 +365,7 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) goto incorrect_lun; } s->resetting++; - qdev_reset_all(&d->qdev); + device_cold_reset(&d->qdev); s->resetting--; break; @@ -417,7 +417,7 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) { SCSIDevice *d1 = SCSI_DEVICE(kid->child); if (d1->channel == 0 && d1->id == target) { - qdev_reset_all(&d1->qdev); + device_cold_reset(&d1->qdev); } } rcu_read_unlock(); @@ -831,7 +831,7 @@ static void virtio_scsi_reset(VirtIODevice *vdev) assert(!s->dataplane_started); s->resetting++; - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); s->resetting--; vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; @@ -956,6 +956,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev, virtio_scsi_push_event(s, sd, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); + scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED)); virtio_scsi_release(s); } } @@ -973,6 +974,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev, virtio_scsi_push_event(s, sd, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED)); virtio_scsi_release(s); } diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c index 91e2f85..fa76696 100644 --- a/hw/scsi/vmw_pvscsi.c +++ b/hw/scsi/vmw_pvscsi.c @@ -445,7 +445,7 @@ static void pvscsi_reset_adapter(PVSCSIState *s) { s->resetting++; - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); s->resetting--; pvscsi_process_completion_queue(s); assert(QTAILQ_EMPTY(&s->pending_queue)); @@ -880,7 +880,7 @@ pvscsi_on_cmd_reset_device(PVSCSIState *s) if (sdev != NULL) { s->resetting++; - device_legacy_reset(&sdev->qdev); + device_cold_reset(&sdev->qdev); s->resetting--; return PVSCSI_COMMAND_PROCESSING_SUCCEEDED; } @@ -894,7 +894,7 @@ pvscsi_on_cmd_reset_bus(PVSCSIState *s) trace_pvscsi_on_cmd_arrived("PVSCSI_CMD_RESET_BUS"); s->resetting++; - qbus_reset_all(BUS(&s->bus)); + bus_cold_reset(BUS(&s->bus)); s->resetting--; return PVSCSI_COMMAND_PROCESSING_SUCCEEDED; } diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 0011034..3b1b3d2 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -186,6 +186,7 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, BlockdevOnError rerror, BlockdevOnError werror, const char *serial, Error **errp); +void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense); void scsi_bus_legacy_handle_cmdline(SCSIBus *bus); void scsi_legacy_handle_cmdline(void); diff --git a/target/i386/cpu-param.h b/target/i386/cpu-param.h index 1e79389..f579b16 100644 --- a/target/i386/cpu-param.h +++ b/target/i386/cpu-param.h @@ -23,7 +23,7 @@ # define TARGET_VIRT_ADDR_SPACE_BITS 32 #endif #define TARGET_PAGE_BITS 12 -#define NB_MMU_MODES 3 +#define NB_MMU_MODES 5 #ifndef CONFIG_USER_ONLY # define TARGET_TB_PCREL 1 diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 8a11470..0ebd610 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,12 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND) + CPUID_EXT_RDRAND | CPUID_EXT_AVX) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, - CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AVX, + CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_F16C */ #ifdef TARGET_X86_64 @@ -653,14 +653,14 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \ CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT | \ CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE | \ - CPUID_7_0_EBX_ERMS) + CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2) /* missing: - CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2, + CPUID_7_0_EBX_HLE CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, CPUID_7_0_EBX_RDSEED */ #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \ /* CPUID_7_0_ECX_OSPKE is dynamic */ \ - CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS) + CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS | CPUID_7_0_ECX_VAES) #define TCG_7_0_EDX_FEATURES 0 #define TCG_7_1_EAX_FEATURES 0 #define TCG_APM_FEATURES 0 @@ -6035,6 +6035,19 @@ static void x86_cpu_reset(DeviceState *dev) #endif } +void x86_cpu_after_reset(X86CPU *cpu) +{ +#ifndef CONFIG_USER_ONLY + if (kvm_enabled()) { + kvm_arch_after_reset_vcpu(cpu); + } + + if (cpu->apic_state) { + device_cold_reset(cpu->apic_state); + } +#endif +} + static void mce_init(X86CPU *cpu) { CPUX86State *cenv = &cpu->env; diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 7edf5df..dad2b2d 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -169,6 +169,7 @@ typedef enum X86Seg { #define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */ #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */ #define HF_UMIP_SHIFT 27 /* CR4.UMIP */ +#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */ #define HF_CPL_MASK (3 << HF_CPL_SHIFT) #define HF_INHIBIT_IRQ_MASK (1 << HF_INHIBIT_IRQ_SHIFT) @@ -195,6 +196,7 @@ typedef enum X86Seg { #define HF_MPX_EN_MASK (1 << HF_MPX_EN_SHIFT) #define HF_MPX_IU_MASK (1 << HF_MPX_IU_SHIFT) #define HF_UMIP_MASK (1 << HF_UMIP_SHIFT) +#define HF_AVX_EN_MASK (1 << HF_AVX_EN_SHIFT) /* hflags2 */ @@ -1233,18 +1235,34 @@ typedef struct SegmentCache { uint32_t flags; } SegmentCache; -#define MMREG_UNION(n, bits) \ - union n { \ - uint8_t _b_##n[(bits)/8]; \ - uint16_t _w_##n[(bits)/16]; \ - uint32_t _l_##n[(bits)/32]; \ - uint64_t _q_##n[(bits)/64]; \ - float32 _s_##n[(bits)/32]; \ - float64 _d_##n[(bits)/64]; \ - } - -typedef MMREG_UNION(ZMMReg, 512) ZMMReg; -typedef MMREG_UNION(MMXReg, 64) MMXReg; +typedef union MMXReg { + uint8_t _b_MMXReg[64 / 8]; + uint16_t _w_MMXReg[64 / 16]; + uint32_t _l_MMXReg[64 / 32]; + uint64_t _q_MMXReg[64 / 64]; + float32 _s_MMXReg[64 / 32]; + float64 _d_MMXReg[64 / 64]; +} MMXReg; + +typedef union XMMReg { + uint64_t _q_XMMReg[128 / 64]; +} XMMReg; + +typedef union YMMReg { + uint64_t _q_YMMReg[256 / 64]; + XMMReg _x_YMMReg[256 / 128]; +} YMMReg; + +typedef union ZMMReg { + uint8_t _b_ZMMReg[512 / 8]; + uint16_t _w_ZMMReg[512 / 16]; + uint32_t _l_ZMMReg[512 / 32]; + uint64_t _q_ZMMReg[512 / 64]; + float32 _s_ZMMReg[512 / 32]; + float64 _d_ZMMReg[512 / 64]; + XMMReg _x_ZMMReg[512 / 128]; + YMMReg _y_ZMMReg[512 / 256]; +} ZMMReg; typedef struct BNDReg { uint64_t lb; @@ -1267,6 +1285,13 @@ typedef struct BNDCSReg { #define ZMM_S(n) _s_ZMMReg[15 - (n)] #define ZMM_Q(n) _q_ZMMReg[7 - (n)] #define ZMM_D(n) _d_ZMMReg[7 - (n)] +#define ZMM_X(n) _x_ZMMReg[3 - (n)] +#define ZMM_Y(n) _y_ZMMReg[1 - (n)] + +#define XMM_Q(n) _q_XMMReg[1 - (n)] + +#define YMM_Q(n) _q_YMMReg[3 - (n)] +#define YMM_X(n) _x_YMMReg[1 - (n)] #define MMX_B(n) _b_MMXReg[7 - (n)] #define MMX_W(n) _w_MMXReg[3 - (n)] @@ -1279,6 +1304,13 @@ typedef struct BNDCSReg { #define ZMM_S(n) _s_ZMMReg[n] #define ZMM_Q(n) _q_ZMMReg[n] #define ZMM_D(n) _d_ZMMReg[n] +#define ZMM_X(n) _x_ZMMReg[n] +#define ZMM_Y(n) _y_ZMMReg[n] + +#define XMM_Q(n) _q_XMMReg[n] + +#define YMM_Q(n) _q_YMMReg[n] +#define YMM_X(n) _x_YMMReg[n] #define MMX_B(n) _b_MMXReg[n] #define MMX_W(n) _w_MMXReg[n] @@ -1556,8 +1588,8 @@ typedef struct CPUArchState { float_status mmx_status; /* for 3DNow! float ops */ float_status sse_status; uint32_t mxcsr; - ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32]; - ZMMReg xmm_t0; + ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32] QEMU_ALIGNED(16); + ZMMReg xmm_t0 QEMU_ALIGNED(16); MMXReg mmx_t0; uint64_t opmask_regs[NB_OPMASK_REGS]; @@ -2082,6 +2114,8 @@ typedef struct PropValue { } PropValue; void x86_cpu_apply_props(X86CPU *cpu, PropValue *props); +void x86_cpu_after_reset(X86CPU *cpu); + uint32_t cpu_x86_virtual_addr_width(CPUX86State *env); /* cpu.c other functions (cpuid) */ @@ -2094,6 +2128,7 @@ void host_cpuid(uint32_t function, uint32_t count, /* helper.c */ void x86_cpu_set_a20(X86CPU *cpu, int a20_state); +void cpu_sync_avx_hflag(CPUX86State *env); #ifndef CONFIG_USER_ONLY static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs) @@ -2147,6 +2182,9 @@ uint64_t cpu_get_tsc(CPUX86State *env); #define MMU_KSMAP_IDX 0 #define MMU_USER_IDX 1 #define MMU_KNOSMAP_IDX 2 +#define MMU_NESTED_IDX 3 +#define MMU_PHYS_IDX 4 + static inline int cpu_mmu_index(CPUX86State *env, bool ifetch) { return (env->hflags & HF_CPL_MASK) == 3 ? MMU_USER_IDX : @@ -2382,8 +2420,6 @@ static inline bool ctl_has_irq(CPUX86State *env) return (env->int_ctl & V_IRQ_MASK) && (int_prio >= tpr); } -hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type, - int *prot); #if defined(TARGET_X86_64) && \ defined(CONFIG_USER_ONLY) && \ defined(CONFIG_LINUX) diff --git a/target/i386/helper.c b/target/i386/helper.c index b954ccd..b62a1e4 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -29,6 +29,17 @@ #endif #include "qemu/log.h" +void cpu_sync_avx_hflag(CPUX86State *env) +{ + if ((env->cr[4] & CR4_OSXSAVE_MASK) + && (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) + == (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) { + env->hflags |= HF_AVX_EN_MASK; + } else{ + env->hflags &= ~HF_AVX_EN_MASK; + } +} + void cpu_sync_bndcs_hflags(CPUX86State *env) { uint32_t hflags = env->hflags; @@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) env->hflags = hflags; cpu_sync_bndcs_hflags(env); + cpu_sync_avx_hflag(env); } #if !defined(CONFIG_USER_ONLY) diff --git a/target/i386/helper.h b/target/i386/helper.h index 39a3c24..88143b2 100644 --- a/target/i386/helper.h +++ b/target/i386/helper.h @@ -212,12 +212,13 @@ DEF_HELPER_2(ldmxcsr, void, env, i32) DEF_HELPER_1(update_mxcsr, void, env) DEF_HELPER_1(enter_mmx, void, env) DEF_HELPER_1(emms, void, env) -DEF_HELPER_3(movq, void, env, ptr, ptr) #define SHIFT 0 #include "ops_sse_header.h" #define SHIFT 1 #include "ops_sse_header.h" +#define SHIFT 2 +#include "ops_sse_header.h" DEF_HELPER_3(rclb, tl, env, tl, tl) DEF_HELPER_3(rclw, tl, env, tl, tl) diff --git a/target/i386/kvm/hyperv.c b/target/i386/kvm/hyperv.c index 9026ef3..e3ac978 100644 --- a/target/i386/kvm/hyperv.c +++ b/target/i386/kvm/hyperv.c @@ -23,6 +23,10 @@ int hyperv_x86_synic_add(X86CPU *cpu) return 0; } +/* + * All devices possibly using SynIC have to be reset before calling this to let + * them remove their SINT routes first. + */ void hyperv_x86_synic_reset(X86CPU *cpu) { hyperv_synic_reset(CPU(cpu)); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index bed6c00..dac100c 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2203,20 +2203,30 @@ void kvm_arch_reset_vcpu(X86CPU *cpu) env->mp_state = KVM_MP_STATE_RUNNABLE; } + /* enabled by default */ + env->poll_control_msr = 1; + + kvm_init_nested_state(env); + + sev_es_set_reset_vector(CPU(cpu)); +} + +void kvm_arch_after_reset_vcpu(X86CPU *cpu) +{ + CPUX86State *env = &cpu->env; + int i; + + /* + * Reset SynIC after all other devices have been reset to let them remove + * their SINT routes first. + */ if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { - int i; for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) { env->msr_hv_synic_sint[i] = HV_SINT_MASKED; } hyperv_x86_synic_reset(cpu); } - /* enabled by default */ - env->poll_control_msr = 1; - - kvm_init_nested_state(env); - - sev_es_set_reset_vector(CPU(cpu)); } void kvm_arch_do_init_vcpu(X86CPU *cpu) diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index 2ed586c..b7c38ba 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -38,6 +38,7 @@ bool kvm_has_adjust_clock_stable(void); bool kvm_has_exception_payload(void); void kvm_synchronize_all_tsc(void); void kvm_arch_reset_vcpu(X86CPU *cs); +void kvm_arch_after_reset_vcpu(X86CPU *cpu); void kvm_arch_do_init_vcpu(X86CPU *cs); void kvm_put_apicbase(X86CPU *cpu, uint64_t value); diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 7bf8bb9..d35fc15 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -35,7 +35,11 @@ #define W(n) ZMM_W(n) #define L(n) ZMM_L(n) #define Q(n) ZMM_Q(n) +#if SHIFT == 1 #define SUFFIX _xmm +#else +#define SUFFIX _ymm +#endif #endif #define LANE_WIDTH (SHIFT ? 16 : 8) @@ -48,9 +52,8 @@ #define FPSLL(x, c) ((x) << shift) #endif -void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 15) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -64,9 +67,8 @@ void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 15) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -80,9 +82,8 @@ void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 15) { shift = 15; @@ -94,9 +95,8 @@ void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 31) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -110,9 +110,8 @@ void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 31) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -126,9 +125,8 @@ void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 31) { shift = 31; @@ -140,9 +138,8 @@ void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 63) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -156,9 +153,8 @@ void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift; if (c->Q(0) > 63) { for (int i = 0; i < 1 << SHIFT; i++) { @@ -173,9 +169,8 @@ void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } #if SHIFT >= 1 -void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift, i, j; shift = c->L(0); @@ -192,9 +187,8 @@ void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { - Reg *s = d; int shift, i, j; shift = c->L(0); @@ -222,9 +216,8 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } #define SSE_HELPER_2(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ int n = num; \ for (int i = 0; i < n; i++) { \ d->elem(i) = F(v->elem(i), s->elem(i)); \ @@ -304,17 +297,6 @@ static inline int satsw(int x) #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b) -#define FAND(a, b) ((a) & (b)) -#define FANDN(a, b) ((~(a)) & (b)) -#define FOR(a, b) ((a) | (b)) -#define FXOR(a, b) ((a) ^ (b)) - -#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0) -#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0) -#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0) -#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0) - -#define FMULLW(a, b) ((a) * (b)) #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16) #define FMULHUW(a, b) ((a) * (b) >> 16) #define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16) @@ -322,58 +304,24 @@ static inline int satsw(int x) #define FAVG(a, b) (((a) + (b) + 1) >> 1) #endif -SSE_HELPER_B(helper_paddb, FADD) -SSE_HELPER_W(helper_paddw, FADD) -SSE_HELPER_L(helper_paddl, FADD) -SSE_HELPER_Q(helper_paddq, FADD) - -SSE_HELPER_B(helper_psubb, FSUB) -SSE_HELPER_W(helper_psubw, FSUB) -SSE_HELPER_L(helper_psubl, FSUB) -SSE_HELPER_Q(helper_psubq, FSUB) - -SSE_HELPER_B(helper_paddusb, FADDUB) -SSE_HELPER_B(helper_paddsb, FADDSB) -SSE_HELPER_B(helper_psubusb, FSUBUB) -SSE_HELPER_B(helper_psubsb, FSUBSB) - -SSE_HELPER_W(helper_paddusw, FADDUW) -SSE_HELPER_W(helper_paddsw, FADDSW) -SSE_HELPER_W(helper_psubusw, FSUBUW) -SSE_HELPER_W(helper_psubsw, FSUBSW) - -SSE_HELPER_B(helper_pminub, FMINUB) -SSE_HELPER_B(helper_pmaxub, FMAXUB) - -SSE_HELPER_W(helper_pminsw, FMINSW) -SSE_HELPER_W(helper_pmaxsw, FMAXSW) - -SSE_HELPER_Q(helper_pand, FAND) -SSE_HELPER_Q(helper_pandn, FANDN) -SSE_HELPER_Q(helper_por, FOR) -SSE_HELPER_Q(helper_pxor, FXOR) - -SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) -SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) -SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) - -SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) -SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) -SSE_HELPER_L(helper_pcmpeql, FCMPEQ) +SSE_HELPER_W(helper_pmulhuw, FMULHUW) +SSE_HELPER_W(helper_pmulhw, FMULHW) -SSE_HELPER_W(helper_pmullw, FMULLW) #if SHIFT == 0 -SSE_HELPER_W(helper_pmulhrw, FMULHRW) +void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->W(0) = FMULHRW(d->W(0), s->W(0)); + d->W(1) = FMULHRW(d->W(1), s->W(1)); + d->W(2) = FMULHRW(d->W(2), s->W(2)); + d->W(3) = FMULHRW(d->W(3), s->W(3)); +} #endif -SSE_HELPER_W(helper_pmulhuw, FMULHUW) -SSE_HELPER_W(helper_pmulhw, FMULHW) SSE_HELPER_B(helper_pavgb, FAVG) SSE_HELPER_W(helper_pavgw, FAVG) -void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < (1 << SHIFT); i++) { @@ -381,9 +329,8 @@ void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < (2 << SHIFT); i++) { @@ -402,10 +349,8 @@ static inline int abs1(int a) } } #endif - -void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < (1 << SHIFT); i++) { @@ -436,29 +381,6 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } #endif -void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) -{ - int i; - - d->L(0) = val; - d->L(1) = 0; - for (i = 1; i < (1 << SHIFT); i++) { - d->Q(i) = 0; - } -} - -#ifdef TARGET_X86_64 -void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) -{ - int i; - - d->Q(0) = val; - for (i = 1; i < (1 << SHIFT); i++) { - d->Q(i) = 0; - } -} -#endif - #define SHUFFLE4(F, a, b, offset) do { \ r0 = a->F((order & 3) + offset); \ r1 = a->F(((order >> 2) & 3) + offset); \ @@ -478,9 +400,8 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) SHUFFLE4(W, s, s, 0); } #else -void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) +void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) { - Reg *v = d; uint32_t r0, r1, r2, r3; int i; @@ -489,9 +410,8 @@ void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) } } -void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order) +void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) { - Reg *v = d; uint64_t r0, r1; int i; @@ -543,9 +463,8 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) #define SSE_HELPER_P(name, F) \ void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ int i; \ for (i = 0; i < 2 << SHIFT; i++) { \ d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i)); \ @@ -553,9 +472,8 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) } \ \ void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ int i; \ for (i = 0; i < 1 << SHIFT; i++) { \ d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \ @@ -567,16 +485,22 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) #define SSE_HELPER_S(name, F) \ SSE_HELPER_P(name, F) \ \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ { \ - Reg *v = d; \ + int i; \ d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ + for (i = 1; i < 2 << SHIFT; i++) { \ + d->ZMM_L(i) = v->ZMM_L(i); \ + } \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ { \ - Reg *v = d; \ + int i; \ d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ + for (i = 1; i < 1 << SHIFT; i++) { \ + d->ZMM_Q(i) = v->ZMM_Q(i); \ + } \ } #else @@ -623,14 +547,22 @@ void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } #if SHIFT == 1 -void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s) +void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s) { + int i; d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status); + for (i = 1; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = v->ZMM_L(i); + } } -void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s) +void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s) { + int i; d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status); + for (i = 1; i < 1 << SHIFT; i++) { + d->ZMM_Q(i) = v->ZMM_Q(i); + } } #endif @@ -655,14 +587,22 @@ void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } #if SHIFT == 1 -void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) +void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) { + int i; d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status); + for (i = 1; i < 1 << SHIFT; i++) { + d->ZMM_Q(i) = v->ZMM_Q(i); + } } -void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) +void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) { + int i; d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); + for (i = 1; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = v->ZMM_L(i); + } } #endif @@ -882,13 +822,17 @@ void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) } #if SHIFT == 1 -void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int i; d->ZMM_S(0) = float32_div(float32_one, float32_sqrt(s->ZMM_S(0), &env->sse_status), &env->sse_status); set_float_exception_flags(old_flags, &env->sse_status); + for (i = 1; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = v->ZMM_L(i); + } } #endif @@ -903,10 +847,14 @@ void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) } #if SHIFT == 1 -void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); + int i; d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); + for (i = 1; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = v->ZMM_L(i); + } set_float_exception_flags(old_flags, &env->sse_status); } #endif @@ -958,9 +906,8 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int len #endif #define SSE_HELPER_HPS(name, F) \ -void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ float32 r[2 << SHIFT]; \ int i, j, k; \ for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ @@ -980,9 +927,8 @@ SSE_HELPER_HPS(haddps, float32_add) SSE_HELPER_HPS(hsubps, float32_sub) #define SSE_HELPER_HPD(name, F) \ -void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ float64 r[1 << SHIFT]; \ int i, j, k; \ for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ @@ -1001,9 +947,8 @@ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ SSE_HELPER_HPD(haddpd, float64_add) SSE_HELPER_HPD(hsubpd, float64_sub) -void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < 2 << SHIFT; i += 2) { d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); @@ -1011,9 +956,8 @@ void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < 1 << SHIFT; i += 2) { d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status); @@ -1023,9 +967,8 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #define SSE_HELPER_CMP_P(name, F, C) \ void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ int i; \ for (i = 0; i < 2 << SHIFT; i++) { \ d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \ @@ -1033,9 +976,8 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } \ \ void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ int i; \ for (i = 0; i < 1 << SHIFT; i++) { \ d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \ @@ -1045,22 +987,39 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #if SHIFT == 1 #define SSE_HELPER_CMP(name, F, C) \ SSE_HELPER_CMP_P(name, F, C) \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ + int i; \ d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \ + for (i = 1; i < 2 << SHIFT; i++) { \ + d->ZMM_L(i) = v->ZMM_L(i); \ + } \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ + int i; \ d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \ + for (i = 1; i < 1 << SHIFT; i++) { \ + d->ZMM_Q(i) = v->ZMM_Q(i); \ + } \ } +static inline bool FPU_EQU(FloatRelation x) +{ + return (x == float_relation_equal || x == float_relation_unordered); +} +static inline bool FPU_GE(FloatRelation x) +{ + return (x == float_relation_equal || x == float_relation_greater); +} #define FPU_EQ(x) (x == float_relation_equal) #define FPU_LT(x) (x == float_relation_less) #define FPU_LE(x) (x <= float_relation_equal) +#define FPU_GT(x) (x == float_relation_greater) #define FPU_UNORD(x) (x == float_relation_unordered) +/* We must make sure we evaluate the argument in case it is a signalling NAN */ +#define FPU_FALSE(x) (x == float_relation_equal && 0) #define FPU_CMPQ(size, a, b) \ float ## size ## _compare_quiet(a, b, &env->sse_status) @@ -1080,6 +1039,33 @@ SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) +SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU) +SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE) +SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT) +SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE) +SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU) +SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE) +SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT) +SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE) + +SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ) +SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT) +SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE) +SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD) +SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ) +SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT) +SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE) +SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU) +SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE) +SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT) +SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE) +SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU) +SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE) +SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT) +SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE) + #undef SSE_HELPER_CMP #if SHIFT == 1 @@ -1156,32 +1142,10 @@ uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s) #endif -uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) -{ - uint32_t val; - int i; - - val = 0; - for (i = 0; i < (1 << SHIFT); i++) { - uint8_t byte = 0; - byte |= (s->B(8 * i + 0) >> 7); - byte |= (s->B(8 * i + 1) >> 6) & 0x02; - byte |= (s->B(8 * i + 2) >> 5) & 0x04; - byte |= (s->B(8 * i + 3) >> 4) & 0x08; - byte |= (s->B(8 * i + 4) >> 3) & 0x10; - byte |= (s->B(8 * i + 5) >> 2) & 0x20; - byte |= (s->B(8 * i + 6) >> 1) & 0x40; - byte |= (s->B(8 * i + 7)) & 0x80; - val |= byte << (8 * i); - } - return val; -} - #define PACK_HELPER_B(name, F) \ void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint8_t r[PACK_WIDTH * 2]; \ int j, k; \ for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \ @@ -1200,9 +1164,8 @@ void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ PACK_HELPER_B(sswb, satsb) PACK_HELPER_B(uswb, satub) -void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; uint16_t r[PACK_WIDTH]; int j, k; @@ -1222,9 +1185,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #define UNPCK_OP(base_name, base) \ \ void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint8_t r[PACK_WIDTH * 2]; \ int j, i; \ \ @@ -1241,9 +1203,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } \ \ void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint16_t r[PACK_WIDTH]; \ int j, i; \ \ @@ -1260,9 +1221,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } \ \ void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint32_t r[PACK_WIDTH / 2]; \ int j, i; \ \ @@ -1280,9 +1240,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ XMM_ONLY( \ void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \ - CPUX86State *env, Reg *d, Reg *s) \ + CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint64_t r[2]; \ int i; \ \ @@ -1453,9 +1412,8 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s) #endif /* SSSE3 op helpers */ -void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; #if SHIFT == 0 uint8_t r[8]; @@ -1480,9 +1438,8 @@ void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } #define SSE_HELPER_HW(name, F) \ -void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint16_t r[4 << SHIFT]; \ int i, j, k; \ for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \ @@ -1499,9 +1456,8 @@ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ } #define SSE_HELPER_HL(name, F) \ -void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg *v = d; \ uint32_t r[2 << SHIFT]; \ int i, j, k; \ for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ @@ -1527,9 +1483,8 @@ SSE_HELPER_HL(phsubd, FSUB) #undef SSE_HELPER_HW #undef SSE_HELPER_HL -void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < 4 << SHIFT; i++) { d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) + @@ -1537,13 +1492,6 @@ void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x) -#define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x) -#define FABSL(x) (x > INT32_MAX ? -(int32_t)x : x) -SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB) -SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW) -SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL) - #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) @@ -1554,19 +1502,18 @@ SSE_HELPER_B(helper_psignb, FSIGNB) SSE_HELPER_W(helper_psignw, FSIGNW) SSE_HELPER_L(helper_psignd, FSIGNL) -void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, - int32_t shift) +void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, + uint32_t imm) { - Reg *v = d; int i; /* XXX could be checked during translation */ - if (shift >= (SHIFT ? 32 : 16)) { + if (imm >= (SHIFT ? 32 : 16)) { for (i = 0; i < (1 << SHIFT); i++) { d->Q(i) = 0; } } else { - shift <<= 3; + int shift = imm * 8; #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) #if SHIFT == 0 d->Q(0) = SHR(s->Q(0), shift - 0) | @@ -1594,10 +1541,9 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #if SHIFT >= 1 #define SSE_HELPER_V(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ + Reg *m) \ { \ - Reg *v = d; \ - Reg *m = &env->xmm_regs[0]; \ int i; \ for (i = 0; i < num; i++) { \ d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \ @@ -1605,10 +1551,9 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } #define SSE_HELPER_I(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ uint32_t imm) \ { \ - Reg *v = d; \ int i; \ for (i = 0; i < num; i++) { \ int j = i & 7; \ @@ -1636,6 +1581,10 @@ void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); } +#define FMOVSLDUP(i) s->L((i) & ~1) +#define FMOVSHDUP(i) s->L((i) | 1) +#define FMOVDLDUP(i) s->Q((i) & ~1) + #define SSE_HELPER_F(name, elem, num, F) \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ @@ -1658,11 +1607,13 @@ SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B) SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W) SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W) SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L) +SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP) +SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP) +SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP) #endif -void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; int i; for (i = 0; i < 1 << SHIFT; i++) { @@ -1670,12 +1621,8 @@ void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -#define FCMPEQQ(d, s) (d == s ? -1 : 0) -SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) - -void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - Reg *v = d; uint16_t r[8]; int i, j, k; @@ -1694,22 +1641,6 @@ void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) -#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s) -#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s) -#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s) -SSE_HELPER_B(helper_pminsb, FMINSB) -SSE_HELPER_L(helper_pminsd, FMINSD) -SSE_HELPER_W(helper_pminuw, MIN) -SSE_HELPER_L(helper_pminud, MIN) -SSE_HELPER_B(helper_pmaxsb, FMAXSB) -SSE_HELPER_L(helper_pmaxsd, FMAXSD) -SSE_HELPER_W(helper_pmaxuw, MAX) -SSE_HELPER_L(helper_pmaxud, MAX) - -#define FMULLD(d, s) ((int32_t)d * (int32_t)s) -SSE_HELPER_L(helper_pmulld, FMULLD) - #if SHIFT == 1 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { @@ -1819,11 +1750,12 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } #if SHIFT == 1 -void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t mode) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; + int i; prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { @@ -1844,6 +1776,9 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); + for (i = 1; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = v->ZMM_L(i); + } if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_status) & @@ -1853,11 +1788,12 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, env->sse_status.float_rounding_mode = prev_rounding_mode; } -void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t mode) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; + int i; prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { @@ -1878,6 +1814,9 @@ void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); + for (i = 1; i < 1 << SHIFT; i++) { + d->ZMM_Q(i) = v->ZMM_Q(i); + } if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_status) & @@ -1893,10 +1832,9 @@ SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP) SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP) SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP) -void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t mask) { - Reg *v = d; float32 prod1, prod2, temp2, temp3, temp4; int i; @@ -1939,9 +1877,8 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #if SHIFT == 1 /* Oddly, there is no ymm version of dppd */ void glue(helper_dppd, SUFFIX)(CPUX86State *env, - Reg *d, Reg *s, uint32_t mask) + Reg *d, Reg *v, Reg *s, uint32_t mask) { - Reg *v = d; float64 prod1, prod2, temp2; if (mask & (1 << 4)) { @@ -1960,10 +1897,9 @@ void glue(helper_dppd, SUFFIX)(CPUX86State *env, } #endif -void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t offset) { - Reg *v = d; int i, j; uint16_t r[8]; @@ -1985,9 +1921,6 @@ void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } /* SSE4.2 op helpers */ -#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0) -SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) - #if SHIFT == 1 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) { @@ -2043,7 +1976,7 @@ static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) } static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s, - int8_t ctrl, int valids, int validd) + uint8_t ctrl, int valids, int validd) { unsigned int res = 0; int v; @@ -2236,10 +2169,9 @@ static void clmulq(uint64_t *dest_l, uint64_t *dest_h, } #endif -void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t ctrl) { - Reg *v = d; uint64_t a, b; int i; @@ -2250,10 +2182,10 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } -void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st = *d; + Reg st = *v; Reg rk = *s; for (i = 0 ; i < 2 << SHIFT ; i++) { @@ -2265,10 +2197,10 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st = *d; + Reg st = *v; Reg rk = *s; for (i = 0; i < 8 << SHIFT; i++) { @@ -2276,10 +2208,10 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st = *d; + Reg st = *v; Reg rk = *s; for (i = 0 ; i < 2 << SHIFT ; i++) { @@ -2291,10 +2223,10 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } -void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st = *d; + Reg st = *v; Reg rk = *s; for (i = 0; i < 8 << SHIFT; i++) { @@ -2332,8 +2264,290 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #endif #endif +#if SHIFT >= 1 +void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + uint64_t r0, r1; + int i; + + for (i = 0; i < 1 << SHIFT; i += 2) { + r0 = v->Q(i + ((s->Q(i) >> 1) & 1)); + r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1)); + d->Q(i) = r0; + d->Q(i+1) = r1; + } +} + +void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + uint32_t r0, r1, r2, r3; + int i; + + for (i = 0; i < 2 << SHIFT; i += 4) { + r0 = v->L(i + (s->L(i) & 3)); + r1 = v->L(i + (s->L(i+1) & 3)); + r2 = v->L(i + (s->L(i+2) & 3)); + r3 = v->L(i + (s->L(i+3) & 3)); + d->L(i) = r0; + d->L(i+1) = r1; + d->L(i+2) = r2; + d->L(i+3) = r3; + } +} + +void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order) +{ + uint64_t r0, r1; + int i; + + for (i = 0; i < 1 << SHIFT; i += 2) { + r0 = s->Q(i + ((order >> 0) & 1)); + r1 = s->Q(i + ((order >> 1) & 1)); + d->Q(i) = r0; + d->Q(i+1) = r1; + + order >>= 2; + } +} + +void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order) +{ + uint32_t r0, r1, r2, r3; + int i; + + for (i = 0; i < 2 << SHIFT; i += 4) { + r0 = s->L(i + ((order >> 0) & 3)); + r1 = s->L(i + ((order >> 2) & 3)); + r2 = s->L(i + ((order >> 4) & 3)); + r3 = s->L(i + ((order >> 6) & 3)); + d->L(i) = r0; + d->L(i+1) = r1; + d->L(i+2) = r2; + d->L(i+3) = r3; + } +} + +#if SHIFT == 1 +#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0) +#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0) +#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31)) +#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63)) +#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0) +#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0) +#endif + +SSE_HELPER_L(helper_vpsrlvd, FPSRLVD) +SSE_HELPER_L(helper_vpsravd, FPSRAVD) +SSE_HELPER_L(helper_vpsllvd, FPSLLVD) + +SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ) +SSE_HELPER_Q(helper_vpsravq, FPSRAVQ) +SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ) + +void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint32_t zf = 0, cf = 0; + int i; + + for (i = 0; i < 2 << SHIFT; i++) { + zf |= (s->L(i) & d->L(i)); + cf |= (s->L(i) & ~d->L(i)); + } + CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C); +} + +void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint64_t zf = 0, cf = 0; + int i; + + for (i = 0; i < 1 << SHIFT; i++) { + zf |= (s->Q(i) & d->Q(i)); + cf |= (s->Q(i) & ~d->Q(i)); + } + CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C); +} + +void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env, + Reg *v, Reg *s, target_ulong a0) +{ + int i; + + for (i = 0; i < (2 << SHIFT); i++) { + if (v->L(i) >> 31) { + cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC()); + } + } +} + +void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env, + Reg *v, Reg *s, target_ulong a0) +{ + int i; + + for (i = 0; i < (1 << SHIFT); i++) { + if (v->Q(i) >> 63) { + cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC()); + } + } +} + +void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + int i; + + for (i = 0; i < (2 << SHIFT); i++) { + d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0; + } +} + +void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + int i; + + for (i = 0; i < (1 << SHIFT); i++) { + d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0; + } +} + +void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env, + Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) +{ + int i; + for (i = 0; i < (2 << SHIFT); i++) { + if (v->L(i) >> 31) { + target_ulong addr = a0 + + ((target_ulong)(int32_t)s->L(i) << scale); + d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); + } + v->L(i) = 0; + } +} + +void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env, + Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) +{ + int i; + for (i = 0; i < (1 << SHIFT); i++) { + if (v->Q(i) >> 63) { + target_ulong addr = a0 + + ((target_ulong)(int32_t)s->L(i) << scale); + d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); + } + v->Q(i) = 0; + } +} + +void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env, + Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) +{ + int i; + for (i = 0; i < (1 << SHIFT); i++) { + if (v->L(i) >> 31) { + target_ulong addr = a0 + + ((target_ulong)(int64_t)s->Q(i) << scale); + d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); + } + v->L(i) = 0; + } + for (i /= 2; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + v->Q(i) = 0; + } +} + +void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env, + Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) +{ + int i; + for (i = 0; i < (1 << SHIFT); i++) { + if (v->Q(i) >> 63) { + target_ulong addr = a0 + + ((target_ulong)(int64_t)s->Q(i) << scale); + d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); + } + v->Q(i) = 0; + } +} +#endif + +#if SHIFT >= 2 +void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order) +{ + uint64_t r0, r1, r2, r3; + + switch (order & 3) { + case 0: + r0 = v->Q(0); + r1 = v->Q(1); + break; + case 1: + r0 = v->Q(2); + r1 = v->Q(3); + break; + case 2: + r0 = s->Q(0); + r1 = s->Q(1); + break; + case 3: + r0 = s->Q(2); + r1 = s->Q(3); + break; + } + switch ((order >> 4) & 3) { + case 0: + r2 = v->Q(0); + r3 = v->Q(1); + break; + case 1: + r2 = v->Q(2); + r3 = v->Q(3); + break; + case 2: + r2 = s->Q(0); + r3 = s->Q(1); + break; + case 3: + r2 = s->Q(2); + r3 = s->Q(3); + break; + } + d->Q(0) = r0; + d->Q(1) = r1; + d->Q(2) = r2; + d->Q(3) = r3; +} + +void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order) +{ + uint64_t r0, r1, r2, r3; + r0 = s->Q(order & 3); + r1 = s->Q((order >> 2) & 3); + r2 = s->Q((order >> 4) & 3); + r3 = s->Q((order >> 6) & 3); + d->Q(0) = r0; + d->Q(1) = r1; + d->Q(2) = r2; + d->Q(3) = r3; +} + +void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s) +{ + uint32_t r[8]; + int i; + + for (i = 0; i < 8; i++) { + r[i] = s->L(v->L(i) & 7); + } + for (i = 0; i < 8; i++) { + d->L(i) = r[i]; + } +} +#endif + #undef SSE_HELPER_S +#undef LANE_WIDTH #undef SHIFT #undef XMM_ONLY #undef Reg diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 400b24c0..2f1f811 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -21,7 +21,11 @@ #define SUFFIX _mmx #else #define Reg ZMMReg +#if SHIFT == 1 #define SUFFIX _xmm +#else +#define SUFFIX _ymm +#endif #endif #define dh_alias_Reg ptr @@ -34,74 +38,34 @@ #define dh_typecode_ZMMReg dh_typecode_ptr #define dh_typecode_MMXReg dh_typecode_ptr -DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psllw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrld, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrad, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg) - -#if SHIFT == 1 -DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(psrlw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psraw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psllw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psrld, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psrad, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pslld, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psrlq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psllq, SUFFIX), void, env, Reg, Reg, Reg) + +#if SHIFT >= 1 +DEF_HELPER_4(glue(psrldq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pslldq, SUFFIX), void, env, Reg, Reg, Reg) #endif #define SSE_HELPER_B(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) #define SSE_HELPER_W(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) #define SSE_HELPER_L(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) #define SSE_HELPER_Q(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) - -SSE_HELPER_B(paddb, FADD) -SSE_HELPER_W(paddw, FADD) -SSE_HELPER_L(paddl, FADD) -SSE_HELPER_Q(paddq, FADD) - -SSE_HELPER_B(psubb, FSUB) -SSE_HELPER_W(psubw, FSUB) -SSE_HELPER_L(psubl, FSUB) -SSE_HELPER_Q(psubq, FSUB) - -SSE_HELPER_B(paddusb, FADDUB) -SSE_HELPER_B(paddsb, FADDSB) -SSE_HELPER_B(psubusb, FSUBUB) -SSE_HELPER_B(psubsb, FSUBSB) - -SSE_HELPER_W(paddusw, FADDUW) -SSE_HELPER_W(paddsw, FADDSW) -SSE_HELPER_W(psubusw, FSUBUW) -SSE_HELPER_W(psubsw, FSUBSW) - -SSE_HELPER_B(pminub, FMINUB) -SSE_HELPER_B(pmaxub, FMAXUB) - -SSE_HELPER_W(pminsw, FMINSW) -SSE_HELPER_W(pmaxsw, FMAXSW) - -SSE_HELPER_Q(pand, FAND) -SSE_HELPER_Q(pandn, FANDN) -SSE_HELPER_Q(por, FOR) -SSE_HELPER_Q(pxor, FXOR) - -SSE_HELPER_B(pcmpgtb, FCMPGTB) -SSE_HELPER_W(pcmpgtw, FCMPGTW) -SSE_HELPER_L(pcmpgtl, FCMPGTL) - -SSE_HELPER_B(pcmpeqb, FCMPEQ) -SSE_HELPER_W(pcmpeqw, FCMPEQ) -SSE_HELPER_L(pcmpeql, FCMPEQ) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) -SSE_HELPER_W(pmullw, FMULLW) #if SHIFT == 0 -SSE_HELPER_W(pmulhrw, FMULHRW) +DEF_HELPER_3(glue(pmulhrw, SUFFIX), void, env, Reg, Reg) #endif SSE_HELPER_W(pmulhuw, FMULHUW) SSE_HELPER_W(pmulhw, FMULHW) @@ -109,51 +73,74 @@ SSE_HELPER_W(pmulhw, FMULHW) SSE_HELPER_B(pavgb, FAVG) SSE_HELPER_W(pavgw, FAVG) -DEF_HELPER_3(glue(pmuludq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaddwd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(pmuludq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaddwd, SUFFIX), void, env, Reg, Reg, Reg) -DEF_HELPER_3(glue(psadbw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(psadbw, SUFFIX), void, env, Reg, Reg, Reg) +#if SHIFT < 2 DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl) -DEF_HELPER_2(glue(movl_mm_T0, SUFFIX), void, Reg, i32) -#ifdef TARGET_X86_64 -DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64) #endif #if SHIFT == 0 DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int) #else -DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int) -DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) #endif -#if SHIFT == 1 +#if SHIFT >= 1 /* FPU ops */ /* XXX: not accurate */ -#define SSE_HELPER_S(name, F) \ - DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(name ## sd, void, env, Reg, Reg) +#define SSE_HELPER_P4(name) \ + DEF_HELPER_4(glue(name ## ps, SUFFIX), void, env, Reg, Reg, Reg) \ + DEF_HELPER_4(glue(name ## pd, SUFFIX), void, env, Reg, Reg, Reg) + +#define SSE_HELPER_P3(name, ...) \ + DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ + DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) + +#if SHIFT == 1 +#define SSE_HELPER_S4(name) \ + SSE_HELPER_P4(name) \ + DEF_HELPER_4(name ## ss, void, env, Reg, Reg, Reg) \ + DEF_HELPER_4(name ## sd, void, env, Reg, Reg, Reg) +#define SSE_HELPER_S3(name) \ + SSE_HELPER_P3(name) \ + DEF_HELPER_4(name ## ss, void, env, Reg, Reg, Reg) \ + DEF_HELPER_4(name ## sd, void, env, Reg, Reg, Reg) +#else +#define SSE_HELPER_S4(name, ...) SSE_HELPER_P4(name) +#define SSE_HELPER_S3(name, ...) SSE_HELPER_P3(name) +#endif + +DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, int) +DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, int) -SSE_HELPER_S(add, FPU_ADD) -SSE_HELPER_S(sub, FPU_SUB) -SSE_HELPER_S(mul, FPU_MUL) -SSE_HELPER_S(div, FPU_DIV) -SSE_HELPER_S(min, FPU_MIN) -SSE_HELPER_S(max, FPU_MAX) -SSE_HELPER_S(sqrt, FPU_SQRT) +SSE_HELPER_S4(add) +SSE_HELPER_S4(sub) +SSE_HELPER_S4(mul) +SSE_HELPER_S4(div) +SSE_HELPER_S4(min) +SSE_HELPER_S4(max) +SSE_HELPER_S3(sqrt) DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(cvtss2sd, void, env, Reg, Reg) -DEF_HELPER_3(cvtsd2ss, void, env, Reg, Reg) DEF_HELPER_3(glue(cvtdq2ps, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(cvtdq2pd, SUFFIX), void, env, Reg, Reg) + +DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) + +DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) + +#if SHIFT == 1 +DEF_HELPER_4(cvtss2sd, void, env, Reg, Reg, Reg) +DEF_HELPER_4(cvtsd2ss, void, env, Reg, Reg, Reg) DEF_HELPER_3(cvtpi2ps, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtpi2pd, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtsi2ss, void, env, ZMMReg, i32) @@ -164,8 +151,6 @@ DEF_HELPER_3(cvtsq2ss, void, env, ZMMReg, i64) DEF_HELPER_3(cvtsq2sd, void, env, ZMMReg, i64) #endif -DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvtps2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_3(cvtpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvtss2si, s32, env, ZMMReg) @@ -175,8 +160,6 @@ DEF_HELPER_2(cvtss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvtsd2sq, s64, env, ZMMReg) #endif -DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvttps2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_3(cvttpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvttss2si, s32, env, ZMMReg) @@ -185,27 +168,25 @@ DEF_HELPER_2(cvttsd2si, s32, env, ZMMReg) DEF_HELPER_2(cvttss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvttsd2sq, s64, env, ZMMReg) #endif +#endif DEF_HELPER_3(glue(rsqrtps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(rsqrtss, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(glue(rcpps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(rcpss, void, env, ZMMReg, ZMMReg) + +#if SHIFT == 1 +DEF_HELPER_4(rsqrtss, void, env, ZMMReg, ZMMReg, ZMMReg) +DEF_HELPER_4(rcpss, void, env, ZMMReg, ZMMReg, ZMMReg) DEF_HELPER_3(extrq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int) DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_5(insertq_i, void, env, ZMMReg, ZMMReg, int, int) -DEF_HELPER_3(glue(haddps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(haddpd, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(hsubps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(hsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(addsubps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(addsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) - -#define SSE_HELPER_CMP(name, F, C) \ - DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(name ## sd, void, env, Reg, Reg) +#endif + +SSE_HELPER_P4(hadd) +SSE_HELPER_P4(hsub) +SSE_HELPER_P4(addsub) + +#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_S4(name) SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) @@ -216,29 +197,58 @@ SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) +SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU) +SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE) +SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT) +SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE) +SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU) +SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE) +SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT) +SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE) + +SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ) +SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT) +SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE) +SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD) +SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ) +SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT) +SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE) +SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU) +SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE) +SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT) +SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE) +SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU) +SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE) +SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT) +SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE) + +#if SHIFT == 1 DEF_HELPER_3(ucomiss, void, env, Reg, Reg) DEF_HELPER_3(comiss, void, env, Reg, Reg) DEF_HELPER_3(ucomisd, void, env, Reg, Reg) DEF_HELPER_3(comisd, void, env, Reg, Reg) +#endif + DEF_HELPER_2(glue(movmskps, SUFFIX), i32, env, Reg) DEF_HELPER_2(glue(movmskpd, SUFFIX), i32, env, Reg) #endif -DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg) -DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg) -#define UNPCK_OP(base_name, base) \ - DEF_HELPER_3(glue(punpck ## base_name ## bw, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(glue(punpck ## base_name ## wd, SUFFIX), void, env, Reg, Reg) \ - DEF_HELPER_3(glue(punpck ## base_name ## dq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(packsswb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packuswb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packssdw, SUFFIX), void, env, Reg, Reg, Reg) +#define UNPCK_OP(name, base) \ + DEF_HELPER_4(glue(punpck ## name ## bw, SUFFIX), void, env, Reg, Reg, Reg) \ + DEF_HELPER_4(glue(punpck ## name ## wd, SUFFIX), void, env, Reg, Reg, Reg) \ + DEF_HELPER_4(glue(punpck ## name ## dq, SUFFIX), void, env, Reg, Reg, Reg) UNPCK_OP(l, 0) UNPCK_OP(h, 1) -#if SHIFT == 1 -DEF_HELPER_3(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg) +#if SHIFT >= 1 +DEF_HELPER_4(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg, Reg) #endif /* 3DNow! float ops */ @@ -265,28 +275,25 @@ DEF_HELPER_3(pswapd, void, env, MMXReg, MMXReg) #endif /* SSSE3 op helpers */ -DEF_HELPER_3(glue(phaddw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phaddd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phaddsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pabsb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pabsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pabsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pshufb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_4(glue(palignr, SUFFIX), void, env, Reg, Reg, s32) +DEF_HELPER_4(glue(phaddw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phaddd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phaddsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pshufb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_5(glue(palignr, SUFFIX), void, env, Reg, Reg, Reg, i32) /* SSE4.1 op helpers */ -#if SHIFT == 1 -DEF_HELPER_3(glue(pblendvb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(blendvps, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(blendvpd, SUFFIX), void, env, Reg, Reg) +#if SHIFT >= 1 +DEF_HELPER_5(glue(pblendvb, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(blendvps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(blendvpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) DEF_HELPER_3(glue(ptest, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovsxbw, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovsxbd, SUFFIX), void, env, Reg, Reg) @@ -300,34 +307,32 @@ DEF_HELPER_3(glue(pmovzxbq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovzxwd, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovzxwq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovzxdq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmuldq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packusdw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminsb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminuw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminud, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxsb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxuw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxud, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmulld, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(pmovsldup, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(pmovshdup, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(pmovdldup, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(pmuldq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packusdw, SUFFIX), void, env, Reg, Reg, Reg) +#if SHIFT == 1 DEF_HELPER_3(glue(phminposuw, SUFFIX), void, env, Reg, Reg) +#endif DEF_HELPER_4(glue(roundps, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(roundpd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(roundss, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(roundsd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(blendps, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(blendpd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(pblendw, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(dpps, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(dppd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, i32) +#if SHIFT == 1 +DEF_HELPER_5(roundss_xmm, void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(roundsd_xmm, void, env, Reg, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(blendps, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(blendpd, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(pblendw, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(dpps, SUFFIX), void, env, Reg, Reg, Reg, i32) +#if SHIFT == 1 +DEF_HELPER_5(glue(dppd, SUFFIX), void, env, Reg, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, Reg, i32) #endif /* SSE4.2 op helpers */ #if SHIFT == 1 -DEF_HELPER_3(glue(pcmpgtq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(pcmpestri, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(pcmpestrm, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(pcmpistri, SUFFIX), void, env, Reg, Reg, i32) @@ -336,14 +341,45 @@ DEF_HELPER_3(crc32, tl, i32, tl, i32) #endif /* AES-NI op helpers */ +#if SHIFT >= 1 +DEF_HELPER_4(glue(aesdec, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesdeclast, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesenc, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesenclast, SUFFIX), void, env, Reg, Reg, Reg) #if SHIFT == 1 -DEF_HELPER_3(glue(aesdec, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesdeclast, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesenc, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesenclast, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(aesimc, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32) +#endif + +/* AVX helpers */ +#if SHIFT >= 1 +DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_3(glue(vpermilpd_imm, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(vpermilps_imm, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_6(glue(vpgatherdd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32) +DEF_HELPER_6(glue(vpgatherdq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32) +DEF_HELPER_6(glue(vpgatherqd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32) +DEF_HELPER_6(glue(vpgatherqq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32) +#if SHIFT == 2 +DEF_HELPER_3(vpermd_ymm, void, Reg, Reg, Reg) +DEF_HELPER_4(vpermdq_ymm, void, Reg, Reg, Reg, i32) +DEF_HELPER_3(vpermq_ymm, void, Reg, Reg, i32) +#endif #endif #undef SHIFT @@ -354,6 +390,9 @@ DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32) #undef SSE_HELPER_W #undef SSE_HELPER_L #undef SSE_HELPER_Q -#undef SSE_HELPER_S +#undef SSE_HELPER_S3 +#undef SSE_HELPER_S4 +#undef SSE_HELPER_P3 +#undef SSE_HELPER_P4 #undef SSE_HELPER_CMP #undef UNPCK_OP diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc new file mode 100644 index 0000000..8e1eb9d --- /dev/null +++ b/target/i386/tcg/decode-new.c.inc @@ -0,0 +1,1795 @@ +/* + * New-style decoder for i386 instructions + * + * Copyright (c) 2022 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * The decoder is mostly based on tables copied from the Intel SDM. As + * a result, most operand load and writeback is done entirely in common + * table-driven code using the same operand type (X86_TYPE_*) and + * size (X86_SIZE_*) codes used in the manual. + * + * The main difference is that the V, U and W types are extended to + * cover MMX as well; if an instruction is like + * + * por Pq, Qq + * 66 por Vx, Hx, Wx + * + * only the second row is included and the instruction is marked as a + * valid MMX instruction. The MMX flag directs the decoder to rewrite + * the V/U/H/W types to P/N/P/Q if there is no prefix, as well as changing + * "x" to "q" if there is no prefix. + * + * In addition, the ss/ps/sd/pd types are sometimes mushed together as "x" + * if the difference is expressed via prefixes. Individual instructions + * are separated by prefix in the generator functions. + * + * There are a couple cases in which instructions (e.g. MOVD) write the + * whole XMM or MM register but are established incorrectly in the manual + * as "d" or "q". These have to be fixed for the decoder to work correctly. + */ + +#define X86_OP_NONE { 0 }, + +#define X86_OP_GROUP3(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) { \ + .decode = glue(decode_, op), \ + .op0 = glue(X86_TYPE_, op0_), \ + .s0 = glue(X86_SIZE_, s0_), \ + .op1 = glue(X86_TYPE_, op1_), \ + .s1 = glue(X86_SIZE_, s1_), \ + .op2 = glue(X86_TYPE_, op2_), \ + .s2 = glue(X86_SIZE_, s2_), \ + .is_decode = true, \ + ## __VA_ARGS__ \ +} + +#define X86_OP_GROUP2(op, op0, s0, op1, s1, ...) \ + X86_OP_GROUP3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__) +#define X86_OP_GROUP0(op, ...) \ + X86_OP_GROUP3(op, None, None, None, None, None, None, ## __VA_ARGS__) + +#define X86_OP_ENTRY3(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) { \ + .gen = glue(gen_, op), \ + .op0 = glue(X86_TYPE_, op0_), \ + .s0 = glue(X86_SIZE_, s0_), \ + .op1 = glue(X86_TYPE_, op1_), \ + .s1 = glue(X86_SIZE_, s1_), \ + .op2 = glue(X86_TYPE_, op2_), \ + .s2 = glue(X86_SIZE_, s2_), \ + ## __VA_ARGS__ \ +} + +#define X86_OP_ENTRY4(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) \ + X86_OP_ENTRY3(op, op0_, s0_, op1_, s1_, op2_, s2_, \ + .op3 = X86_TYPE_I, .s3 = X86_SIZE_b, \ + ## __VA_ARGS__) + +#define X86_OP_ENTRY2(op, op0, s0, op1, s1, ...) \ + X86_OP_ENTRY3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__) +#define X86_OP_ENTRYw(op, op0, s0, ...) \ + X86_OP_ENTRY3(op, op0, s0, None, None, None, None, ## __VA_ARGS__) +#define X86_OP_ENTRYr(op, op0, s0, ...) \ + X86_OP_ENTRY3(op, None, None, None, None, op0, s0, ## __VA_ARGS__) +#define X86_OP_ENTRY0(op, ...) \ + X86_OP_ENTRY3(op, None, None, None, None, None, None, ## __VA_ARGS__) + +#define cpuid(feat) .cpuid = X86_FEAT_##feat, +#define i64 .special = X86_SPECIAL_i64, +#define o64 .special = X86_SPECIAL_o64, +#define xchg .special = X86_SPECIAL_Locked, +#define mmx .special = X86_SPECIAL_MMX, +#define zext0 .special = X86_SPECIAL_ZExtOp0, +#define zext2 .special = X86_SPECIAL_ZExtOp2, +#define avx_movx .special = X86_SPECIAL_AVXExtMov, + +#define vex1 .vex_class = 1, +#define vex1_rep3 .vex_class = 1, .vex_special = X86_VEX_REPScalar, +#define vex2 .vex_class = 2, +#define vex2_rep3 .vex_class = 2, .vex_special = X86_VEX_REPScalar, +#define vex3 .vex_class = 3, +#define vex4 .vex_class = 4, +#define vex4_unal .vex_class = 4, .vex_special = X86_VEX_SSEUnaligned, +#define vex5 .vex_class = 5, +#define vex6 .vex_class = 6, +#define vex7 .vex_class = 7, +#define vex8 .vex_class = 8, +#define vex11 .vex_class = 11, +#define vex12 .vex_class = 12, +#define vex13 .vex_class = 13, + +#define avx2_256 .vex_special = X86_VEX_AVX2_256, + +#define P_00 1 +#define P_66 (1 << PREFIX_DATA) +#define P_F3 (1 << PREFIX_REPZ) +#define P_F2 (1 << PREFIX_REPNZ) + +#define p_00 .valid_prefix = P_00, +#define p_66 .valid_prefix = P_66, +#define p_f3 .valid_prefix = P_F3, +#define p_f2 .valid_prefix = P_F2, +#define p_00_66 .valid_prefix = P_00 | P_66, +#define p_00_f3 .valid_prefix = P_00 | P_F3, +#define p_66_f2 .valid_prefix = P_66 | P_F2, +#define p_00_66_f3 .valid_prefix = P_00 | P_66 | P_F3, +#define p_66_f3_f2 .valid_prefix = P_66 | P_F3 | P_F2, +#define p_00_66_f3_f2 .valid_prefix = P_00 | P_66 | P_F3 | P_F2, + +static uint8_t get_modrm(DisasContext *s, CPUX86State *env) +{ + if (!s->has_modrm) { + s->modrm = x86_ldub_code(env, s); + s->has_modrm = true; + } + return s->modrm; +} + +static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEntry entries[4]) +{ + if (s->prefix & PREFIX_REPNZ) { + return &entries[3]; + } else if (s->prefix & PREFIX_REPZ) { + return &entries[2]; + } else if (s->prefix & PREFIX_DATA) { + return &entries[1]; + } else { + return &entries[0]; + } +} + +static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + /* only includes ldmxcsr and stmxcsr, because they have AVX variants. */ + static const X86OpEntry group15_reg[8] = { + }; + + static const X86OpEntry group15_mem[8] = { + [2] = X86_OP_ENTRYr(LDMXCSR, E,d, vex5), + [3] = X86_OP_ENTRYw(STMXCSR, E,d, vex5), + }; + + uint8_t modrm = get_modrm(s, env); + if ((modrm >> 6) == 3) { + *entry = group15_reg[(modrm >> 3) & 7]; + } else { + *entry = group15_mem[(modrm >> 3) & 7]; + } +} + +static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86GenFunc group17_gen[8] = { + NULL, gen_BLSR, gen_BLSMSK, gen_BLSI, + }; + int op = (get_modrm(s, env) >> 3) & 7; + entry->gen = group17_gen[op]; +} + +static void decode_group12(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_group12[8] = { + {}, + {}, + X86_OP_ENTRY3(PSRLW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + X86_OP_ENTRY3(PSRAW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + X86_OP_ENTRY3(PSLLW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + }; + + int op = (get_modrm(s, env) >> 3) & 7; + *entry = opcodes_group12[op]; +} + +static void decode_group13(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_group13[8] = { + {}, + {}, + X86_OP_ENTRY3(PSRLD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + X86_OP_ENTRY3(PSRAD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + X86_OP_ENTRY3(PSLLD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + {}, + }; + + int op = (get_modrm(s, env) >> 3) & 7; + *entry = opcodes_group13[op]; +} + +static void decode_group14(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_group14[8] = { + /* grp14 */ + {}, + {}, + X86_OP_ENTRY3(PSRLQ_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + X86_OP_ENTRY3(PSRLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66), + {}, + {}, + X86_OP_ENTRY3(PSLLQ_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66), + X86_OP_ENTRY3(PSLLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66), + }; + + int op = (get_modrm(s, env) >> 3) & 7; + *entry = opcodes_group14[op]; +} + +static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F6F[4] = { + X86_OP_ENTRY3(MOVDQ, P,q, None,None, Q,q, vex1 mmx), /* movq */ + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex1), /* movdqa */ + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* movdqu */ + {}, + }; + *entry = *decode_by_prefix(s, opcodes_0F6F); +} + +static void decode_0F70(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry pshufw[4] = { + X86_OP_ENTRY3(PSHUFW, P,q, Q,q, I,b, vex4 mmx), + X86_OP_ENTRY3(PSHUFD, V,x, W,x, I,b, vex4 avx2_256), + X86_OP_ENTRY3(PSHUFHW, V,x, W,x, I,b, vex4 avx2_256), + X86_OP_ENTRY3(PSHUFLW, V,x, W,x, I,b, vex4 avx2_256), + }; + + *entry = *decode_by_prefix(s, pshufw); +} + +static void decode_0F77(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + if (!(s->prefix & PREFIX_VEX)) { + entry->gen = gen_EMMS; + } else if (!s->vex_l) { + entry->gen = gen_VZEROUPPER; + entry->vex_class = 8; + } else { + entry->gen = gen_VZEROALL; + entry->vex_class = 8; + } +} + +static void decode_0F78(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F78[4] = { + {}, + X86_OP_ENTRY3(EXTRQ_i, V,x, None,None, I,w, cpuid(SSE4A)), + {}, + X86_OP_ENTRY3(INSERTQ_i, V,x, U,x, I,w, cpuid(SSE4A)), + }; + *entry = *decode_by_prefix(s, opcodes_0F78); +} + +static void decode_0F79(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + if (s->prefix & PREFIX_REPNZ) { + entry->gen = gen_INSERTQ_r; + } else if (s->prefix & PREFIX_DATA) { + entry->gen = gen_EXTRQ_r; + } else { + entry->gen = NULL; + }; +} + +static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F7E[4] = { + X86_OP_ENTRY3(MOVD_from, E,y, None,None, P,y, vex5 mmx), + X86_OP_ENTRY3(MOVD_from, E,y, None,None, V,y, vex5), + X86_OP_ENTRY3(MOVQ, V,x, None,None, W,q, vex5), /* wrong dest Vy on SDM! */ + {}, + }; + *entry = *decode_by_prefix(s, opcodes_0F7E); +} + +static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F7F[4] = { + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 mmx), /* movq */ + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1), /* movdqa */ + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* movdqu */ + {}, + }; + *entry = *decode_by_prefix(s, opcodes_0F7F); +} + +static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry movq[4] = { + {}, + X86_OP_ENTRY3(MOVQ, W,x, None, None, V,q, vex5), + X86_OP_ENTRY3(MOVq_dq, V,dq, None, None, N,q), + X86_OP_ENTRY3(MOVq_dq, P,q, None, None, U,q), + }; + + *entry = *decode_by_prefix(s, movq); +} + +static const X86OpEntry opcodes_0F38_00toEF[240] = { + [0x00] = X86_OP_ENTRY3(PSHUFB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x01] = X86_OP_ENTRY3(PHADDW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x02] = X86_OP_ENTRY3(PHADDD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x03] = X86_OP_ENTRY3(PHADDSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x04] = X86_OP_ENTRY3(PMADDUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x05] = X86_OP_ENTRY3(PHSUBW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x06] = X86_OP_ENTRY3(PHSUBD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x07] = X86_OP_ENTRY3(PHSUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + + [0x10] = X86_OP_ENTRY2(PBLENDVB, V,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x14] = X86_OP_ENTRY2(BLENDVPS, V,x, W,x, vex4 cpuid(SSE41) p_66), + [0x15] = X86_OP_ENTRY2(BLENDVPD, V,x, W,x, vex4 cpuid(SSE41) p_66), + /* Listed incorrectly as type 4 */ + [0x16] = X86_OP_ENTRY3(VPERMD, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66), + [0x17] = X86_OP_ENTRY3(VPTEST, None,None, V,x, W,x, vex4 cpuid(SSE41) p_66), + + /* + * Source operand listed as Mq/Ux and similar in the manual; incorrectly listed + * as 128-bit only in 2-17. + */ + [0x20] = X86_OP_ENTRY3(VPMOVSXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x21] = X86_OP_ENTRY3(VPMOVSXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x22] = X86_OP_ENTRY3(VPMOVSXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x23] = X86_OP_ENTRY3(VPMOVSXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x24] = X86_OP_ENTRY3(VPMOVSXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x25] = X86_OP_ENTRY3(VPMOVSXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + + /* Same as PMOVSX. */ + [0x30] = X86_OP_ENTRY3(VPMOVZXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x31] = X86_OP_ENTRY3(VPMOVZXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x32] = X86_OP_ENTRY3(VPMOVZXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x33] = X86_OP_ENTRY3(VPMOVZXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x34] = X86_OP_ENTRY3(VPMOVZXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x35] = X86_OP_ENTRY3(VPMOVZXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66), + [0x36] = X86_OP_ENTRY3(VPERMD, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66), + [0x37] = X86_OP_ENTRY3(PCMPGTQ, V,x, H,x, W,x, vex4 cpuid(SSE42) avx2_256 p_66), + + [0x40] = X86_OP_ENTRY3(PMULLD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x41] = X86_OP_ENTRY3(VPHMINPOSUW, V,dq, None,None, W,dq, vex4 cpuid(SSE41) p_66), + /* Listed incorrectly as type 4 */ + [0x45] = X86_OP_ENTRY3(VPSRLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66), + [0x46] = X86_OP_ENTRY3(VPSRAV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66), + [0x47] = X86_OP_ENTRY3(VPSLLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66), + + [0x90] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vpgatherdd/q */ + [0x91] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vpgatherqd/q */ + [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */ + [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */ + + [0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x0b] = X86_OP_ENTRY3(PMULHRSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x0c] = X86_OP_ENTRY3(VPERMILPS, V,x, H,x, W,x, vex4 cpuid(AVX) p_00_66), + [0x0d] = X86_OP_ENTRY3(VPERMILPD, V,x, H,x, W,x, vex4 cpuid(AVX) p_66), + [0x0e] = X86_OP_ENTRY3(VTESTPS, None,None, V,x, W,x, vex4 cpuid(AVX) p_66), + [0x0f] = X86_OP_ENTRY3(VTESTPD, None,None, V,x, W,x, vex4 cpuid(AVX) p_66), + + [0x18] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX) p_66), /* vbroadcastss */ + [0x19] = X86_OP_ENTRY3(VPBROADCASTQ, V,qq, None,None, W,q, vex6 cpuid(AVX) p_66), /* vbroadcastsd */ + [0x1a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX) p_66), + [0x1c] = X86_OP_ENTRY3(PABSB, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x1d] = X86_OP_ENTRY3(PABSW, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + [0x1e] = X86_OP_ENTRY3(PABSD, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + + [0x28] = X86_OP_ENTRY3(PMULDQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x29] = X86_OP_ENTRY3(PCMPEQQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x2a] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, WM,x, vex1 cpuid(SSE41) avx2_256 p_66), /* movntdqa */ + [0x2b] = X86_OP_ENTRY3(VPACKUSDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x2c] = X86_OP_ENTRY3(VMASKMOVPS, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66), + [0x2d] = X86_OP_ENTRY3(VMASKMOVPD, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66), + /* Incorrectly listed as Mx,Hx,Vx in the manual */ + [0x2e] = X86_OP_ENTRY3(VMASKMOVPS_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66), + [0x2f] = X86_OP_ENTRY3(VMASKMOVPD_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66), + + [0x38] = X86_OP_ENTRY3(PMINSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x39] = X86_OP_ENTRY3(PMINSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3a] = X86_OP_ENTRY3(PMINUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3b] = X86_OP_ENTRY3(PMINUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3c] = X86_OP_ENTRY3(PMAXSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3d] = X86_OP_ENTRY3(PMAXSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3e] = X86_OP_ENTRY3(PMAXUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x3f] = X86_OP_ENTRY3(PMAXUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + + [0x58] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX2) p_66), + [0x59] = X86_OP_ENTRY3(VPBROADCASTQ, V,x, None,None, W,q, vex6 cpuid(AVX2) p_66), + [0x5a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX2) p_66), + + [0x78] = X86_OP_ENTRY3(VPBROADCASTB, V,x, None,None, W,b, vex6 cpuid(AVX2) p_66), + [0x79] = X86_OP_ENTRY3(VPBROADCASTW, V,x, None,None, W,w, vex6 cpuid(AVX2) p_66), + + [0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66), + [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66), + + [0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66), + [0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66), + [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), + [0xde] = X86_OP_ENTRY3(VAESDEC, V,x, H,x, W,x, vex4 cpuid(AES) p_66), + [0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), +}; + +/* five rows for no prefix, 66, F3, F2, 66+F2 */ +static const X86OpEntry opcodes_0F38_F0toFF[16][5] = { + [0] = { + X86_OP_ENTRY3(MOVBE, G,y, M,y, None,None, cpuid(MOVBE)), + X86_OP_ENTRY3(MOVBE, G,w, M,w, None,None, cpuid(MOVBE)), + {}, + X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)), + X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)), + }, + [1] = { + X86_OP_ENTRY3(MOVBE, M,y, G,y, None,None, cpuid(MOVBE)), + X86_OP_ENTRY3(MOVBE, M,w, G,w, None,None, cpuid(MOVBE)), + {}, + X86_OP_ENTRY2(CRC32, G,d, E,y, cpuid(SSE42)), + X86_OP_ENTRY2(CRC32, G,d, E,w, cpuid(SSE42)), + }, + [2] = { + X86_OP_ENTRY3(ANDN, G,y, B,y, E,y, vex13 cpuid(BMI1)), + {}, + {}, + {}, + {}, + }, + [3] = { + X86_OP_GROUP3(group17, B,y, E,y, None,None, vex13 cpuid(BMI1)), + {}, + {}, + {}, + {}, + }, + [5] = { + X86_OP_ENTRY3(BZHI, G,y, E,y, B,y, vex13 cpuid(BMI1)), + {}, + X86_OP_ENTRY3(PEXT, G,y, B,y, E,y, vex13 cpuid(BMI2)), + X86_OP_ENTRY3(PDEP, G,y, B,y, E,y, vex13 cpuid(BMI2)), + {}, + }, + [6] = { + {}, + X86_OP_ENTRY2(ADCX, G,y, E,y, cpuid(ADX)), + X86_OP_ENTRY2(ADOX, G,y, E,y, cpuid(ADX)), + X86_OP_ENTRY3(MULX, /* B,y, */ G,y, E,y, 2,y, vex13 cpuid(BMI2)), + {}, + }, + [7] = { + X86_OP_ENTRY3(BEXTR, G,y, E,y, B,y, vex13 cpuid(BMI1)), + X86_OP_ENTRY3(SHLX, G,y, E,y, B,y, vex13 cpuid(BMI1)), + X86_OP_ENTRY3(SARX, G,y, E,y, B,y, vex13 cpuid(BMI1)), + X86_OP_ENTRY3(SHRX, G,y, E,y, B,y, vex13 cpuid(BMI1)), + {}, + }, +}; + +static void decode_0F38(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + *b = x86_ldub_code(env, s); + if (*b < 0xf0) { + *entry = opcodes_0F38_00toEF[*b]; + } else { + int row = 0; + if (s->prefix & PREFIX_REPZ) { + /* The REPZ (F3) prefix has priority over 66 */ + row = 2; + } else { + row += s->prefix & PREFIX_REPNZ ? 3 : 0; + row += s->prefix & PREFIX_DATA ? 1 : 0; + } + *entry = opcodes_0F38_F0toFF[*b & 15][row]; + } +} + +static void decode_VINSERTPS(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry + vinsertps_reg = X86_OP_ENTRY4(VINSERTPS_r, V,dq, H,dq, U,dq, vex5 cpuid(SSE41) p_66), + vinsertps_mem = X86_OP_ENTRY4(VINSERTPS_m, V,dq, H,dq, M,d, vex5 cpuid(SSE41) p_66); + + int modrm = get_modrm(s, env); + *entry = (modrm >> 6) == 3 ? vinsertps_reg : vinsertps_mem; +} + +static const X86OpEntry opcodes_0F3A[256] = { + /* + * These are VEX-only, but incorrectly listed in the manual as exception type 4. + * Also the "qq" instructions are sometimes omitted by Table 2-17, but are VEX256 + * only. + */ + [0x00] = X86_OP_ENTRY3(VPERMQ, V,qq, W,qq, I,b, vex6 cpuid(AVX2) p_66), + [0x01] = X86_OP_ENTRY3(VPERMQ, V,qq, W,qq, I,b, vex6 cpuid(AVX2) p_66), /* VPERMPD */ + [0x02] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66), /* VPBLENDD */ + [0x04] = X86_OP_ENTRY3(VPERMILPS_i, V,x, W,x, I,b, vex6 cpuid(AVX) p_66), + [0x05] = X86_OP_ENTRY3(VPERMILPD_i, V,x, W,x, I,b, vex6 cpuid(AVX) p_66), + [0x06] = X86_OP_ENTRY4(VPERM2x128, V,qq, H,qq, W,qq, vex6 cpuid(AVX) p_66), + + [0x14] = X86_OP_ENTRY3(PEXTRB, E,b, V,dq, I,b, vex5 cpuid(SSE41) zext0 p_66), + [0x15] = X86_OP_ENTRY3(PEXTRW, E,w, V,dq, I,b, vex5 cpuid(SSE41) zext0 p_66), + [0x16] = X86_OP_ENTRY3(PEXTR, E,y, V,dq, I,b, vex5 cpuid(SSE41) p_66), + [0x17] = X86_OP_ENTRY3(VEXTRACTPS, E,d, V,dq, I,b, vex5 cpuid(SSE41) p_66), + + [0x20] = X86_OP_ENTRY4(PINSRB, V,dq, H,dq, E,b, vex5 cpuid(SSE41) zext2 p_66), + [0x21] = X86_OP_GROUP0(VINSERTPS), + [0x22] = X86_OP_ENTRY4(PINSR, V,dq, H,dq, E,y, vex5 cpuid(SSE41) p_66), + + [0x40] = X86_OP_ENTRY4(VDDPS, V,x, H,x, W,x, vex2 cpuid(SSE41) p_66), + [0x41] = X86_OP_ENTRY4(VDDPD, V,dq, H,dq, W,dq, vex2 cpuid(SSE41) p_66), + [0x42] = X86_OP_ENTRY4(VMPSADBW, V,x, H,x, W,x, vex2 cpuid(SSE41) avx2_256 p_66), + [0x44] = X86_OP_ENTRY4(PCLMULQDQ, V,dq, H,dq, W,dq, vex4 cpuid(PCLMULQDQ) p_66), + [0x46] = X86_OP_ENTRY4(VPERM2x128, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66), + + [0x60] = X86_OP_ENTRY4(PCMPESTRM, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66), + [0x61] = X86_OP_ENTRY4(PCMPESTRI, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66), + [0x62] = X86_OP_ENTRY4(PCMPISTRM, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66), + [0x63] = X86_OP_ENTRY4(PCMPISTRI, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66), + + [0x08] = X86_OP_ENTRY3(VROUNDPS, V,x, W,x, I,b, vex2 cpuid(SSE41) p_66), + [0x09] = X86_OP_ENTRY3(VROUNDPD, V,x, W,x, I,b, vex2 cpuid(SSE41) p_66), + /* + * Not listed as four operand in the manual. Also writes and reads 128-bits + * from the first two operands due to the V operand picking higher entries of + * the H operand; the "Vss,Hss,Wss" description from the manual is incorrect. + * For other unary operations such as VSQRTSx this is hidden by the "REPScalar" + * value of vex_special, because the table lists the operand types of VSQRTPx. + */ + [0x0a] = X86_OP_ENTRY4(VROUNDSS, V,x, H,x, W,ss, vex3 cpuid(SSE41) p_66), + [0x0b] = X86_OP_ENTRY4(VROUNDSD, V,x, H,x, W,sd, vex3 cpuid(SSE41) p_66), + [0x0c] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66), + [0x0d] = X86_OP_ENTRY4(VBLENDPD, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66), + [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), + + [0x18] = X86_OP_ENTRY4(VINSERTx128, V,qq, H,qq, W,qq, vex6 cpuid(AVX) p_66), + [0x19] = X86_OP_ENTRY3(VEXTRACTx128, W,dq, V,qq, I,b, vex6 cpuid(AVX) p_66), + + [0x38] = X86_OP_ENTRY4(VINSERTx128, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66), + [0x39] = X86_OP_ENTRY3(VEXTRACTx128, W,dq, V,qq, I,b, vex6 cpuid(AVX2) p_66), + + /* Listed incorrectly as type 4 */ + [0x4a] = X86_OP_ENTRY4(VBLENDVPS, V,x, H,x, W,x, vex6 cpuid(AVX) p_66), + [0x4b] = X86_OP_ENTRY4(VBLENDVPD, V,x, H,x, W,x, vex6 cpuid(AVX) p_66), + [0x4c] = X86_OP_ENTRY4(VPBLENDVB, V,x, H,x, W,x, vex6 cpuid(AVX) p_66 avx2_256), + + [0xdf] = X86_OP_ENTRY3(VAESKEYGEN, V,dq, W,dq, I,b, vex4 cpuid(AES) p_66), + + [0xF0] = X86_OP_ENTRY3(RORX, G,y, E,y, I,b, vex13 cpuid(BMI2) p_f2), +}; + +static void decode_0F3A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + *b = x86_ldub_code(env, s); + *entry = opcodes_0F3A[*b]; +} + +/* + * There are some mistakes in the operands in the manual, and the load/store/register + * cases are easiest to keep separate, so the entries for 10-17 follow simplicity and + * efficiency of implementation rather than copying what the manual says. + * + * In particular: + * + * 1) "VMOVSS m32, xmm1" and "VMOVSD m64, xmm1" do not support VEX.vvvv != 1111b, + * but this is not mentioned in the tables. + * + * 2) MOVHLPS, MOVHPS, MOVHPD, MOVLPD, MOVLPS read the high quadword of one of their + * operands, which must therefore be dq; MOVLPD and MOVLPS also write the high + * quadword of the V operand. + */ +static void decode_0F10(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F10_reg[4] = { + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */ + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */ + X86_OP_ENTRY3(VMOVSS, V,x, H,x, W,x, vex4), + X86_OP_ENTRY3(VMOVLPx, V,x, H,x, W,x, vex4), /* MOVSD */ + }; + + static const X86OpEntry opcodes_0F10_mem[4] = { + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */ + X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */ + X86_OP_ENTRY3(VMOVSS_ld, V,x, H,x, M,ss, vex4), + X86_OP_ENTRY3(VMOVSD_ld, V,x, H,x, M,sd, vex4), + }; + + if ((get_modrm(s, env) >> 6) == 3) { + *entry = *decode_by_prefix(s, opcodes_0F10_reg); + } else { + *entry = *decode_by_prefix(s, opcodes_0F10_mem); + } +} + +static void decode_0F11(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F11_reg[4] = { + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPS */ + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPD */ + X86_OP_ENTRY3(VMOVSS, W,x, H,x, V,x, vex4), + X86_OP_ENTRY3(VMOVLPx, W,x, H,x, V,q, vex4), /* MOVSD */ + }; + + static const X86OpEntry opcodes_0F11_mem[4] = { + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPS */ + X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPD */ + X86_OP_ENTRY3(VMOVSS_st, M,ss, None,None, V,x, vex4), + X86_OP_ENTRY3(VMOVLPx_st, M,sd, None,None, V,x, vex4), /* MOVSD */ + }; + + if ((get_modrm(s, env) >> 6) == 3) { + *entry = *decode_by_prefix(s, opcodes_0F11_reg); + } else { + *entry = *decode_by_prefix(s, opcodes_0F11_mem); + } +} + +static void decode_0F12(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F12_mem[4] = { + /* + * Use dq for operand for compatibility with gen_MOVSD and + * to allow VEX128 only. + */ + X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPS */ + X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPD */ + X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)), + X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, WM,q, vex4 cpuid(SSE3)), /* qq if VEX.256 */ + }; + static const X86OpEntry opcodes_0F12_reg[4] = { + X86_OP_ENTRY3(VMOVHLPS, V,dq, H,dq, U,dq, vex4), + X86_OP_ENTRY3(VMOVLPx, W,x, H,x, U,q, vex4), /* MOVLPD */ + X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)), + X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)), + }; + + if ((get_modrm(s, env) >> 6) == 3) { + *entry = *decode_by_prefix(s, opcodes_0F12_reg); + } else { + *entry = *decode_by_prefix(s, opcodes_0F12_mem); + if ((s->prefix & PREFIX_REPNZ) && s->vex_l) { + entry->s2 = X86_SIZE_qq; + } + } +} + +static void decode_0F16(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F16_mem[4] = { + /* + * Operand 1 technically only reads the low 64 bits, but uses dq so that + * it is easier to check for op0 == op1 in an endianness-neutral manner. + */ + X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPS */ + X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPD */ + X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)), + {}, + }; + static const X86OpEntry opcodes_0F16_reg[4] = { + /* Same as above, operand 1 could be Hq if it wasn't for big-endian. */ + X86_OP_ENTRY3(VMOVLHPS, V,dq, H,dq, U,q, vex4), + X86_OP_ENTRY3(VMOVHPx, V,x, H,x, U,x, vex4), /* MOVHPD */ + X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)), + {}, + }; + + if ((get_modrm(s, env) >> 6) == 3) { + *entry = *decode_by_prefix(s, opcodes_0F16_reg); + } else { + *entry = *decode_by_prefix(s, opcodes_0F16_mem); + } +} + +static void decode_0F2A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F2A[4] = { + X86_OP_ENTRY3(CVTPI2Px, V,x, None,None, Q,q), + X86_OP_ENTRY3(CVTPI2Px, V,x, None,None, Q,q), + X86_OP_ENTRY3(VCVTSI2Sx, V,x, H,x, E,y, vex3), + X86_OP_ENTRY3(VCVTSI2Sx, V,x, H,x, E,y, vex3), + }; + *entry = *decode_by_prefix(s, opcodes_0F2A); +} + +static void decode_0F2B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F2B[4] = { + X86_OP_ENTRY3(MOVDQ, M,x, None,None, V,x, vex4), /* MOVNTPS */ + X86_OP_ENTRY3(MOVDQ, M,x, None,None, V,x, vex4), /* MOVNTPD */ + X86_OP_ENTRY3(VMOVSS_st, M,ss, None,None, V,x, vex4 cpuid(SSE4A)), /* MOVNTSS */ + X86_OP_ENTRY3(VMOVLPx_st, M,sd, None,None, V,x, vex4 cpuid(SSE4A)), /* MOVNTSD */ + }; + + *entry = *decode_by_prefix(s, opcodes_0F2B); +} + +static void decode_0F2C(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F2C[4] = { + /* Listed as ps/pd in the manual, but CVTTPS2PI only reads 64-bit. */ + X86_OP_ENTRY3(CVTTPx2PI, P,q, None,None, W,q), + X86_OP_ENTRY3(CVTTPx2PI, P,q, None,None, W,dq), + X86_OP_ENTRY3(VCVTTSx2SI, G,y, None,None, W,ss, vex3), + X86_OP_ENTRY3(VCVTTSx2SI, G,y, None,None, W,sd, vex3), + }; + *entry = *decode_by_prefix(s, opcodes_0F2C); +} + +static void decode_0F2D(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F2D[4] = { + /* Listed as ps/pd in the manual, but CVTPS2PI only reads 64-bit. */ + X86_OP_ENTRY3(CVTPx2PI, P,q, None,None, W,q), + X86_OP_ENTRY3(CVTPx2PI, P,q, None,None, W,dq), + X86_OP_ENTRY3(VCVTSx2SI, G,y, None,None, W,ss, vex3), + X86_OP_ENTRY3(VCVTSx2SI, G,y, None,None, W,sd, vex3), + }; + *entry = *decode_by_prefix(s, opcodes_0F2D); +} + +static void decode_sse_unary(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ))) { + entry->op1 = X86_TYPE_None; + entry->s1 = X86_SIZE_None; + } + switch (*b) { + case 0x51: entry->gen = gen_VSQRT; break; + case 0x52: entry->gen = gen_VRSQRT; break; + case 0x53: entry->gen = gen_VRCP; break; + case 0x5A: entry->gen = gen_VCVTfp2fp; break; + } +} + +static void decode_0F5B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0F5B[4] = { + X86_OP_ENTRY2(VCVTDQ2PS, V,x, W,x, vex2), + X86_OP_ENTRY2(VCVTPS2DQ, V,x, W,x, vex2), + X86_OP_ENTRY2(VCVTTPS2DQ, V,x, W,x, vex2), + {}, + }; + *entry = *decode_by_prefix(s, opcodes_0F5B); +} + +static void decode_0FE6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry opcodes_0FE6[4] = { + {}, + X86_OP_ENTRY2(VCVTTPD2DQ, V,x, W,x, vex2), + X86_OP_ENTRY2(VCVTDQ2PD, V,x, W,x, vex2), + X86_OP_ENTRY2(VCVTPD2DQ, V,x, W,x, vex2), + }; + *entry = *decode_by_prefix(s, opcodes_0FE6); +} + +static const X86OpEntry opcodes_0F[256] = { + [0x0E] = X86_OP_ENTRY0(EMMS, cpuid(3DNOW)), /* femms */ + /* + * 3DNow!'s opcode byte comes *after* modrm and displacements, making it + * more like an Ib operand. Dispatch to the right helper in a single gen_* + * function. + */ + [0x0F] = X86_OP_ENTRY3(3dnow, P,q, Q,q, I,b, cpuid(3DNOW)), + + [0x10] = X86_OP_GROUP0(0F10), + [0x11] = X86_OP_GROUP0(0F11), + [0x12] = X86_OP_GROUP0(0F12), + [0x13] = X86_OP_ENTRY3(VMOVLPx_st, M,q, None,None, V,q, vex4 p_00_66), + [0x14] = X86_OP_ENTRY3(VUNPCKLPx, V,x, H,x, W,x, vex4 p_00_66), + [0x15] = X86_OP_ENTRY3(VUNPCKHPx, V,x, H,x, W,x, vex4 p_00_66), + [0x16] = X86_OP_GROUP0(0F16), + /* Incorrectly listed as Mq,Vq in the manual */ + [0x17] = X86_OP_ENTRY3(VMOVHPx_st, M,q, None,None, V,dq, vex4 p_00_66), + + [0x50] = X86_OP_ENTRY3(MOVMSK, G,y, None,None, U,x, vex7 p_00_66), + [0x51] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x52] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex5 p_00_f3), + [0x53] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex5 p_00_f3), + [0x54] = X86_OP_ENTRY3(PAND, V,x, H,x, W,x, vex4 p_00_66), /* vand */ + [0x55] = X86_OP_ENTRY3(PANDN, V,x, H,x, W,x, vex4 p_00_66), /* vandn */ + [0x56] = X86_OP_ENTRY3(POR, V,x, H,x, W,x, vex4 p_00_66), /* vor */ + [0x57] = X86_OP_ENTRY3(PXOR, V,x, H,x, W,x, vex4 p_00_66), /* vxor */ + + [0x60] = X86_OP_ENTRY3(PUNPCKLBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x61] = X86_OP_ENTRY3(PUNPCKLWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x62] = X86_OP_ENTRY3(PUNPCKLDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x63] = X86_OP_ENTRY3(PACKSSWB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x64] = X86_OP_ENTRY3(PCMPGTB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x65] = X86_OP_ENTRY3(PCMPGTW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x66] = X86_OP_ENTRY3(PCMPGTD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x67] = X86_OP_ENTRY3(PACKUSWB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + + [0x70] = X86_OP_GROUP0(0F70), + [0x71] = X86_OP_GROUP0(group12), + [0x72] = X86_OP_GROUP0(group13), + [0x73] = X86_OP_GROUP0(group14), + [0x74] = X86_OP_ENTRY3(PCMPEQB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x75] = X86_OP_ENTRY3(PCMPEQW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x76] = X86_OP_ENTRY3(PCMPEQD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x77] = X86_OP_GROUP0(0F77), + + [0x28] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex1 p_00_66), /* MOVAPS */ + [0x29] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 p_00_66), /* MOVAPS */ + [0x2A] = X86_OP_GROUP0(0F2A), + [0x2B] = X86_OP_GROUP0(0F2B), + [0x2C] = X86_OP_GROUP0(0F2C), + [0x2D] = X86_OP_GROUP0(0F2D), + [0x2E] = X86_OP_ENTRY3(VUCOMI, None,None, V,x, W,x, vex4 p_00_66), + [0x2F] = X86_OP_ENTRY3(VCOMI, None,None, V,x, W,x, vex4 p_00_66), + + [0x38] = X86_OP_GROUP0(0F38), + [0x3a] = X86_OP_GROUP0(0F3A), + + [0x58] = X86_OP_ENTRY3(VADD, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x59] = X86_OP_ENTRY3(VMUL, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x5a] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex3 p_00_66_f3_f2), + [0x5b] = X86_OP_GROUP0(0F5B), + [0x5c] = X86_OP_ENTRY3(VSUB, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x5d] = X86_OP_ENTRY3(VMIN, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x5e] = X86_OP_ENTRY3(VDIV, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0x5f] = X86_OP_ENTRY3(VMAX, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + + [0x68] = X86_OP_ENTRY3(PUNPCKHBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x69] = X86_OP_ENTRY3(PUNPCKHWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x6a] = X86_OP_ENTRY3(PUNPCKHDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x6b] = X86_OP_ENTRY3(PACKSSDW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0x6c] = X86_OP_ENTRY3(PUNPCKLQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256), + [0x6d] = X86_OP_ENTRY3(PUNPCKHQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256), + [0x6e] = X86_OP_ENTRY3(MOVD_to, V,x, None,None, E,y, vex5 mmx p_00_66), /* wrong dest Vy on SDM! */ + [0x6f] = X86_OP_GROUP0(0F6F), + + [0x78] = X86_OP_GROUP0(0F78), + [0x79] = X86_OP_GROUP2(0F79, V,x, U,x, cpuid(SSE4A)), + [0x7c] = X86_OP_ENTRY3(VHADD, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2), + [0x7d] = X86_OP_ENTRY3(VHSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2), + [0x7e] = X86_OP_GROUP0(0F7E), + [0x7f] = X86_OP_GROUP0(0F7F), + + [0xae] = X86_OP_GROUP0(group15), + + [0xc2] = X86_OP_ENTRY4(VCMP, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), + [0xc4] = X86_OP_ENTRY4(PINSRW, V,dq,H,dq,E,w, vex5 mmx p_00_66), + [0xc5] = X86_OP_ENTRY3(PEXTRW, G,d, U,dq,I,b, vex5 mmx p_00_66), + [0xc6] = X86_OP_ENTRY4(VSHUF, V,x, H,x, W,x, vex4 p_00_66), + + [0xd0] = X86_OP_ENTRY3(VADDSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2), + [0xd1] = X86_OP_ENTRY3(PSRLW_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd2] = X86_OP_ENTRY3(PSRLD_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd3] = X86_OP_ENTRY3(PSRLQ_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd4] = X86_OP_ENTRY3(PADDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd5] = X86_OP_ENTRY3(PMULLW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd6] = X86_OP_GROUP0(0FD6), + [0xd7] = X86_OP_ENTRY3(PMOVMSKB, G,d, None,None, U,x, vex7 mmx avx2_256 p_00_66), + + [0xe0] = X86_OP_ENTRY3(PAVGB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xe1] = X86_OP_ENTRY3(PSRAW_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66), + [0xe2] = X86_OP_ENTRY3(PSRAD_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66), + [0xe3] = X86_OP_ENTRY3(PAVGW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xe4] = X86_OP_ENTRY3(PMULHUW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xe5] = X86_OP_ENTRY3(PMULHW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xe6] = X86_OP_GROUP0(0FE6), + [0xe7] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 mmx p_00_66), /* MOVNTQ/MOVNTDQ */ + + [0xf0] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, WM,x, vex4_unal cpuid(SSE3) p_f2), /* LDDQU */ + [0xf1] = X86_OP_ENTRY3(PSLLW_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66), + [0xf2] = X86_OP_ENTRY3(PSLLD_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66), + [0xf3] = X86_OP_ENTRY3(PSLLQ_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66), + [0xf4] = X86_OP_ENTRY3(PMULUDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xf5] = X86_OP_ENTRY3(PMADDWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xf6] = X86_OP_ENTRY3(PSADBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xf7] = X86_OP_ENTRY3(MASKMOV, None,None, V,dq, U,dq, vex4_unal avx2_256 mmx p_00_66), + + /* Incorrectly missing from 2-17 */ + [0xd8] = X86_OP_ENTRY3(PSUBUSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xd9] = X86_OP_ENTRY3(PSUBUSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xda] = X86_OP_ENTRY3(PMINUB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xdb] = X86_OP_ENTRY3(PAND, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xdc] = X86_OP_ENTRY3(PADDUSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xdd] = X86_OP_ENTRY3(PADDUSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xde] = X86_OP_ENTRY3(PMAXUB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xdf] = X86_OP_ENTRY3(PANDN, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + + [0xe8] = X86_OP_ENTRY3(PSUBSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xe9] = X86_OP_ENTRY3(PSUBSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xea] = X86_OP_ENTRY3(PMINSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xeb] = X86_OP_ENTRY3(POR, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xec] = X86_OP_ENTRY3(PADDSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xed] = X86_OP_ENTRY3(PADDSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xee] = X86_OP_ENTRY3(PMAXSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xef] = X86_OP_ENTRY3(PXOR, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + + [0xf8] = X86_OP_ENTRY3(PSUBB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xf9] = X86_OP_ENTRY3(PSUBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xfa] = X86_OP_ENTRY3(PSUBD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xfb] = X86_OP_ENTRY3(PSUBQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xfc] = X86_OP_ENTRY3(PADDB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xfd] = X86_OP_ENTRY3(PADDW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + [0xfe] = X86_OP_ENTRY3(PADDD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), + /* 0xff = UD0 */ +}; + +static void do_decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + *entry = opcodes_0F[*b]; +} + +static void decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + *b = x86_ldub_code(env, s); + do_decode_0F(s, env, entry, b); +} + +static const X86OpEntry opcodes_root[256] = { + [0x0F] = X86_OP_GROUP0(0F), +}; + +#undef mmx +#undef vex1 +#undef vex2 +#undef vex3 +#undef vex4 +#undef vex4_unal +#undef vex5 +#undef vex6 +#undef vex7 +#undef vex8 +#undef vex11 +#undef vex12 +#undef vex13 + +/* + * Decode the fixed part of the opcode and place the last + * in b. + */ +static void decode_root(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + *entry = opcodes_root[*b]; +} + + +static int decode_modrm(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + X86DecodedOp *op, X86OpType type) +{ + int modrm = get_modrm(s, env); + if ((modrm >> 6) == 3) { + if (s->prefix & PREFIX_LOCK) { + decode->e.gen = gen_illegal; + return 0xff; + } + op->n = (modrm & 7); + if (type != X86_TYPE_Q && type != X86_TYPE_N) { + op->n |= REX_B(s); + } + } else { + op->has_ea = true; + op->n = -1; + decode->mem = gen_lea_modrm_0(env, s, get_modrm(s, env)); + } + return modrm; +} + +static bool decode_op_size(DisasContext *s, X86OpEntry *e, X86OpSize size, MemOp *ot) +{ + switch (size) { + case X86_SIZE_b: /* byte */ + *ot = MO_8; + return true; + + case X86_SIZE_d: /* 32-bit */ + case X86_SIZE_ss: /* SSE/AVX scalar single precision */ + *ot = MO_32; + return true; + + case X86_SIZE_p: /* Far pointer, return offset size */ + case X86_SIZE_s: /* Descriptor, return offset size */ + case X86_SIZE_v: /* 16/32/64-bit, based on operand size */ + *ot = s->dflag; + return true; + + case X86_SIZE_pi: /* MMX */ + case X86_SIZE_q: /* 64-bit */ + case X86_SIZE_sd: /* SSE/AVX scalar double precision */ + *ot = MO_64; + return true; + + case X86_SIZE_w: /* 16-bit */ + *ot = MO_16; + return true; + + case X86_SIZE_y: /* 32/64-bit, based on operand size */ + *ot = s->dflag == MO_16 ? MO_32 : s->dflag; + return true; + + case X86_SIZE_z: /* 16-bit for 16-bit operand size, else 32-bit */ + *ot = s->dflag == MO_16 ? MO_16 : MO_32; + return true; + + case X86_SIZE_dq: /* SSE/AVX 128-bit */ + if (e->special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + *ot = MO_64; + return true; + } + if (s->vex_l && e->s0 != X86_SIZE_qq && e->s1 != X86_SIZE_qq) { + return false; + } + *ot = MO_128; + return true; + + case X86_SIZE_qq: /* AVX 256-bit */ + if (!s->vex_l) { + return false; + } + *ot = MO_256; + return true; + + case X86_SIZE_x: /* 128/256-bit, based on operand size */ + if (e->special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + *ot = MO_64; + return true; + } + /* fall through */ + case X86_SIZE_ps: /* SSE/AVX packed single precision */ + case X86_SIZE_pd: /* SSE/AVX packed double precision */ + *ot = s->vex_l ? MO_256 : MO_128; + return true; + + case X86_SIZE_d64: /* Default to 64-bit in 64-bit mode */ + *ot = CODE64(s) && s->dflag == MO_32 ? MO_64 : s->dflag; + return true; + + case X86_SIZE_f64: /* Ignore size override prefix in 64-bit mode */ + *ot = CODE64(s) ? MO_64 : s->dflag; + return true; + + default: + *ot = -1; + return true; + } +} + +static bool decode_op(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + X86DecodedOp *op, X86OpType type, int b) +{ + int modrm; + + switch (type) { + case X86_TYPE_None: /* Implicit or absent */ + case X86_TYPE_A: /* Implicit */ + case X86_TYPE_F: /* EFLAGS/RFLAGS */ + break; + + case X86_TYPE_B: /* VEX.vvvv selects a GPR */ + op->unit = X86_OP_INT; + op->n = s->vex_v; + break; + + case X86_TYPE_C: /* REG in the modrm byte selects a control register */ + op->unit = X86_OP_CR; + goto get_reg; + + case X86_TYPE_D: /* REG in the modrm byte selects a debug register */ + op->unit = X86_OP_DR; + goto get_reg; + + case X86_TYPE_G: /* REG in the modrm byte selects a GPR */ + op->unit = X86_OP_INT; + goto get_reg; + + case X86_TYPE_S: /* reg selects a segment register */ + op->unit = X86_OP_SEG; + goto get_reg; + + case X86_TYPE_P: + op->unit = X86_OP_MMX; + goto get_reg; + + case X86_TYPE_V: /* reg in the modrm byte selects an XMM/YMM register */ + if (decode->e.special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + op->unit = X86_OP_MMX; + } else { + op->unit = X86_OP_SSE; + } + get_reg: + op->n = ((get_modrm(s, env) >> 3) & 7) | REX_R(s); + break; + + case X86_TYPE_E: /* ALU modrm operand */ + op->unit = X86_OP_INT; + goto get_modrm; + + case X86_TYPE_Q: /* MMX modrm operand */ + op->unit = X86_OP_MMX; + goto get_modrm; + + case X86_TYPE_W: /* XMM/YMM modrm operand */ + if (decode->e.special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + op->unit = X86_OP_MMX; + } else { + op->unit = X86_OP_SSE; + } + goto get_modrm; + + case X86_TYPE_N: /* R/M in the modrm byte selects an MMX register */ + op->unit = X86_OP_MMX; + goto get_modrm_reg; + + case X86_TYPE_U: /* R/M in the modrm byte selects an XMM/YMM register */ + if (decode->e.special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + op->unit = X86_OP_MMX; + } else { + op->unit = X86_OP_SSE; + } + goto get_modrm_reg; + + case X86_TYPE_R: /* R/M in the modrm byte selects a register */ + op->unit = X86_OP_INT; + get_modrm_reg: + modrm = get_modrm(s, env); + if ((modrm >> 6) != 3) { + return false; + } + goto get_modrm; + + case X86_TYPE_WM: /* modrm byte selects an XMM/YMM memory operand */ + op->unit = X86_OP_SSE; + /* fall through */ + case X86_TYPE_M: /* modrm byte selects a memory operand */ + modrm = get_modrm(s, env); + if ((modrm >> 6) == 3) { + return false; + } + get_modrm: + decode_modrm(s, env, decode, op, type); + break; + + case X86_TYPE_O: /* Absolute address encoded in the instruction */ + op->unit = X86_OP_INT; + op->has_ea = true; + op->n = -1; + decode->mem = (AddressParts) { + .def_seg = R_DS, + .base = -1, + .index = -1, + .disp = insn_get_addr(env, s, s->aflag) + }; + break; + + case X86_TYPE_H: /* For AVX, VEX.vvvv selects an XMM/YMM register */ + if ((s->prefix & PREFIX_VEX)) { + op->unit = X86_OP_SSE; + op->n = s->vex_v; + break; + } + if (op == &decode->op[0]) { + /* shifts place the destination in VEX.vvvv, use modrm */ + return decode_op(s, env, decode, op, decode->e.op1, b); + } else { + return decode_op(s, env, decode, op, decode->e.op0, b); + } + + case X86_TYPE_I: /* Immediate */ + op->unit = X86_OP_IMM; + decode->immediate = insn_get_signed(env, s, op->ot); + break; + + case X86_TYPE_J: /* Relative offset for a jump */ + op->unit = X86_OP_IMM; + decode->immediate = insn_get_signed(env, s, op->ot); + decode->immediate += s->pc - s->cs_base; + if (s->dflag == MO_16) { + decode->immediate &= 0xffff; + } else if (!CODE64(s)) { + decode->immediate &= 0xffffffffu; + } + break; + + case X86_TYPE_L: /* The upper 4 bits of the immediate select a 128-bit register */ + op->n = insn_get(env, s, op->ot) >> 4; + break; + + case X86_TYPE_X: /* string source */ + op->n = -1; + decode->mem = (AddressParts) { + .def_seg = R_DS, + .base = R_ESI, + .index = -1, + }; + break; + + case X86_TYPE_Y: /* string destination */ + op->n = -1; + decode->mem = (AddressParts) { + .def_seg = R_ES, + .base = R_EDI, + .index = -1, + }; + break; + + case X86_TYPE_2op: + *op = decode->op[0]; + break; + + case X86_TYPE_LoBits: + op->n = (b & 7) | REX_B(s); + op->unit = X86_OP_INT; + break; + + case X86_TYPE_0 ... X86_TYPE_7: + op->n = type - X86_TYPE_0; + op->unit = X86_OP_INT; + break; + + case X86_TYPE_ES ... X86_TYPE_GS: + op->n = type - X86_TYPE_ES; + op->unit = X86_OP_SEG; + break; + } + + return true; +} + +static bool validate_sse_prefix(DisasContext *s, X86OpEntry *e) +{ + uint16_t sse_prefixes; + + if (!e->valid_prefix) { + return true; + } + if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) { + /* In SSE instructions, 0xF3 and 0xF2 cancel 0x66. */ + s->prefix &= ~PREFIX_DATA; + } + + /* Now, either zero or one bit is set in sse_prefixes. */ + sse_prefixes = s->prefix & (PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); + return e->valid_prefix & (1 << sse_prefixes); +} + +static bool decode_insn(DisasContext *s, CPUX86State *env, X86DecodeFunc decode_func, + X86DecodedInsn *decode) +{ + X86OpEntry *e = &decode->e; + + decode_func(s, env, e, &decode->b); + while (e->is_decode) { + e->is_decode = false; + e->decode(s, env, e, &decode->b); + } + + if (!validate_sse_prefix(s, e)) { + return false; + } + + /* First compute size of operands in order to initialize s->rip_offset. */ + if (e->op0 != X86_TYPE_None) { + if (!decode_op_size(s, e, e->s0, &decode->op[0].ot)) { + return false; + } + if (e->op0 == X86_TYPE_I) { + s->rip_offset += 1 << decode->op[0].ot; + } + } + if (e->op1 != X86_TYPE_None) { + if (!decode_op_size(s, e, e->s1, &decode->op[1].ot)) { + return false; + } + if (e->op1 == X86_TYPE_I) { + s->rip_offset += 1 << decode->op[1].ot; + } + } + if (e->op2 != X86_TYPE_None) { + if (!decode_op_size(s, e, e->s2, &decode->op[2].ot)) { + return false; + } + if (e->op2 == X86_TYPE_I) { + s->rip_offset += 1 << decode->op[2].ot; + } + } + if (e->op3 != X86_TYPE_None) { + /* + * A couple instructions actually use the extra immediate byte for an Lx + * register operand; those are handled in the gen_* functions as one off. + */ + assert(e->op3 == X86_TYPE_I && e->s3 == X86_SIZE_b); + s->rip_offset += 1; + } + + if (e->op0 != X86_TYPE_None && + !decode_op(s, env, decode, &decode->op[0], e->op0, decode->b)) { + return false; + } + + if (e->op1 != X86_TYPE_None && + !decode_op(s, env, decode, &decode->op[1], e->op1, decode->b)) { + return false; + } + + if (e->op2 != X86_TYPE_None && + !decode_op(s, env, decode, &decode->op[2], e->op2, decode->b)) { + return false; + } + + if (e->op3 != X86_TYPE_None) { + decode->immediate = insn_get_signed(env, s, MO_8); + } + + return true; +} + +static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) +{ + switch (cpuid) { + case X86_FEAT_None: + return true; + case X86_FEAT_MOVBE: + return (s->cpuid_ext_features & CPUID_EXT_MOVBE); + case X86_FEAT_PCLMULQDQ: + return (s->cpuid_ext_features & CPUID_EXT_PCLMULQDQ); + case X86_FEAT_SSE: + return (s->cpuid_ext_features & CPUID_SSE); + case X86_FEAT_SSE2: + return (s->cpuid_ext_features & CPUID_SSE2); + case X86_FEAT_SSE3: + return (s->cpuid_ext_features & CPUID_EXT_SSE3); + case X86_FEAT_SSSE3: + return (s->cpuid_ext_features & CPUID_EXT_SSSE3); + case X86_FEAT_SSE41: + return (s->cpuid_ext_features & CPUID_EXT_SSE41); + case X86_FEAT_SSE42: + return (s->cpuid_ext_features & CPUID_EXT_SSE42); + case X86_FEAT_AES: + if (!(s->cpuid_ext_features & CPUID_EXT_AES)) { + return false; + } else if (!(s->prefix & PREFIX_VEX)) { + return true; + } else if (!(s->cpuid_ext_features & CPUID_EXT_AVX)) { + return false; + } else { + return !s->vex_l || (s->cpuid_7_0_ecx_features & CPUID_7_0_ECX_VAES); + } + + case X86_FEAT_AVX: + return (s->cpuid_ext_features & CPUID_EXT_AVX); + + case X86_FEAT_3DNOW: + return (s->cpuid_ext2_features & CPUID_EXT2_3DNOW); + case X86_FEAT_SSE4A: + return (s->cpuid_ext3_features & CPUID_EXT3_SSE4A); + + case X86_FEAT_ADX: + return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX); + case X86_FEAT_BMI1: + return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1); + case X86_FEAT_BMI2: + return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2); + case X86_FEAT_AVX2: + return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2); + } + g_assert_not_reached(); +} + +static bool validate_vex(DisasContext *s, X86DecodedInsn *decode) +{ + X86OpEntry *e = &decode->e; + + switch (e->vex_special) { + case X86_VEX_REPScalar: + /* + * Instructions which differ between 00/66 and F2/F3 in the + * exception classification and the size of the memory operand. + */ + assert(e->vex_class == 1 || e->vex_class == 2); + if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) { + e->vex_class = 3; + if (s->vex_l) { + goto illegal; + } + assert(decode->e.s2 == X86_SIZE_x); + if (decode->op[2].has_ea) { + decode->op[2].ot = s->prefix & PREFIX_REPZ ? MO_32 : MO_64; + } + } + break; + + case X86_VEX_SSEUnaligned: + /* handled in sse_needs_alignment. */ + break; + + case X86_VEX_AVX2_256: + if ((s->prefix & PREFIX_VEX) && s->vex_l && !has_cpuid_feature(s, X86_FEAT_AVX2)) { + goto illegal; + } + } + + /* TODO: instructions that require VEX.W=0 (Table 2-16) */ + + switch (e->vex_class) { + case 0: + if (s->prefix & PREFIX_VEX) { + goto illegal; + } + return true; + case 1: + case 2: + case 3: + case 4: + case 5: + case 7: + if (s->prefix & PREFIX_VEX) { + if (!(s->flags & HF_AVX_EN_MASK)) { + goto illegal; + } + } else { + if (!(s->flags & HF_OSFXSR_MASK)) { + goto illegal; + } + } + break; + case 12: + /* Must have a VSIB byte and no address prefix. */ + assert(s->has_modrm); + if ((s->modrm & 7) != 4 || s->aflag == MO_16) { + goto illegal; + } + + /* Check no overlap between registers. */ + if (!decode->op[0].has_ea && + (decode->op[0].n == decode->mem.index || decode->op[0].n == decode->op[1].n)) { + goto illegal; + } + assert(!decode->op[1].has_ea); + if (decode->op[1].n == decode->mem.index) { + goto illegal; + } + if (!decode->op[2].has_ea && + (decode->op[2].n == decode->mem.index || decode->op[2].n == decode->op[1].n)) { + goto illegal; + } + /* fall through */ + case 6: + case 11: + if (!(s->prefix & PREFIX_VEX)) { + goto illegal; + } + if (!(s->flags & HF_AVX_EN_MASK)) { + goto illegal; + } + break; + case 8: + /* Non-VEX case handled in decode_0F77. */ + assert(s->prefix & PREFIX_VEX); + if (!(s->flags & HF_AVX_EN_MASK)) { + goto illegal; + } + break; + case 13: + if (!(s->prefix & PREFIX_VEX)) { + goto illegal; + } + if (s->vex_l) { + goto illegal; + } + /* All integer instructions use VEX.vvvv, so exit. */ + return true; + } + + if (s->vex_v != 0 && + e->op0 != X86_TYPE_H && e->op0 != X86_TYPE_B && + e->op1 != X86_TYPE_H && e->op1 != X86_TYPE_B && + e->op2 != X86_TYPE_H && e->op2 != X86_TYPE_B) { + goto illegal; + } + + if (s->flags & HF_TS_MASK) { + goto nm_exception; + } + if (s->flags & HF_EM_MASK) { + goto illegal; + } + return true; + +nm_exception: + gen_NM_exception(s); + return false; +illegal: + gen_illegal_opcode(s); + return false; +} + +static void decode_temp_free(X86DecodedOp *op) +{ + if (op->v_ptr) { + tcg_temp_free_ptr(op->v_ptr); + } +} + +static void decode_temps_free(X86DecodedInsn *decode) +{ + decode_temp_free(&decode->op[0]); + decode_temp_free(&decode->op[1]); + decode_temp_free(&decode->op[2]); +} + +/* + * Convert one instruction. s->base.is_jmp is set if the translation must + * be stopped. + */ +static void disas_insn_new(DisasContext *s, CPUState *cpu, int b) +{ + CPUX86State *env = cpu->env_ptr; + bool first = true; + X86DecodedInsn decode; + X86DecodeFunc decode_func = decode_root; + + s->has_modrm = false; + + next_byte: + if (first) { + first = false; + } else { + b = x86_ldub_code(env, s); + } + /* Collect prefixes. */ + switch (b) { + case 0xf3: + s->prefix |= PREFIX_REPZ; + s->prefix &= ~PREFIX_REPNZ; + goto next_byte; + case 0xf2: + s->prefix |= PREFIX_REPNZ; + s->prefix &= ~PREFIX_REPZ; + goto next_byte; + case 0xf0: + s->prefix |= PREFIX_LOCK; + goto next_byte; + case 0x2e: + s->override = R_CS; + goto next_byte; + case 0x36: + s->override = R_SS; + goto next_byte; + case 0x3e: + s->override = R_DS; + goto next_byte; + case 0x26: + s->override = R_ES; + goto next_byte; + case 0x64: + s->override = R_FS; + goto next_byte; + case 0x65: + s->override = R_GS; + goto next_byte; + case 0x66: + s->prefix |= PREFIX_DATA; + goto next_byte; + case 0x67: + s->prefix |= PREFIX_ADR; + goto next_byte; +#ifdef TARGET_X86_64 + case 0x40 ... 0x4f: + if (CODE64(s)) { + /* REX prefix */ + s->prefix |= PREFIX_REX; + s->vex_w = (b >> 3) & 1; + s->rex_r = (b & 0x4) << 1; + s->rex_x = (b & 0x2) << 2; + s->rex_b = (b & 0x1) << 3; + goto next_byte; + } + break; +#endif + case 0xc5: /* 2-byte VEX */ + case 0xc4: /* 3-byte VEX */ + /* + * VEX prefixes cannot be used except in 32-bit mode. + * Otherwise the instruction is LES or LDS. + */ + if (CODE32(s) && !VM86(s)) { + static const int pp_prefix[4] = { + 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ + }; + int vex3, vex2 = x86_ldub_code(env, s); + + if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) { + /* + * 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b, + * otherwise the instruction is LES or LDS. + */ + s->pc--; /* rewind the advance_pc() x86_ldub_code() did */ + break; + } + + /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */ + if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ + | PREFIX_LOCK | PREFIX_DATA | PREFIX_REX)) { + goto illegal_op; + } +#ifdef TARGET_X86_64 + s->rex_r = (~vex2 >> 4) & 8; +#endif + if (b == 0xc5) { + /* 2-byte VEX prefix: RVVVVlpp, implied 0f leading opcode byte */ + vex3 = vex2; + decode_func = decode_0F; + } else { + /* 3-byte VEX prefix: RXBmmmmm wVVVVlpp */ + vex3 = x86_ldub_code(env, s); +#ifdef TARGET_X86_64 + s->rex_x = (~vex2 >> 3) & 8; + s->rex_b = (~vex2 >> 2) & 8; +#endif + s->vex_w = (vex3 >> 7) & 1; + switch (vex2 & 0x1f) { + case 0x01: /* Implied 0f leading opcode bytes. */ + decode_func = decode_0F; + break; + case 0x02: /* Implied 0f 38 leading opcode bytes. */ + decode_func = decode_0F38; + break; + case 0x03: /* Implied 0f 3a leading opcode bytes. */ + decode_func = decode_0F3A; + break; + default: /* Reserved for future use. */ + goto unknown_op; + } + } + s->vex_v = (~vex3 >> 3) & 0xf; + s->vex_l = (vex3 >> 2) & 1; + s->prefix |= pp_prefix[vex3 & 3] | PREFIX_VEX; + } + break; + default: + if (b >= 0x100) { + b -= 0x100; + decode_func = do_decode_0F; + } + break; + } + + /* Post-process prefixes. */ + if (CODE64(s)) { + /* + * In 64-bit mode, the default data size is 32-bit. Select 64-bit + * data with rex_w, and 16-bit data with 0x66; rex_w takes precedence + * over 0x66 if both are present. + */ + s->dflag = (REX_W(s) ? MO_64 : s->prefix & PREFIX_DATA ? MO_16 : MO_32); + /* In 64-bit mode, 0x67 selects 32-bit addressing. */ + s->aflag = (s->prefix & PREFIX_ADR ? MO_32 : MO_64); + } else { + /* In 16/32-bit mode, 0x66 selects the opposite data size. */ + if (CODE32(s) ^ ((s->prefix & PREFIX_DATA) != 0)) { + s->dflag = MO_32; + } else { + s->dflag = MO_16; + } + /* In 16/32-bit mode, 0x67 selects the opposite addressing. */ + if (CODE32(s) ^ ((s->prefix & PREFIX_ADR) != 0)) { + s->aflag = MO_32; + } else { + s->aflag = MO_16; + } + } + + memset(&decode, 0, sizeof(decode)); + decode.b = b; + if (!decode_insn(s, env, decode_func, &decode)) { + goto illegal_op; + } + if (!decode.e.gen) { + goto unknown_op; + } + + if (!has_cpuid_feature(s, decode.e.cpuid)) { + goto illegal_op; + } + + switch (decode.e.special) { + case X86_SPECIAL_None: + break; + + case X86_SPECIAL_Locked: + if (decode.op[0].has_ea) { + s->prefix |= PREFIX_LOCK; + } + break; + + case X86_SPECIAL_ProtMode: + if (!PE(s) || VM86(s)) { + goto illegal_op; + } + break; + + case X86_SPECIAL_i64: + if (CODE64(s)) { + goto illegal_op; + } + break; + case X86_SPECIAL_o64: + if (!CODE64(s)) { + goto illegal_op; + } + break; + + case X86_SPECIAL_ZExtOp0: + assert(decode.op[0].unit == X86_OP_INT); + if (!decode.op[0].has_ea) { + decode.op[0].ot = MO_32; + } + break; + + case X86_SPECIAL_ZExtOp2: + assert(decode.op[2].unit == X86_OP_INT); + if (!decode.op[2].has_ea) { + decode.op[2].ot = MO_32; + } + break; + + case X86_SPECIAL_AVXExtMov: + if (!decode.op[2].has_ea) { + decode.op[2].ot = s->vex_l ? MO_256 : MO_128; + } else if (s->vex_l) { + decode.op[2].ot++; + } + break; + + case X86_SPECIAL_MMX: + if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA))) { + gen_helper_enter_mmx(cpu_env); + } + break; + } + + if (!validate_vex(s, &decode)) { + return; + } + if (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea) { + gen_load_ea(s, &decode.mem, decode.e.vex_class == 12); + } + if (s->prefix & PREFIX_LOCK) { + if (decode.op[0].unit != X86_OP_INT || !decode.op[0].has_ea) { + goto illegal_op; + } + gen_load(s, &decode, 2, s->T1); + decode.e.gen(s, env, &decode); + } else { + if (decode.op[0].unit == X86_OP_MMX) { + compute_mmx_offset(&decode.op[0]); + } else if (decode.op[0].unit == X86_OP_SSE) { + compute_xmm_offset(&decode.op[0]); + } + gen_load(s, &decode, 1, s->T0); + gen_load(s, &decode, 2, s->T1); + decode.e.gen(s, env, &decode); + gen_writeback(s, &decode, 0, s->T0); + } + decode_temps_free(&decode); + return; + illegal_op: + gen_illegal_opcode(s); + return; + unknown_op: + gen_unknown_opcode(env, s); +} diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h new file mode 100644 index 0000000..f159c26 --- /dev/null +++ b/target/i386/tcg/decode-new.h @@ -0,0 +1,249 @@ +/* + * Decode table flags, mostly based on Intel SDM. + * + * Copyright (c) 2022 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +typedef enum X86OpType { + X86_TYPE_None, + + X86_TYPE_A, /* Implicit */ + X86_TYPE_B, /* VEX.vvvv selects a GPR */ + X86_TYPE_C, /* REG in the modrm byte selects a control register */ + X86_TYPE_D, /* REG in the modrm byte selects a debug register */ + X86_TYPE_E, /* ALU modrm operand */ + X86_TYPE_F, /* EFLAGS/RFLAGS */ + X86_TYPE_G, /* REG in the modrm byte selects a GPR */ + X86_TYPE_H, /* For AVX, VEX.vvvv selects an XMM/YMM register */ + X86_TYPE_I, /* Immediate */ + X86_TYPE_J, /* Relative offset for a jump */ + X86_TYPE_L, /* The upper 4 bits of the immediate select a 128-bit register */ + X86_TYPE_M, /* modrm byte selects a memory operand */ + X86_TYPE_N, /* R/M in the modrm byte selects an MMX register */ + X86_TYPE_O, /* Absolute address encoded in the instruction */ + X86_TYPE_P, /* reg in the modrm byte selects an MMX register */ + X86_TYPE_Q, /* MMX modrm operand */ + X86_TYPE_R, /* R/M in the modrm byte selects a register */ + X86_TYPE_S, /* reg selects a segment register */ + X86_TYPE_U, /* R/M in the modrm byte selects an XMM/YMM register */ + X86_TYPE_V, /* reg in the modrm byte selects an XMM/YMM register */ + X86_TYPE_W, /* XMM/YMM modrm operand */ + X86_TYPE_X, /* string source */ + X86_TYPE_Y, /* string destination */ + + /* Custom */ + X86_TYPE_WM, /* modrm byte selects an XMM/YMM memory operand */ + X86_TYPE_2op, /* 2-operand RMW instruction */ + X86_TYPE_LoBits, /* encoded in bits 0-2 of the operand + REX.B */ + X86_TYPE_0, /* Hard-coded GPRs (RAX..RDI) */ + X86_TYPE_1, + X86_TYPE_2, + X86_TYPE_3, + X86_TYPE_4, + X86_TYPE_5, + X86_TYPE_6, + X86_TYPE_7, + X86_TYPE_ES, /* Hard-coded segment registers */ + X86_TYPE_CS, + X86_TYPE_SS, + X86_TYPE_DS, + X86_TYPE_FS, + X86_TYPE_GS, +} X86OpType; + +typedef enum X86OpSize { + X86_SIZE_None, + + X86_SIZE_a, /* BOUND operand */ + X86_SIZE_b, /* byte */ + X86_SIZE_d, /* 32-bit */ + X86_SIZE_dq, /* SSE/AVX 128-bit */ + X86_SIZE_p, /* Far pointer */ + X86_SIZE_pd, /* SSE/AVX packed double precision */ + X86_SIZE_pi, /* MMX */ + X86_SIZE_ps, /* SSE/AVX packed single precision */ + X86_SIZE_q, /* 64-bit */ + X86_SIZE_qq, /* AVX 256-bit */ + X86_SIZE_s, /* Descriptor */ + X86_SIZE_sd, /* SSE/AVX scalar double precision */ + X86_SIZE_ss, /* SSE/AVX scalar single precision */ + X86_SIZE_si, /* 32-bit GPR */ + X86_SIZE_v, /* 16/32/64-bit, based on operand size */ + X86_SIZE_w, /* 16-bit */ + X86_SIZE_x, /* 128/256-bit, based on operand size */ + X86_SIZE_y, /* 32/64-bit, based on operand size */ + X86_SIZE_z, /* 16-bit for 16-bit operand size, else 32-bit */ + + /* Custom */ + X86_SIZE_d64, + X86_SIZE_f64, +} X86OpSize; + +typedef enum X86CPUIDFeature { + X86_FEAT_None, + X86_FEAT_3DNOW, + X86_FEAT_ADX, + X86_FEAT_AES, + X86_FEAT_AVX, + X86_FEAT_AVX2, + X86_FEAT_BMI1, + X86_FEAT_BMI2, + X86_FEAT_MOVBE, + X86_FEAT_PCLMULQDQ, + X86_FEAT_SSE, + X86_FEAT_SSE2, + X86_FEAT_SSE3, + X86_FEAT_SSSE3, + X86_FEAT_SSE41, + X86_FEAT_SSE42, + X86_FEAT_SSE4A, +} X86CPUIDFeature; + +/* Execution flags */ + +typedef enum X86OpUnit { + X86_OP_SKIP, /* not valid or managed by emission function */ + X86_OP_SEG, /* segment selector */ + X86_OP_CR, /* control register */ + X86_OP_DR, /* debug register */ + X86_OP_INT, /* loaded into/stored from s->T0/T1 */ + X86_OP_IMM, /* immediate */ + X86_OP_SSE, /* address in either s->ptrX or s->A0 depending on has_ea */ + X86_OP_MMX, /* address in either s->ptrX or s->A0 depending on has_ea */ +} X86OpUnit; + +typedef enum X86InsnSpecial { + X86_SPECIAL_None, + + /* Always locked if it has a memory operand (XCHG) */ + X86_SPECIAL_Locked, + + /* Fault outside protected mode */ + X86_SPECIAL_ProtMode, + + /* + * Register operand 0/2 is zero extended to 32 bits. Rd/Mb or Rd/Mw + * in the manual. + */ + X86_SPECIAL_ZExtOp0, + X86_SPECIAL_ZExtOp2, + + /* + * Register operand 2 is extended to full width, while a memory operand + * is doubled in size if VEX.L=1. + */ + X86_SPECIAL_AVXExtMov, + + /* + * MMX instruction exists with no prefix; if there is no prefix, V/H/W/U operands + * become P/P/Q/N, and size "x" becomes "q". + */ + X86_SPECIAL_MMX, + + /* Illegal or exclusive to 64-bit mode */ + X86_SPECIAL_i64, + X86_SPECIAL_o64, +} X86InsnSpecial; + +/* + * Special cases for instructions that operate on XMM/YMM registers. Intel + * retconned all of them to have VEX exception classes other than 0 and 13, so + * all these only matter for instructions that have a VEX exception class. + * Based on tables in the "AVX and SSE Instruction Exception Specification" + * section of the manual. + */ +typedef enum X86VEXSpecial { + /* Legacy SSE instructions that allow unaligned operands */ + X86_VEX_SSEUnaligned, + + /* + * Used for instructions that distinguish the XMM operand type with an + * instruction prefix; legacy SSE encodings will allow unaligned operands + * for scalar operands only (identified by a REP prefix). In this case, + * the decoding table uses "x" for the vector operands instead of specifying + * pd/ps/sd/ss individually. + */ + X86_VEX_REPScalar, + + /* + * VEX instructions that only support 256-bit operands with AVX2 (Table 2-17 + * column 3). Columns 2 and 4 (instructions limited to 256- and 127-bit + * operands respectively) are implicit in the presence of dq and qq + * operands, and thus handled by decode_op_size. + */ + X86_VEX_AVX2_256, +} X86VEXSpecial; + + +typedef struct X86OpEntry X86OpEntry; +typedef struct X86DecodedInsn X86DecodedInsn; + +/* Decode function for multibyte opcodes. */ +typedef void (*X86DecodeFunc)(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b); + +/* Code generation function. */ +typedef void (*X86GenFunc)(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode); + +struct X86OpEntry { + /* Based on the is_decode flags. */ + union { + X86GenFunc gen; + X86DecodeFunc decode; + }; + /* op0 is always written, op1 and op2 are always read. */ + X86OpType op0:8; + X86OpSize s0:8; + X86OpType op1:8; + X86OpSize s1:8; + X86OpType op2:8; + X86OpSize s2:8; + /* Must be I and b respectively if present. */ + X86OpType op3:8; + X86OpSize s3:8; + + X86InsnSpecial special:8; + X86CPUIDFeature cpuid:8; + unsigned vex_class:8; + X86VEXSpecial vex_special:8; + uint16_t valid_prefix:16; + bool is_decode:1; +}; + +typedef struct X86DecodedOp { + int8_t n; + MemOp ot; /* For b/c/d/p/s/q/v/w/y/z */ + X86OpUnit unit; + bool has_ea; + int offset; /* For MMX and SSE */ + + /* + * This field is used internally by macros OP0_PTR/OP1_PTR/OP2_PTR, + * do not access directly! + */ + TCGv_ptr v_ptr; +} X86DecodedOp; + +struct X86DecodedInsn { + X86OpEntry e; + X86DecodedOp op[3]; + target_ulong immediate; + AddressParts mem; + + uint8_t b; +}; + diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc new file mode 100644 index 0000000..27eca59 --- /dev/null +++ b/target/i386/tcg/emit.c.inc @@ -0,0 +1,2234 @@ +/* + * New-style TCG opcode generator for i386 instructions + * + * Copyright (c) 2022 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) + +typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); +typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); +typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b); +typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c); +typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c, TCGv_ptr reg_d); +typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_i32 val); +typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c, TCGv_i32 val); +typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val); +typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c, + TCGv_i32 val); +typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv val); +typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale); + +static inline TCGv_i32 tcg_constant8u_i32(uint8_t val) +{ + return tcg_constant_i32(val); +} + +static void gen_NM_exception(DisasContext *s) +{ + gen_exception(s, EXCP07_PREX); +} + +static void gen_illegal(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_illegal_opcode(s); +} + +static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib) +{ + TCGv ea = gen_lea_modrm_1(s, *mem, is_vsib); + gen_lea_v_seg(s, s->aflag, ea, mem->def_seg, s->override); +} + +static inline int mmx_offset(MemOp ot) +{ + switch (ot) { + case MO_8: + return offsetof(MMXReg, MMX_B(0)); + case MO_16: + return offsetof(MMXReg, MMX_W(0)); + case MO_32: + return offsetof(MMXReg, MMX_L(0)); + case MO_64: + return offsetof(MMXReg, MMX_Q(0)); + default: + g_assert_not_reached(); + } +} + +static inline int xmm_offset(MemOp ot) +{ + switch (ot) { + case MO_8: + return offsetof(ZMMReg, ZMM_B(0)); + case MO_16: + return offsetof(ZMMReg, ZMM_W(0)); + case MO_32: + return offsetof(ZMMReg, ZMM_L(0)); + case MO_64: + return offsetof(ZMMReg, ZMM_Q(0)); + case MO_128: + return offsetof(ZMMReg, ZMM_X(0)); + case MO_256: + return offsetof(ZMMReg, ZMM_Y(0)); + default: + g_assert_not_reached(); + } +} + +static int vector_reg_offset(X86DecodedOp *op) +{ + assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE); + + if (op->unit == X86_OP_MMX) { + return op->offset - mmx_offset(op->ot); + } else { + return op->offset - xmm_offset(op->ot); + } +} + +static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n) +{ + int base_ofs = vector_reg_offset(op); + switch(ot) { + case MO_8: + if (op->unit == X86_OP_MMX) { + return base_ofs + offsetof(MMXReg, MMX_B(n)); + } else { + return base_ofs + offsetof(ZMMReg, ZMM_B(n)); + } + case MO_16: + if (op->unit == X86_OP_MMX) { + return base_ofs + offsetof(MMXReg, MMX_W(n)); + } else { + return base_ofs + offsetof(ZMMReg, ZMM_W(n)); + } + case MO_32: + if (op->unit == X86_OP_MMX) { + return base_ofs + offsetof(MMXReg, MMX_L(n)); + } else { + return base_ofs + offsetof(ZMMReg, ZMM_L(n)); + } + case MO_64: + if (op->unit == X86_OP_MMX) { + return base_ofs; + } else { + return base_ofs + offsetof(ZMMReg, ZMM_Q(n)); + } + case MO_128: + assert(op->unit == X86_OP_SSE); + return base_ofs + offsetof(ZMMReg, ZMM_X(n)); + case MO_256: + assert(op->unit == X86_OP_SSE); + return base_ofs + offsetof(ZMMReg, ZMM_Y(n)); + default: + g_assert_not_reached(); + } +} + +static void compute_mmx_offset(X86DecodedOp *op) +{ + if (!op->has_ea) { + op->offset = offsetof(CPUX86State, fpregs[op->n].mmx) + mmx_offset(op->ot); + } else { + op->offset = offsetof(CPUX86State, mmx_t0) + mmx_offset(op->ot); + } +} + +static void compute_xmm_offset(X86DecodedOp *op) +{ + if (!op->has_ea) { + op->offset = ZMM_OFFSET(op->n) + xmm_offset(op->ot); + } else { + op->offset = offsetof(CPUX86State, xmm_t0) + xmm_offset(op->ot); + } +} + +static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs, bool aligned) +{ + switch(ot) { + case MO_8: + gen_op_ld_v(s, MO_8, temp, s->A0); + tcg_gen_st8_tl(temp, cpu_env, dest_ofs); + break; + case MO_16: + gen_op_ld_v(s, MO_16, temp, s->A0); + tcg_gen_st16_tl(temp, cpu_env, dest_ofs); + break; + case MO_32: + gen_op_ld_v(s, MO_32, temp, s->A0); + tcg_gen_st32_tl(temp, cpu_env, dest_ofs); + break; + case MO_64: + gen_ldq_env_A0(s, dest_ofs); + break; + case MO_128: + gen_ldo_env_A0(s, dest_ofs, aligned); + break; + case MO_256: + gen_ldy_env_A0(s, dest_ofs, aligned); + break; + default: + g_assert_not_reached(); + } +} + +static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode, MemOp ot) +{ + switch (decode->e.vex_class) { + case 2: + case 4: + if ((s->prefix & PREFIX_VEX) || + decode->e.vex_special == X86_VEX_SSEUnaligned) { + /* MOST legacy SSE instructions require aligned memory operands, but not all. */ + return false; + } + /* fall through */ + case 1: + return ot >= MO_128; + + default: + return false; + } +} + +static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v) +{ + X86DecodedOp *op = &decode->op[opn]; + + switch (op->unit) { + case X86_OP_SKIP: + return; + case X86_OP_SEG: + tcg_gen_ld32u_tl(v, cpu_env, + offsetof(CPUX86State,segs[op->n].selector)); + break; + case X86_OP_CR: + tcg_gen_ld_tl(v, cpu_env, offsetof(CPUX86State, cr[op->n])); + break; + case X86_OP_DR: + tcg_gen_ld_tl(v, cpu_env, offsetof(CPUX86State, dr[op->n])); + break; + case X86_OP_INT: + if (op->has_ea) { + gen_op_ld_v(s, op->ot, v, s->A0); + } else { + gen_op_mov_v_reg(s, op->ot, v, op->n); + } + break; + case X86_OP_IMM: + tcg_gen_movi_tl(v, decode->immediate); + break; + + case X86_OP_MMX: + compute_mmx_offset(op); + goto load_vector; + + case X86_OP_SSE: + compute_xmm_offset(op); + load_vector: + if (op->has_ea) { + bool aligned = sse_needs_alignment(s, decode, op->ot); + gen_load_sse(s, v, op->ot, op->offset, aligned); + } + break; + + default: + g_assert_not_reached(); + } +} + +static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn) +{ + X86DecodedOp *op = &decode->op[opn]; + if (op->v_ptr) { + return op->v_ptr; + } + op->v_ptr = tcg_temp_new_ptr(); + + /* The temporary points to the MMXReg or ZMMReg. */ + tcg_gen_addi_ptr(op->v_ptr, cpu_env, vector_reg_offset(op)); + return op->v_ptr; +} + +#define OP_PTR0 op_ptr(decode, 0) +#define OP_PTR1 op_ptr(decode, 1) +#define OP_PTR2 op_ptr(decode, 2) + +static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v) +{ + X86DecodedOp *op = &decode->op[opn]; + switch (op->unit) { + case X86_OP_SKIP: + break; + case X86_OP_SEG: + /* Note that gen_movl_seg_T0 takes care of interrupt shadow and TF. */ + gen_movl_seg_T0(s, op->n); + break; + case X86_OP_INT: + if (op->has_ea) { + gen_op_st_v(s, op->ot, v, s->A0); + } else { + gen_op_mov_reg_v(s, op->ot, op->n, v); + } + break; + case X86_OP_MMX: + break; + case X86_OP_SSE: + if ((s->prefix & PREFIX_VEX) && op->ot == MO_128) { + tcg_gen_gvec_dup_imm(MO_64, + offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)), + 16, 16, 0); + } + break; + case X86_OP_CR: + case X86_OP_DR: + default: + g_assert_not_reached(); + } +} + +static inline int vector_len(DisasContext *s, X86DecodedInsn *decode) +{ + if (decode->e.special == X86_SPECIAL_MMX && + !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { + return 8; + } + return s->vex_l ? 32 : 16; +} + +static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs) +{ + MemOp ot = decode->op[0].ot; + int vec_len = vector_len(s, decode); + bool aligned = sse_needs_alignment(s, decode, ot); + + if (!decode->op[0].has_ea) { + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len); + return; + } + + switch (ot) { + case MO_64: + gen_stq_env_A0(s, src_ofs); + break; + case MO_128: + gen_sto_env_A0(s, src_ofs, aligned); + break; + case MO_256: + gen_sty_env_A0(s, src_ofs, aligned); + break; + default: + g_assert_not_reached(); + } +} + +static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b) +{ + gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b); +} + +#define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1) +static const SSEFunc_0_epp fns_3dnow[] = { + [0x0c] = gen_helper_pi2fw, + [0x0d] = gen_helper_pi2fd, + [0x1c] = gen_helper_pf2iw, + [0x1d] = gen_helper_pf2id, + [0x8a] = gen_helper_pfnacc, + [0x8e] = gen_helper_pfpnacc, + [0x90] = gen_helper_pfcmpge, + [0x94] = gen_helper_pfmin, + [0x96] = gen_helper_pfrcp, + [0x97] = gen_helper_pfrsqrt, + [0x9a] = gen_helper_pfsub, + [0x9e] = gen_helper_pfadd, + [0xa0] = gen_helper_pfcmpgt, + [0xa4] = gen_helper_pfmax, + [0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */ + [0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */ + [0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */ + [0xaa] = gen_helper_pfsubr, + [0xae] = gen_helper_pfacc, + [0xb0] = gen_helper_pfcmpeq, + [0xb4] = gen_helper_pfmul, + [0xb7] = gen_helper_pmulhrw_mmx, + [0xbb] = gen_helper_pswapd, + [0xbf] = gen_helper_pavgusb, +}; + +static void gen_3dnow(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + uint8_t b = decode->immediate; + SSEFunc_0_epp fn = b < ARRAY_SIZE(fns_3dnow) ? fns_3dnow[b] : NULL; + + if (!fn) { + gen_illegal_opcode(s); + return; + } + if (s->flags & HF_TS_MASK) { + gen_NM_exception(s); + return; + } + if (s->flags & HF_EM_MASK) { + gen_illegal_opcode(s); + return; + } + + gen_helper_enter_mmx(cpu_env); + if (fn == FN_3DNOW_MOVE) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset); + } else { + fn(cpu_env, OP_PTR0, OP_PTR1); + } +} + +/* + * 00 = v*ps Vps, Hps, Wpd + * 66 = v*pd Vpd, Hpd, Wps + * f3 = v*ss Vss, Hss, Wps + * f2 = v*sd Vsd, Hsd, Wps + */ +static inline void gen_unary_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm, + SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm, + SSEFunc_0_eppp sd, SSEFunc_0_eppp ss) +{ + if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) { + SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd; + if (!fn) { + gen_illegal_opcode(s); + return; + } + fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } else { + SSEFunc_0_epp ps, pd, fn; + ps = s->vex_l ? ps_ymm : ps_xmm; + pd = s->vex_l ? pd_ymm : pd_xmm; + fn = s->prefix & PREFIX_DATA ? pd : ps; + if (!fn) { + gen_illegal_opcode(s); + return; + } + fn(cpu_env, OP_PTR0, OP_PTR2); + } +} +#define UNARY_FP_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_unary_fp_sse(s, env, decode, \ + gen_helper_##lname##pd_xmm, \ + gen_helper_##lname##ps_xmm, \ + gen_helper_##lname##pd_ymm, \ + gen_helper_##lname##ps_ymm, \ + gen_helper_##lname##sd, \ + gen_helper_##lname##ss); \ +} +UNARY_FP_SSE(VSQRT, sqrt) + +/* + * 00 = v*ps Vps, Hps, Wpd + * 66 = v*pd Vpd, Hpd, Wps + * f3 = v*ss Vss, Hss, Wps + * f2 = v*sd Vsd, Hsd, Wps + */ +static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm, + SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm, + SSEFunc_0_eppp sd, SSEFunc_0_eppp ss) +{ + SSEFunc_0_eppp ps, pd, fn; + if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) { + fn = s->prefix & PREFIX_REPZ ? ss : sd; + } else { + ps = s->vex_l ? ps_ymm : ps_xmm; + pd = s->vex_l ? pd_ymm : pd_xmm; + fn = s->prefix & PREFIX_DATA ? pd : ps; + } + if (fn) { + fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } else { + gen_illegal_opcode(s); + } +} + +#define FP_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_fp_sse(s, env, decode, \ + gen_helper_##lname##pd_xmm, \ + gen_helper_##lname##ps_xmm, \ + gen_helper_##lname##pd_ymm, \ + gen_helper_##lname##ps_ymm, \ + gen_helper_##lname##sd, \ + gen_helper_##lname##ss); \ +} +FP_SSE(VADD, add) +FP_SSE(VMUL, mul) +FP_SSE(VSUB, sub) +FP_SSE(VMIN, min) +FP_SSE(VDIV, div) +FP_SSE(VMAX, max) + +#define FP_UNPACK_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + /* PS maps to the DQ integer instruction, PD maps to QDQ. */ \ + gen_fp_sse(s, env, decode, \ + gen_helper_##lname##qdq_xmm, \ + gen_helper_##lname##dq_xmm, \ + gen_helper_##lname##qdq_ymm, \ + gen_helper_##lname##dq_ymm, \ + NULL, NULL); \ +} +FP_UNPACK_SSE(VUNPCKLPx, punpckl) +FP_UNPACK_SSE(VUNPCKHPx, punpckh) + +/* + * 00 = v*ps Vps, Wpd + * f3 = v*ss Vss, Wps + */ +static inline void gen_unary_fp32_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_epp ps_xmm, + SSEFunc_0_epp ps_ymm, + SSEFunc_0_eppp ss) +{ + if ((s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) != 0) { + goto illegal_op; + } else if (s->prefix & PREFIX_REPZ) { + if (!ss) { + goto illegal_op; + } + ss(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } else { + SSEFunc_0_epp fn = s->vex_l ? ps_ymm : ps_xmm; + if (!fn) { + goto illegal_op; + } + fn(cpu_env, OP_PTR0, OP_PTR2); + } + return; + +illegal_op: + gen_illegal_opcode(s); +} +#define UNARY_FP32_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_unary_fp32_sse(s, env, decode, \ + gen_helper_##lname##ps_xmm, \ + gen_helper_##lname##ps_ymm, \ + gen_helper_##lname##ss); \ +} +UNARY_FP32_SSE(VRSQRT, rsqrt) +UNARY_FP32_SSE(VRCP, rcp) + +/* + * 66 = v*pd Vpd, Hpd, Wpd + * f2 = v*ps Vps, Hps, Wps + */ +static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm, + SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm) +{ + SSEFunc_0_eppp ps, pd, fn; + ps = s->vex_l ? ps_ymm : ps_xmm; + pd = s->vex_l ? pd_ymm : pd_xmm; + fn = s->prefix & PREFIX_DATA ? pd : ps; + fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); +} +#define HORIZONTAL_FP_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_horizontal_fp_sse(s, env, decode, \ + gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm, \ + gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \ +} +HORIZONTAL_FP_SSE(VHADD, hadd) +HORIZONTAL_FP_SSE(VHSUB, hsub) +HORIZONTAL_FP_SSE(VADDSUB, addsub) + +static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm) +{ + SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm; + TCGv_ptr ptr3 = tcg_temp_new_ptr(); + + /* The format of the fourth input is Lx */ + tcg_gen_addi_ptr(ptr3, cpu_env, ZMM_OFFSET(op3)); + fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3); + tcg_temp_free_ptr(ptr3); +} +#define TERNARY_SSE(uname, uvname, lname) \ +static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4, \ + gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \ +} \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_ternary_sse(s, env, decode, 0, \ + gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \ +} +TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps) +TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd) +TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb) + +static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + if (!s->vex_l) { + xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); + } else { + ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); + } +} + +#define BINARY_IMM_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_binary_imm_sse(s, env, decode, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} + +BINARY_IMM_SSE(VBLENDPD, blendpd) +BINARY_IMM_SSE(VBLENDPS, blendps) +BINARY_IMM_SSE(VPBLENDW, pblendw) +BINARY_IMM_SSE(VDDPS, dpps) +#define gen_helper_dppd_ymm NULL +BINARY_IMM_SSE(VDDPD, dppd) +BINARY_IMM_SSE(VMPSADBW, mpsadbw) +BINARY_IMM_SSE(PCLMULQDQ, pclmulqdq) + + +#define UNARY_INT_GVEC(uname, func, ...) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + int vec_len = vector_len(s, decode); \ + \ + func(__VA_ARGS__, decode->op[0].offset, \ + decode->op[2].offset, vec_len, vec_len); \ +} +UNARY_INT_GVEC(PABSB, tcg_gen_gvec_abs, MO_8) +UNARY_INT_GVEC(PABSW, tcg_gen_gvec_abs, MO_16) +UNARY_INT_GVEC(PABSD, tcg_gen_gvec_abs, MO_32) +UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128) +UNARY_INT_GVEC(VPBROADCASTB, tcg_gen_gvec_dup_mem, MO_8) +UNARY_INT_GVEC(VPBROADCASTW, tcg_gen_gvec_dup_mem, MO_16) +UNARY_INT_GVEC(VPBROADCASTD, tcg_gen_gvec_dup_mem, MO_32) +UNARY_INT_GVEC(VPBROADCASTQ, tcg_gen_gvec_dup_mem, MO_64) + + +#define BINARY_INT_GVEC(uname, func, ...) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + int vec_len = vector_len(s, decode); \ + \ + func(__VA_ARGS__, \ + decode->op[0].offset, decode->op[1].offset, \ + decode->op[2].offset, vec_len, vec_len); \ +} + +BINARY_INT_GVEC(PADDB, tcg_gen_gvec_add, MO_8) +BINARY_INT_GVEC(PADDW, tcg_gen_gvec_add, MO_16) +BINARY_INT_GVEC(PADDD, tcg_gen_gvec_add, MO_32) +BINARY_INT_GVEC(PADDQ, tcg_gen_gvec_add, MO_64) +BINARY_INT_GVEC(PADDSB, tcg_gen_gvec_ssadd, MO_8) +BINARY_INT_GVEC(PADDSW, tcg_gen_gvec_ssadd, MO_16) +BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8) +BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16) +BINARY_INT_GVEC(PAND, tcg_gen_gvec_and, MO_64) +BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8) +BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32) +BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16) +BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64) +BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8) +BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16) +BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32) +BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64) +BINARY_INT_GVEC(PMAXSB, tcg_gen_gvec_smax, MO_8) +BINARY_INT_GVEC(PMAXSW, tcg_gen_gvec_smax, MO_16) +BINARY_INT_GVEC(PMAXSD, tcg_gen_gvec_smax, MO_32) +BINARY_INT_GVEC(PMAXUB, tcg_gen_gvec_umax, MO_8) +BINARY_INT_GVEC(PMAXUW, tcg_gen_gvec_umax, MO_16) +BINARY_INT_GVEC(PMAXUD, tcg_gen_gvec_umax, MO_32) +BINARY_INT_GVEC(PMINSB, tcg_gen_gvec_smin, MO_8) +BINARY_INT_GVEC(PMINSW, tcg_gen_gvec_smin, MO_16) +BINARY_INT_GVEC(PMINSD, tcg_gen_gvec_smin, MO_32) +BINARY_INT_GVEC(PMINUB, tcg_gen_gvec_umin, MO_8) +BINARY_INT_GVEC(PMINUW, tcg_gen_gvec_umin, MO_16) +BINARY_INT_GVEC(PMINUD, tcg_gen_gvec_umin, MO_32) +BINARY_INT_GVEC(PMULLW, tcg_gen_gvec_mul, MO_16) +BINARY_INT_GVEC(PMULLD, tcg_gen_gvec_mul, MO_32) +BINARY_INT_GVEC(POR, tcg_gen_gvec_or, MO_64) +BINARY_INT_GVEC(PSUBB, tcg_gen_gvec_sub, MO_8) +BINARY_INT_GVEC(PSUBW, tcg_gen_gvec_sub, MO_16) +BINARY_INT_GVEC(PSUBD, tcg_gen_gvec_sub, MO_32) +BINARY_INT_GVEC(PSUBQ, tcg_gen_gvec_sub, MO_64) +BINARY_INT_GVEC(PSUBSB, tcg_gen_gvec_sssub, MO_8) +BINARY_INT_GVEC(PSUBSW, tcg_gen_gvec_sssub, MO_16) +BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8) +BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16) +BINARY_INT_GVEC(PXOR, tcg_gen_gvec_xor, MO_64) + + +/* + * 00 = p* Pq, Qq (if mmx not NULL; no VEX) + * 66 = vp* Vx, Hx, Wx + * + * These are really the same encoding, because 1) V is the same as P when VEX.V + * is not present 2) P and Q are the same as H and W apart from MM/XMM + */ +static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm) +{ + assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX)); + + if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) { + /* VEX encoding is not applicable to MMX instructions. */ + gen_illegal_opcode(s); + return; + } + if (!(s->prefix & PREFIX_DATA)) { + mmx(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } else if (!s->vex_l) { + xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } else { + ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); + } +} + + +#define BINARY_INT_MMX(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_binary_int_sse(s, env, decode, \ + gen_helper_##lname##_mmx, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} +BINARY_INT_MMX(PUNPCKLBW, punpcklbw) +BINARY_INT_MMX(PUNPCKLWD, punpcklwd) +BINARY_INT_MMX(PUNPCKLDQ, punpckldq) +BINARY_INT_MMX(PACKSSWB, packsswb) +BINARY_INT_MMX(PACKUSWB, packuswb) +BINARY_INT_MMX(PUNPCKHBW, punpckhbw) +BINARY_INT_MMX(PUNPCKHWD, punpckhwd) +BINARY_INT_MMX(PUNPCKHDQ, punpckhdq) +BINARY_INT_MMX(PACKSSDW, packssdw) + +BINARY_INT_MMX(PAVGB, pavgb) +BINARY_INT_MMX(PAVGW, pavgw) +BINARY_INT_MMX(PMADDWD, pmaddwd) +BINARY_INT_MMX(PMULHUW, pmulhuw) +BINARY_INT_MMX(PMULHW, pmulhw) +BINARY_INT_MMX(PMULUDQ, pmuludq) +BINARY_INT_MMX(PSADBW, psadbw) + +BINARY_INT_MMX(PSLLW_r, psllw) +BINARY_INT_MMX(PSLLD_r, pslld) +BINARY_INT_MMX(PSLLQ_r, psllq) +BINARY_INT_MMX(PSRLW_r, psrlw) +BINARY_INT_MMX(PSRLD_r, psrld) +BINARY_INT_MMX(PSRLQ_r, psrlq) +BINARY_INT_MMX(PSRAW_r, psraw) +BINARY_INT_MMX(PSRAD_r, psrad) + +BINARY_INT_MMX(PHADDW, phaddw) +BINARY_INT_MMX(PHADDSW, phaddsw) +BINARY_INT_MMX(PHADDD, phaddd) +BINARY_INT_MMX(PHSUBW, phsubw) +BINARY_INT_MMX(PHSUBSW, phsubsw) +BINARY_INT_MMX(PHSUBD, phsubd) +BINARY_INT_MMX(PMADDUBSW, pmaddubsw) +BINARY_INT_MMX(PSHUFB, pshufb) +BINARY_INT_MMX(PSIGNB, psignb) +BINARY_INT_MMX(PSIGNW, psignw) +BINARY_INT_MMX(PSIGND, psignd) +BINARY_INT_MMX(PMULHRSW, pmulhrsw) + +/* Instructions with no MMX equivalent. */ +#define BINARY_INT_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_binary_int_sse(s, env, decode, \ + NULL, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} + +/* Instructions with no MMX equivalent. */ +BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq) +BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq) +BINARY_INT_SSE(VPACKUSDW, packusdw) +BINARY_INT_SSE(VPERMILPS, vpermilps) +BINARY_INT_SSE(VPERMILPD, vpermilpd) +BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd) +BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq) + +BINARY_INT_SSE(PMULDQ, pmuldq) + +BINARY_INT_SSE(VAESDEC, aesdec) +BINARY_INT_SSE(VAESDECLAST, aesdeclast) +BINARY_INT_SSE(VAESENC, aesenc) +BINARY_INT_SSE(VAESENCLAST, aesenclast) + +#define UNARY_CMP_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + if (!s->vex_l) { \ + gen_helper_##lname##_xmm(cpu_env, OP_PTR1, OP_PTR2); \ + } else { \ + gen_helper_##lname##_ymm(cpu_env, OP_PTR1, OP_PTR2); \ + } \ + set_cc_op(s, CC_OP_EFLAGS); \ +} +UNARY_CMP_SSE(VPTEST, ptest) +UNARY_CMP_SSE(VTESTPS, vtestps) +UNARY_CMP_SSE(VTESTPD, vtestpd) + +static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_epp xmm, SSEFunc_0_epp ymm) +{ + if (!s->vex_l) { + xmm(cpu_env, OP_PTR0, OP_PTR2); + } else { + ymm(cpu_env, OP_PTR0, OP_PTR2); + } +} + +#define UNARY_INT_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_unary_int_sse(s, env, decode, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} + +UNARY_INT_SSE(VPMOVSXBW, pmovsxbw) +UNARY_INT_SSE(VPMOVSXBD, pmovsxbd) +UNARY_INT_SSE(VPMOVSXBQ, pmovsxbq) +UNARY_INT_SSE(VPMOVSXWD, pmovsxwd) +UNARY_INT_SSE(VPMOVSXWQ, pmovsxwq) +UNARY_INT_SSE(VPMOVSXDQ, pmovsxdq) + +UNARY_INT_SSE(VPMOVZXBW, pmovzxbw) +UNARY_INT_SSE(VPMOVZXBD, pmovzxbd) +UNARY_INT_SSE(VPMOVZXBQ, pmovzxbq) +UNARY_INT_SSE(VPMOVZXWD, pmovzxwd) +UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq) +UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq) + +UNARY_INT_SSE(VMOVSLDUP, pmovsldup) +UNARY_INT_SSE(VMOVSHDUP, pmovshdup) +UNARY_INT_SSE(VMOVDDUP, pmovdldup) + +UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd) +UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq) +UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq) +UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps) +UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq) +UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq) + + +static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + if (!s->vex_l) { + xmm(OP_PTR0, OP_PTR1, imm); + } else { + ymm(OP_PTR0, OP_PTR1, imm); + } +} + +#define UNARY_IMM_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_unary_imm_sse(s, env, decode, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} + +UNARY_IMM_SSE(PSHUFD, pshufd) +UNARY_IMM_SSE(PSHUFHW, pshufhw) +UNARY_IMM_SSE(PSHUFLW, pshuflw) +#define gen_helper_vpermq_xmm NULL +UNARY_IMM_SSE(VPERMQ, vpermq) +UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm) +UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm) + +static inline void gen_unary_imm_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + if (!s->vex_l) { + xmm(cpu_env, OP_PTR0, OP_PTR1, imm); + } else { + ymm(cpu_env, OP_PTR0, OP_PTR1, imm); + } +} + +#define UNARY_IMM_FP_SSE(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_unary_imm_fp_sse(s, env, decode, \ + gen_helper_##lname##_xmm, \ + gen_helper_##lname##_ymm); \ +} + +UNARY_IMM_FP_SSE(VROUNDPS, roundps) +UNARY_IMM_FP_SSE(VROUNDPD, roundpd) + +static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm, + SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm) +{ + SSEFunc_0_eppp d = s->vex_l ? d_ymm : d_xmm; + SSEFunc_0_eppp q = s->vex_l ? q_ymm : q_xmm; + SSEFunc_0_eppp fn = s->vex_w ? q : d; + fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); +} + +/* VEX.W affects whether to operate on 32- or 64-bit elements. */ +#define VEXW_AVX(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_vexw_avx(s, env, decode, \ + gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \ + gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \ +} +VEXW_AVX(VPSLLV, vpsllv) +VEXW_AVX(VPSRLV, vpsrlv) +VEXW_AVX(VPSRAV, vpsrav) +VEXW_AVX(VPMASKMOV, vpmaskmov) + +/* Same as above, but with extra arguments to the helper. */ +static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm, + SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm) +{ + SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm; + SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm; + SSEFunc_0_epppti fn = s->vex_w ? q : d; + TCGv_i32 scale = tcg_constant_i32(decode->mem.scale); + TCGv_ptr index = tcg_temp_new_ptr(); + + /* Pass third input as (index, base, scale) */ + tcg_gen_addi_ptr(index, cpu_env, ZMM_OFFSET(decode->mem.index)); + fn(cpu_env, OP_PTR0, OP_PTR1, index, s->A0, scale); + + /* + * There are two output operands, so zero OP1's high 128 bits + * in the VEX.128 case. + */ + if (!s->vex_l) { + int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1); + tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0); + } + tcg_temp_free_ptr(index); +} +#define VSIB_AVX(uname, lname) \ +static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + gen_vsib_avx(s, env, decode, \ + gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \ + gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \ +} +VSIB_AVX(VPGATHERD, vpgatherd) +VSIB_AVX(VPGATHERQ, vpgatherq) + +static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op) +{ + TCGv carry_in = NULL; + TCGv carry_out = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2); + TCGv zero; + + if (cc_op == s->cc_op || s->cc_op == CC_OP_ADCOX) { + /* Re-use the carry-out from a previous round. */ + carry_in = carry_out; + cc_op = s->cc_op; + } else if (s->cc_op == CC_OP_ADCX || s->cc_op == CC_OP_ADOX) { + /* Merge with the carry-out from the opposite instruction. */ + cc_op = CC_OP_ADCOX; + } + + /* If we don't have a carry-in, get it out of EFLAGS. */ + if (!carry_in) { + if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) { + gen_compute_eflags(s); + } + carry_in = s->tmp0; + tcg_gen_extract_tl(carry_in, cpu_cc_src, + ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1); + } + + switch (ot) { +#ifdef TARGET_X86_64 + case MO_32: + /* If TL is 64-bit just do everything in 64-bit arithmetic. */ + tcg_gen_add_i64(s->T0, s->T0, s->T1); + tcg_gen_add_i64(s->T0, s->T0, carry_in); + tcg_gen_shri_i64(carry_out, s->T0, 32); + break; +#endif + default: + zero = tcg_constant_tl(0); + tcg_gen_add2_tl(s->T0, carry_out, s->T0, zero, carry_in, zero); + tcg_gen_add2_tl(s->T0, carry_out, s->T0, carry_out, s->T1, zero); + break; + } + set_cc_op(s, cc_op); +} + +static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADCX); +} + +static void gen_ADOX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADOX); +} + +static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + tcg_gen_andc_tl(s->T0, s->T1, s->T0); + gen_op_update1_cc(s); + set_cc_op(s, CC_OP_LOGICB + ot); +} + +static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + TCGv bound, zero; + + /* + * Extract START, and shift the operand. + * Shifts larger than operand size get zeros. + */ + tcg_gen_ext8u_tl(s->A0, s->T1); + tcg_gen_shr_tl(s->T0, s->T0, s->A0); + + bound = tcg_constant_tl(ot == MO_64 ? 63 : 31); + zero = tcg_constant_tl(0); + tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero); + + /* + * Extract the LEN into a mask. Lengths larger than + * operand size get all ones. + */ + tcg_gen_extract_tl(s->A0, s->T1, 8, 8); + tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->A0, bound, s->A0, bound); + + tcg_gen_movi_tl(s->T1, 1); + tcg_gen_shl_tl(s->T1, s->T1, s->A0); + tcg_gen_subi_tl(s->T1, s->T1, 1); + tcg_gen_and_tl(s->T0, s->T0, s->T1); + + gen_op_update1_cc(s); + set_cc_op(s, CC_OP_LOGICB + ot); +} + +static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + tcg_gen_neg_tl(s->T1, s->T0); + tcg_gen_and_tl(s->T0, s->T0, s->T1); + tcg_gen_mov_tl(cpu_cc_dst, s->T0); + set_cc_op(s, CC_OP_BMILGB + ot); +} + +static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + tcg_gen_subi_tl(s->T1, s->T0, 1); + tcg_gen_xor_tl(s->T0, s->T0, s->T1); + tcg_gen_mov_tl(cpu_cc_dst, s->T0); + set_cc_op(s, CC_OP_BMILGB + ot); +} + +static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + tcg_gen_subi_tl(s->T1, s->T0, 1); + tcg_gen_and_tl(s->T0, s->T0, s->T1); + tcg_gen_mov_tl(cpu_cc_dst, s->T0); + set_cc_op(s, CC_OP_BMILGB + ot); +} + +static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + TCGv bound; + + tcg_gen_ext8u_tl(s->T1, cpu_regs[s->vex_v]); + bound = tcg_constant_tl(ot == MO_64 ? 63 : 31); + + /* + * Note that since we're using BMILG (in order to get O + * cleared) we need to store the inverse into C. + */ + tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src, s->T1, bound); + tcg_gen_movcond_tl(TCG_COND_GT, s->T1, s->T1, bound, bound, s->T1); + + tcg_gen_movi_tl(s->A0, -1); + tcg_gen_shl_tl(s->A0, s->A0, s->T1); + tcg_gen_andc_tl(s->T0, s->T0, s->A0); + + gen_op_update1_cc(s); + set_cc_op(s, CC_OP_BMILGB + ot); +} + +static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[2].ot; + + tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); + gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot)); +} + +static void gen_CVTPI2Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_enter_mmx(cpu_env); + if (s->prefix & PREFIX_DATA) { + gen_helper_cvtpi2pd(cpu_env, OP_PTR0, OP_PTR2); + } else { + gen_helper_cvtpi2ps(cpu_env, OP_PTR0, OP_PTR2); + } +} + +static void gen_CVTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_enter_mmx(cpu_env); + if (s->prefix & PREFIX_DATA) { + gen_helper_cvtpd2pi(cpu_env, OP_PTR0, OP_PTR2); + } else { + gen_helper_cvtps2pi(cpu_env, OP_PTR0, OP_PTR2); + } +} + +static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_enter_mmx(cpu_env); + if (s->prefix & PREFIX_DATA) { + gen_helper_cvttpd2pi(cpu_env, OP_PTR0, OP_PTR2); + } else { + gen_helper_cvttps2pi(cpu_env, OP_PTR0, OP_PTR2); + } +} + +static void gen_EMMS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_emms(cpu_env); +} + +static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 length = tcg_constant_i32(decode->immediate & 63); + TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63); + + gen_helper_extrq_i(cpu_env, OP_PTR0, index, length); +} + +static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_extrq_r(cpu_env, OP_PTR0, OP_PTR2); +} + +static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 length = tcg_constant_i32(decode->immediate & 63); + TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63); + + gen_helper_insertq_i(cpu_env, OP_PTR0, OP_PTR1, index, length); +} + +static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_insertq_r(cpu_env, OP_PTR0, OP_PTR2); +} + +static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + if (s->vex_l) { + gen_illegal_opcode(s); + return; + } + tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T1); + gen_helper_ldmxcsr(cpu_env, s->tmp2_i32); +} + +static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]); + gen_extu(s->aflag, s->A0); + gen_add_A0_ds_seg(s); + + if (s->prefix & PREFIX_DATA) { + gen_helper_maskmov_xmm(cpu_env, OP_PTR1, OP_PTR2, s->A0); + } else { + gen_helper_maskmov_mmx(cpu_env, OP_PTR1, OP_PTR2, s->A0); + } +} + +static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + /* M operand type does not load/store */ + if (decode->e.op0 == X86_TYPE_M) { + tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, ot | MO_BE); + } else { + tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, ot | MO_BE); + } +} + +static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[2].ot; + + switch (ot) { + case MO_32: +#ifdef TARGET_X86_64 + tcg_gen_ld32u_tl(s->T0, cpu_env, decode->op[2].offset); + break; + case MO_64: +#endif + tcg_gen_ld_tl(s->T0, cpu_env, decode->op[2].offset); + break; + default: + abort(); + } +} + +static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[2].ot; + int vec_len = vector_len(s, decode); + int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0); + + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + + switch (ot) { + case MO_32: +#ifdef TARGET_X86_64 + tcg_gen_st32_tl(s->T1, cpu_env, lo_ofs); + break; + case MO_64: +#endif + tcg_gen_st_tl(s->T1, cpu_env, lo_ofs); + break; + default: + g_assert_not_reached(); + } +} + +static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_store_sse(s, decode, decode->op[2].offset); +} + +static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + typeof(gen_helper_movmskps_ymm) *ps, *pd, *fn; + ps = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm; + pd = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm; + fn = s->prefix & PREFIX_DATA ? pd : ps; + fn(s->tmp2_i32, cpu_env, OP_PTR2); + tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32); +} + +static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0); + + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset); + if (decode->op[0].has_ea) { + tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ); + } else { + /* + * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would + * seem to work, but it does not on big-endian platforms; the cleared parts + * are always at higher addresses, but cross-endian emulation inverts the + * byte order so that the cleared parts need to be at *lower* addresses. + * Because oprsz is 8, we see this here even for SSE; but more in general, + * it disqualifies using oprsz < maxsz to emulate VEX128. + */ + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs); + } +} + +static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_helper_enter_mmx(cpu_env); + /* Otherwise the same as any other movq. */ + return gen_MOVQ(s, env, decode); +} + +static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + + /* low part of result in VEX.vvvv, high in MODRM */ + switch (ot) { + default: + tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); + tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1); + tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32, + s->tmp2_i32, s->tmp3_i32); + tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32); + tcg_gen_extu_i32_tl(s->T0, s->tmp3_i32); + break; +#ifdef TARGET_X86_64 + case MO_64: + tcg_gen_mulu2_i64(cpu_regs[s->vex_v], s->T0, s->T0, s->T1); + break; +#endif + } + +} + +static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + if (!(s->prefix & PREFIX_DATA)) { + gen_helper_palignr_mmx(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); + } else if (!s->vex_l) { + gen_helper_palignr_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); + } else { + gen_helper_palignr_ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); + } +} + +static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + /* Careful, operand order is reversed! */ + tcg_gen_gvec_andc(MO_64, + decode->op[0].offset, decode->op[2].offset, + decode->op[1].offset, vec_len, vec_len); +} + +static void gen_PCMPESTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + gen_helper_pcmpestri_xmm(cpu_env, OP_PTR1, OP_PTR2, imm); + set_cc_op(s, CC_OP_EFLAGS); +} + +static void gen_PCMPESTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + gen_helper_pcmpestrm_xmm(cpu_env, OP_PTR1, OP_PTR2, imm); + set_cc_op(s, CC_OP_EFLAGS); + if ((s->prefix & PREFIX_VEX) && !s->vex_l) { + tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)), + 16, 16, 0); + } +} + +static void gen_PCMPISTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + gen_helper_pcmpistri_xmm(cpu_env, OP_PTR1, OP_PTR2, imm); + set_cc_op(s, CC_OP_EFLAGS); +} + +static void gen_PCMPISTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + gen_helper_pcmpistrm_xmm(cpu_env, OP_PTR1, OP_PTR2, imm); + set_cc_op(s, CC_OP_EFLAGS); + if ((s->prefix & PREFIX_VEX) && !s->vex_l) { + tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)), + 16, 16, 0); + } +} + +static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[1].ot; + if (ot < MO_64) { + tcg_gen_ext32u_tl(s->T0, s->T0); + } + gen_helper_pdep(s->T0, s->T0, s->T1); +} + +static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[1].ot; + if (ot < MO_64) { + tcg_gen_ext32u_tl(s->T0, s->T0); + } + gen_helper_pext(s->T0, s->T0, s->T1); +} + +static inline void gen_pextr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot) +{ + int vec_len = vector_len(s, decode); + int mask = (vec_len >> ot) - 1; + int val = decode->immediate & mask; + + switch (ot) { + case MO_8: + tcg_gen_ld8u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, val)); + break; + case MO_16: + tcg_gen_ld16u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, val)); + break; + case MO_32: +#ifdef TARGET_X86_64 + tcg_gen_ld32u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, val)); + break; + case MO_64: +#endif + tcg_gen_ld_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, val)); + break; + default: + abort(); + } +} + +static void gen_PEXTRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pextr(s, env, decode, MO_8); +} + +static void gen_PEXTRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pextr(s, env, decode, MO_16); +} + +static void gen_PEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + gen_pextr(s, env, decode, ot); +} + +static inline void gen_pinsr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot) +{ + int vec_len = vector_len(s, decode); + int mask = (vec_len >> ot) - 1; + int val = decode->immediate & mask; + + if (decode->op[1].offset != decode->op[0].offset) { + assert(vec_len == 16); + gen_store_sse(s, decode, decode->op[1].offset); + } + + switch (ot) { + case MO_8: + tcg_gen_st8_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, val)); + break; + case MO_16: + tcg_gen_st16_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, val)); + break; + case MO_32: +#ifdef TARGET_X86_64 + tcg_gen_st32_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, val)); + break; + case MO_64: +#endif + tcg_gen_st_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, val)); + break; + default: + abort(); + } +} + +static void gen_PINSRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pinsr(s, env, decode, MO_8); +} + +static void gen_PINSRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pinsr(s, env, decode, MO_16); +} + +static void gen_PINSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pinsr(s, env, decode, decode->op[2].ot); +} + +static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_andi_i64(d, s, 0x8080808080808080ull); + + /* + * After each shift+or pair: + * 0: a.......b.......c.......d.......e.......f.......g.......h....... + * 7: ab......bc......cd......de......ef......fg......gh......h....... + * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h....... + * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h....... + * The result is left in the high bits of the word. + */ + tcg_gen_shli_i64(t, d, 7); + tcg_gen_or_i64(d, d, t); + tcg_gen_shli_i64(t, d, 14); + tcg_gen_or_i64(d, d, t); + tcg_gen_shli_i64(t, d, 28); + tcg_gen_or_i64(d, d, t); +} + +static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80); + + /* See above */ + tcg_gen_and_vec(vece, d, s, m); + tcg_gen_shli_vec(vece, t, d, 7); + tcg_gen_or_vec(vece, d, d, t); + tcg_gen_shli_vec(vece, t, d, 14); + tcg_gen_or_vec(vece, d, d, t); + tcg_gen_shli_vec(vece, t, d, 28); + tcg_gen_or_vec(vece, d, d, t); +} + +#ifdef TARGET_X86_64 +#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i64 +#else +#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i32 +#endif + +static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; + static const GVecGen2 g = { + .fni8 = gen_pmovmskb_i64, + .fniv = gen_pmovmskb_vec, + .opt_opc = vecop_list, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 + }; + MemOp ot = decode->op[2].ot; + int vec_len = vector_len(s, decode); + TCGv t = tcg_temp_new(); + + tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset, + vec_len, vec_len, &g); + tcg_gen_ld8u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1))); + while (vec_len > 8) { + vec_len -= 8; + if (TCG_TARGET_HAS_extract2_tl) { + /* + * Load the next byte of the result into the high byte of T. + * TCG does a similar expansion of deposit to shl+extract2; by + * loading the whole word, the shift left is avoided. + */ +#ifdef TARGET_X86_64 + tcg_gen_ld_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8))); +#else + tcg_gen_ld_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4))); +#endif + + tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8); + } else { + /* + * The _previous_ value is deposited into bits 8 and higher of t. Because + * those bits are known to be zero after ld8u, this becomes a shift+or + * if deposit is not available. + */ + tcg_gen_ld8u_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1))); + tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8); + } + } + tcg_temp_free(t); +} + +static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm); +} + +static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 16) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shri(MO_16, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 16) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shli(MO_16, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 16) { + decode->immediate = 15; + } + tcg_gen_gvec_sari(MO_16, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); +} + +static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 32) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shri(MO_32, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 32) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shli(MO_32, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 32) { + decode->immediate = 31; + } + tcg_gen_gvec_sari(MO_32, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); +} + +static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 64) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shri(MO_64, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + if (decode->immediate >= 64) { + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else { + tcg_gen_gvec_shli(MO_64, + decode->op[0].offset, decode->op[1].offset, + decode->immediate, vec_len, vec_len); + } +} + +static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len) +{ + MemOp ot = vec_len == 16 ? MO_128 : MO_256; + TCGv_i32 imm_v = tcg_constant8u_i32(imm); + TCGv_ptr ptr = tcg_temp_new_ptr(); + + tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), + vec_len, vec_len, 0); + + tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_t0)); + tcg_gen_st_i32(imm_v, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0))); + return ptr; +} + +static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len); + + if (s->vex_l) { + gen_helper_psrldq_ymm(cpu_env, OP_PTR0, OP_PTR1, imm_vec); + } else { + gen_helper_psrldq_xmm(cpu_env, OP_PTR0, OP_PTR1, imm_vec); + } + tcg_temp_free_ptr(imm_vec); +} + +static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len); + + if (s->vex_l) { + gen_helper_pslldq_ymm(cpu_env, OP_PTR0, OP_PTR1, imm_vec); + } else { + gen_helper_pslldq_xmm(cpu_env, OP_PTR0, OP_PTR1, imm_vec); + } + tcg_temp_free_ptr(imm_vec); +} + +static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + int b = decode->immediate; + + if (ot == MO_64) { + tcg_gen_rotri_tl(s->T0, s->T0, b & 63); + } else { + tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); + tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b & 31); + tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32); + } +} + +static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + int mask; + + mask = ot == MO_64 ? 63 : 31; + tcg_gen_andi_tl(s->T1, s->T1, mask); + if (ot != MO_64) { + tcg_gen_ext32s_tl(s->T0, s->T0); + } + tcg_gen_sar_tl(s->T0, s->T0, s->T1); +} + +static void gen_SHLX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + int mask; + + mask = ot == MO_64 ? 63 : 31; + tcg_gen_andi_tl(s->T1, s->T1, mask); + tcg_gen_shl_tl(s->T0, s->T0, s->T1); +} + +static void gen_SHRX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + int mask; + + mask = ot == MO_64 ? 63 : 31; + tcg_gen_andi_tl(s->T1, s->T1, mask); + if (ot != MO_64) { + tcg_gen_ext32u_tl(s->T0, s->T0); + } + tcg_gen_shr_tl(s->T0, s->T0, s->T1); +} + +static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + assert(!s->vex_l); + gen_helper_aeskeygenassist_xmm(cpu_env, OP_PTR0, OP_PTR1, imm); +} + +static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + if (s->vex_l) { + gen_illegal_opcode(s); + return; + } + gen_helper_update_mxcsr(cpu_env); + tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, mxcsr)); +} + +static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + assert(!s->vex_l); + gen_helper_aesimc_xmm(cpu_env, OP_PTR0, OP_PTR2); +} + +/* + * 00 = v*ps Vps, Hps, Wpd + * 66 = v*pd Vpd, Hpd, Wps + * f3 = v*ss Vss, Hss, Wps + * f2 = v*sd Vsd, Hsd, Wps + */ +#define SSE_CMP(x) { \ + gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \ + gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \ + gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm} +static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = { + SSE_CMP(cmpeq), + SSE_CMP(cmplt), + SSE_CMP(cmple), + SSE_CMP(cmpunord), + SSE_CMP(cmpneq), + SSE_CMP(cmpnlt), + SSE_CMP(cmpnle), + SSE_CMP(cmpord), + + SSE_CMP(cmpequ), + SSE_CMP(cmpnge), + SSE_CMP(cmpngt), + SSE_CMP(cmpfalse), + SSE_CMP(cmpnequ), + SSE_CMP(cmpge), + SSE_CMP(cmpgt), + SSE_CMP(cmptrue), + + SSE_CMP(cmpeqs), + SSE_CMP(cmpltq), + SSE_CMP(cmpleq), + SSE_CMP(cmpunords), + SSE_CMP(cmpneqq), + SSE_CMP(cmpnltq), + SSE_CMP(cmpnleq), + SSE_CMP(cmpords), + + SSE_CMP(cmpequs), + SSE_CMP(cmpngeq), + SSE_CMP(cmpngtq), + SSE_CMP(cmpfalses), + SSE_CMP(cmpnequs), + SSE_CMP(cmpgeq), + SSE_CMP(cmpgtq), + SSE_CMP(cmptrues), +}; +#undef SSE_CMP + +static void gen_VCMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7); + int b = + s->prefix & PREFIX_REPZ ? 2 /* ss */ : + s->prefix & PREFIX_REPNZ ? 3 /* sd */ : + !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2); + + gen_helper_cmp_funcs[index][b](cpu_env, OP_PTR0, OP_PTR1, OP_PTR2); +} + +static void gen_VCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + SSEFunc_0_epp fn; + fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss; + fn(cpu_env, OP_PTR1, OP_PTR2); + set_cc_op(s, CC_OP_EFLAGS); +} + +static void gen_VCVTfp2fp(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_unary_fp_sse(s, env, decode, + gen_helper_cvtpd2ps_xmm, gen_helper_cvtps2pd_xmm, + gen_helper_cvtpd2ps_ymm, gen_helper_cvtps2pd_ymm, + gen_helper_cvtsd2ss, gen_helper_cvtss2sd); +} + +static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + TCGv_i32 in; + + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len); + +#ifdef TARGET_X86_64 + MemOp ot = decode->op[2].ot; + if (ot == MO_64) { + if (s->prefix & PREFIX_REPNZ) { + gen_helper_cvtsq2sd(cpu_env, OP_PTR0, s->T1); + } else { + gen_helper_cvtsq2ss(cpu_env, OP_PTR0, s->T1); + } + return; + } + in = s->tmp2_i32; + tcg_gen_trunc_tl_i32(in, s->T1); +#else + in = s->T1; +#endif + + if (s->prefix & PREFIX_REPNZ) { + gen_helper_cvtsi2sd(cpu_env, OP_PTR0, in); + } else { + gen_helper_cvtsi2ss(cpu_env, OP_PTR0, in); + } +} + +static inline void gen_VCVTtSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq, + SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq) +{ + TCGv_i32 out; + +#ifdef TARGET_X86_64 + MemOp ot = decode->op[0].ot; + if (ot == MO_64) { + if (s->prefix & PREFIX_REPNZ) { + sd2sq(s->T0, cpu_env, OP_PTR2); + } else { + ss2sq(s->T0, cpu_env, OP_PTR2); + } + return; + } + + out = s->tmp2_i32; +#else + out = s->T0; +#endif + if (s->prefix & PREFIX_REPNZ) { + sd2si(out, cpu_env, OP_PTR2); + } else { + ss2si(out, cpu_env, OP_PTR2); + } +#ifdef TARGET_X86_64 + tcg_gen_extu_i32_tl(s->T0, out); +#endif +} + +#ifndef TARGET_X86_64 +#define gen_helper_cvtss2sq NULL +#define gen_helper_cvtsd2sq NULL +#define gen_helper_cvttss2sq NULL +#define gen_helper_cvttsd2sq NULL +#endif + +static void gen_VCVTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_VCVTtSx2SI(s, env, decode, + gen_helper_cvtss2si, gen_helper_cvtss2sq, + gen_helper_cvtsd2si, gen_helper_cvtsd2sq); +} + +static void gen_VCVTTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_VCVTtSx2SI(s, env, decode, + gen_helper_cvttss2si, gen_helper_cvttss2sq, + gen_helper_cvttsd2si, gen_helper_cvttsd2sq); +} + +static void gen_VEXTRACTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int mask = decode->immediate & 1; + int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask); + if (decode->op[0].has_ea) { + /* VEX-only instruction, no alignment requirements. */ + gen_sto_env_A0(s, src_ofs, false); + } else { + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16); + } +} + +static void gen_VEXTRACTPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pextr(s, env, decode, MO_32); +} + +static void gen_vinsertps(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int val = decode->immediate; + int dest_word = (val >> 4) & 3; + int new_mask = (val & 15) | (1 << dest_word); + int vec_len = 16; + + assert(!s->vex_l); + + if (new_mask == 15) { + /* All zeroes except possibly for the inserted element */ + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + } else if (decode->op[1].offset != decode->op[0].offset) { + gen_store_sse(s, decode, decode->op[1].offset); + } + + if (new_mask != (val & 15)) { + tcg_gen_st_i32(s->tmp2_i32, cpu_env, + vector_elem_offset(&decode->op[0], MO_32, dest_word)); + } + + if (new_mask != 15) { + TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */ + int i; + for (i = 0; i < 4; i++) { + if ((val >> i) & 1) { + tcg_gen_st_i32(zero, cpu_env, + vector_elem_offset(&decode->op[0], MO_32, i)); + } + } + } +} + +static void gen_VINSERTPS_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int val = decode->immediate; + tcg_gen_ld_i32(s->tmp2_i32, cpu_env, + vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3)); + gen_vinsertps(s, env, decode); +} + +static void gen_VINSERTPS_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL); + gen_vinsertps(s, env, decode); +} + +static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int mask = decode->immediate & 1; + tcg_gen_gvec_mov(MO_64, + decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)), + decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16); + tcg_gen_gvec_mov(MO_64, + decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)), + decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16); +} + +static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, + SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm) +{ + if (!s->vex_l) { + xmm(cpu_env, OP_PTR2, OP_PTR1, s->A0); + } else { + ymm(cpu_env, OP_PTR2, OP_PTR1, s->A0); + } +} + +static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_maskmov(s, env, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm); +} + +static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm); +} + +static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1))); + if (decode->op[0].offset != decode->op[1].offset) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0))); + } +} + +static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1))); +} + +static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + if (decode->op[0].offset != decode->op[2].offset) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1))); + } + if (decode->op[0].offset != decode->op[1].offset) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0))); + } +} + +static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0))); + if (decode->op[0].offset != decode->op[1].offset) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1))); + } +} + +static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1))); + if (decode->op[0].offset != decode->op[1].offset) { + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0))); + } +} + +/* + * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOXHPx. + * Use a gvec move to move everything above the bottom 64 bits. + */ + +static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0))); +} + +static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ); + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len); + tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0))); +} + +static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ); +} + +static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i64 zero = tcg_constant_i64(0); + + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ); + tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1))); + tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0))); +} + +static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0))); + tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len); + tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0))); +} + +static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int vec_len = vector_len(s, decode); + + tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL); + tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0); + tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0))); +} + +static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0))); + tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL); +} + +static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + if (s->vex_w) { + gen_VMASKMOVPD_st(s, env, decode); + } else { + gen_VMASKMOVPS_st(s, env, decode); + } +} + +static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + assert(s->vex_l); + gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2); +} + +static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + assert(s->vex_l); + gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm); +} + +static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + assert(!s->vex_l); + gen_helper_phminposuw_xmm(cpu_env, OP_PTR0, OP_PTR2); +} + +static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + assert(!s->vex_l); + gen_helper_roundsd_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); +} + +static void gen_VROUNDSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant8u_i32(decode->immediate); + assert(!s->vex_l); + gen_helper_roundss_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm); +} + +static void gen_VSHUF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_i32 imm = tcg_constant_i32(decode->immediate); + SSEFunc_0_pppi ps, pd, fn; + ps = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm; + pd = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm; + fn = s->prefix & PREFIX_DATA ? pd : ps; + fn(OP_PTR0, OP_PTR1, OP_PTR2, imm); +} + +static void gen_VUCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + SSEFunc_0_epp fn; + fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss; + fn(cpu_env, OP_PTR1, OP_PTR2); + set_cc_op(s, CC_OP_EFLAGS); +} + +static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGv_ptr ptr = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_t0)); + gen_helper_memset(ptr, ptr, tcg_constant_i32(0), + tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg))); + tcg_temp_free_ptr(ptr); +} + +static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + int i; + + for (i = 0; i < CPU_NB_REGS; i++) { + int offset = offsetof(CPUX86State, xmm_regs[i].ZMM_X(1)); + tcg_gen_gvec_dup_imm(MO_64, offset, 16, 16, 0); + } +} diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index ad58931..a6a90a1 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -2559,6 +2559,22 @@ static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra) } } +static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra) +{ + int i, nb_xmm_regs; + + if (env->hflags & HF_CS64_MASK) { + nb_xmm_regs = 16; + } else { + nb_xmm_regs = 8; + } + + for (i = 0; i < nb_xmm_regs; i++, ptr += 16) { + cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra); + cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra); + } +} + static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra) { target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs); @@ -2651,6 +2667,9 @@ static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm, if (opt & XSTATE_SSE_MASK) { do_xsave_sse(env, ptr, ra); } + if (opt & XSTATE_YMM_MASK) { + do_xsave_ymmh(env, ptr + XO(avx_state), ra); + } if (opt & XSTATE_BNDREGS_MASK) { do_xsave_bndregs(env, ptr + XO(bndreg_state), ra); } @@ -2725,6 +2744,54 @@ static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra) } } +static void do_clear_sse(CPUX86State *env) +{ + int i, nb_xmm_regs; + + if (env->hflags & HF_CS64_MASK) { + nb_xmm_regs = 16; + } else { + nb_xmm_regs = 8; + } + + for (i = 0; i < nb_xmm_regs; i++) { + env->xmm_regs[i].ZMM_Q(0) = 0; + env->xmm_regs[i].ZMM_Q(1) = 0; + } +} + +static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra) +{ + int i, nb_xmm_regs; + + if (env->hflags & HF_CS64_MASK) { + nb_xmm_regs = 16; + } else { + nb_xmm_regs = 8; + } + + for (i = 0; i < nb_xmm_regs; i++, ptr += 16) { + env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra); + env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra); + } +} + +static void do_clear_ymmh(CPUX86State *env) +{ + int i, nb_xmm_regs; + + if (env->hflags & HF_CS64_MASK) { + nb_xmm_regs = 16; + } else { + nb_xmm_regs = 8; + } + + for (i = 0; i < nb_xmm_regs; i++) { + env->xmm_regs[i].ZMM_Q(2) = 0; + env->xmm_regs[i].ZMM_Q(3) = 0; + } +} + static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra) { target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs); @@ -2831,9 +2898,14 @@ static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr if (xstate_bv & XSTATE_SSE_MASK) { do_xrstor_sse(env, ptr, ra); } else { - /* ??? When AVX is implemented, we may have to be more - selective in the clearing. */ - memset(env->xmm_regs, 0, sizeof(env->xmm_regs)); + do_clear_sse(env); + } + } + if (rfbm & XSTATE_YMM_MASK) { + if (xstate_bv & XSTATE_YMM_MASK) { + do_xrstor_ymmh(env, ptr + XO(avx_state), ra); + } else { + do_clear_ymmh(env); } } if (rfbm & XSTATE_BNDREGS_MASK) { @@ -2955,6 +3027,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask) env->xcr0 = mask; cpu_sync_bndcs_hflags(env); + cpu_sync_avx_hflag(env); return; do_gpf: @@ -3053,14 +3126,11 @@ void helper_emms(CPUX86State *env) *(uint32_t *)(env->fptags + 4) = 0x01010101; } -/* XXX: suppress */ -void helper_movq(CPUX86State *env, void *d, void *s) -{ - *(uint64_t *)d = *(uint64_t *)s; -} - #define SHIFT 0 #include "ops_sse.h" #define SHIFT 1 #include "ops_sse.h" + +#define SHIFT 2 +#include "ops_sse.h" diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 796dc2a..d51b5d7 100644 --- a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -22,150 +22,274 @@ #include "exec/exec-all.h" #include "tcg/helper-tcg.h" -#define PG_ERROR_OK (-1) +typedef struct TranslateParams { + target_ulong addr; + target_ulong cr3; + int pg_mode; + int mmu_idx; + int ptw_idx; + MMUAccessType access_type; +} TranslateParams; + +typedef struct TranslateResult { + hwaddr paddr; + int prot; + int page_size; +} TranslateResult; + +typedef enum TranslateFaultStage2 { + S2_NONE, + S2_GPA, + S2_GPT, +} TranslateFaultStage2; + +typedef struct TranslateFault { + int exception_index; + int error_code; + target_ulong cr2; + TranslateFaultStage2 stage2; +} TranslateFault; + +typedef struct PTETranslate { + CPUX86State *env; + TranslateFault *err; + int ptw_idx; + void *haddr; + hwaddr gaddr; +} PTETranslate; + +static bool ptw_translate(PTETranslate *inout, hwaddr addr) +{ + CPUTLBEntryFull *full; + int flags; + + inout->gaddr = addr; + flags = probe_access_full(inout->env, addr, MMU_DATA_STORE, + inout->ptw_idx, true, &inout->haddr, &full, 0); + + if (unlikely(flags & TLB_INVALID_MASK)) { + TranslateFault *err = inout->err; + + assert(inout->ptw_idx == MMU_NESTED_IDX); + err->exception_index = 0; /* unused */ + err->error_code = inout->env->error_code; + err->cr2 = addr; + err->stage2 = S2_GPT; + return false; + } + return true; +} + +static inline uint32_t ptw_ldl(const PTETranslate *in) +{ + if (likely(in->haddr)) { + return ldl_p(in->haddr); + } + return cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); +} + +static inline uint64_t ptw_ldq(const PTETranslate *in) +{ + if (likely(in->haddr)) { + return ldq_p(in->haddr); + } + return cpu_ldq_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); +} -typedef hwaddr (*MMUTranslateFunc)(CPUState *cs, hwaddr gphys, MMUAccessType access_type, - int *prot); +/* + * Note that we can use a 32-bit cmpxchg for all page table entries, + * even 64-bit ones, because PG_PRESENT_MASK, PG_ACCESSED_MASK and + * PG_DIRTY_MASK are all in the low 32 bits. + */ +static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) +{ + uint32_t cmp; -#define GET_HPHYS(cs, gpa, access_type, prot) \ - (get_hphys_func ? get_hphys_func(cs, gpa, access_type, prot) : gpa) + /* Does x86 really perform a rmw cycle on mmio for ptw? */ + start_exclusive(); + cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); + if (cmp == old) { + cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0); + } + end_exclusive(); + return cmp == old; +} -static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_func, - uint64_t cr3, int is_write1, int mmu_idx, int pg_mode, - hwaddr *xlat, int *page_size, int *prot) +static inline bool ptw_setl(const PTETranslate *in, uint32_t old, uint32_t set) { - X86CPU *cpu = X86_CPU(cs); - CPUX86State *env = &cpu->env; - uint64_t ptep, pte; - int32_t a20_mask; - target_ulong pde_addr, pte_addr; - int error_code = 0; - int is_dirty, is_write, is_user; - uint64_t rsvd_mask = PG_ADDRESS_MASK & ~MAKE_64BIT_MASK(0, cpu->phys_bits); - uint32_t page_offset; - uint32_t pkr; + if (set & ~old) { + uint32_t new = old | set; + if (likely(in->haddr)) { + old = cpu_to_le32(old); + new = cpu_to_le32(new); + return qatomic_cmpxchg((uint32_t *)in->haddr, old, new) == old; + } + return ptw_setl_slow(in, old, new); + } + return true; +} - is_user = (mmu_idx == MMU_USER_IDX); - is_write = is_write1 & 1; - a20_mask = x86_get_a20_mask(env); +static bool mmu_translate(CPUX86State *env, const TranslateParams *in, + TranslateResult *out, TranslateFault *err) +{ + const int32_t a20_mask = x86_get_a20_mask(env); + const target_ulong addr = in->addr; + const int pg_mode = in->pg_mode; + const bool is_user = (in->mmu_idx == MMU_USER_IDX); + const MMUAccessType access_type = in->access_type; + uint64_t ptep, pte, rsvd_mask; + PTETranslate pte_trans = { + .env = env, + .err = err, + .ptw_idx = in->ptw_idx, + }; + hwaddr pte_addr, paddr; + uint32_t pkr; + int page_size; + restart_all: + rsvd_mask = ~MAKE_64BIT_MASK(0, env_archcpu(env)->phys_bits); + rsvd_mask &= PG_ADDRESS_MASK; if (!(pg_mode & PG_MODE_NXE)) { rsvd_mask |= PG_NX_MASK; } if (pg_mode & PG_MODE_PAE) { - uint64_t pde, pdpe; - target_ulong pdpe_addr; - #ifdef TARGET_X86_64 if (pg_mode & PG_MODE_LMA) { - bool la57 = pg_mode & PG_MODE_LA57; - uint64_t pml5e_addr, pml5e; - uint64_t pml4e_addr, pml4e; - - if (la57) { - pml5e_addr = ((cr3 & ~0xfff) + - (((addr >> 48) & 0x1ff) << 3)) & a20_mask; - pml5e_addr = GET_HPHYS(cs, pml5e_addr, MMU_DATA_STORE, NULL); - pml5e = x86_ldq_phys(cs, pml5e_addr); - if (!(pml5e & PG_PRESENT_MASK)) { + if (pg_mode & PG_MODE_LA57) { + /* + * Page table level 5 + */ + pte_addr = ((in->cr3 & ~0xfff) + + (((addr >> 48) & 0x1ff) << 3)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; + } + restart_5: + pte = ptw_ldq(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } - if (pml5e & (rsvd_mask | PG_PSE_MASK)) { + if (pte & (rsvd_mask | PG_PSE_MASK)) { goto do_fault_rsvd; } - if (!(pml5e & PG_ACCESSED_MASK)) { - pml5e |= PG_ACCESSED_MASK; - x86_stl_phys_notdirty(cs, pml5e_addr, pml5e); + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_5; } - ptep = pml5e ^ PG_NX_MASK; + ptep = pte ^ PG_NX_MASK; } else { - pml5e = cr3; + pte = in->cr3; ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK; } - pml4e_addr = ((pml5e & PG_ADDRESS_MASK) + - (((addr >> 39) & 0x1ff) << 3)) & a20_mask; - pml4e_addr = GET_HPHYS(cs, pml4e_addr, MMU_DATA_STORE, NULL); - pml4e = x86_ldq_phys(cs, pml4e_addr); - if (!(pml4e & PG_PRESENT_MASK)) { + /* + * Page table level 4 + */ + pte_addr = ((pte & PG_ADDRESS_MASK) + + (((addr >> 39) & 0x1ff) << 3)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; + } + restart_4: + pte = ptw_ldq(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } - if (pml4e & (rsvd_mask | PG_PSE_MASK)) { + if (pte & (rsvd_mask | PG_PSE_MASK)) { goto do_fault_rsvd; } - if (!(pml4e & PG_ACCESSED_MASK)) { - pml4e |= PG_ACCESSED_MASK; - x86_stl_phys_notdirty(cs, pml4e_addr, pml4e); + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_4; + } + ptep &= pte ^ PG_NX_MASK; + + /* + * Page table level 3 + */ + pte_addr = ((pte & PG_ADDRESS_MASK) + + (((addr >> 30) & 0x1ff) << 3)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; } - ptep &= pml4e ^ PG_NX_MASK; - pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) & - a20_mask; - pdpe_addr = GET_HPHYS(cs, pdpe_addr, MMU_DATA_STORE, NULL); - pdpe = x86_ldq_phys(cs, pdpe_addr); - if (!(pdpe & PG_PRESENT_MASK)) { + restart_3_lma: + pte = ptw_ldq(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } - if (pdpe & rsvd_mask) { + if (pte & rsvd_mask) { goto do_fault_rsvd; } - ptep &= pdpe ^ PG_NX_MASK; - if (!(pdpe & PG_ACCESSED_MASK)) { - pdpe |= PG_ACCESSED_MASK; - x86_stl_phys_notdirty(cs, pdpe_addr, pdpe); + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_3_lma; } - if (pdpe & PG_PSE_MASK) { + ptep &= pte ^ PG_NX_MASK; + if (pte & PG_PSE_MASK) { /* 1 GB page */ - *page_size = 1024 * 1024 * 1024; - pte_addr = pdpe_addr; - pte = pdpe; + page_size = 1024 * 1024 * 1024; goto do_check_protect; } } else #endif { - /* XXX: load them when cr3 is loaded ? */ - pdpe_addr = ((cr3 & ~0x1f) + ((addr >> 27) & 0x18)) & - a20_mask; - pdpe_addr = GET_HPHYS(cs, pdpe_addr, MMU_DATA_STORE, NULL); - pdpe = x86_ldq_phys(cs, pdpe_addr); - if (!(pdpe & PG_PRESENT_MASK)) { - goto do_fault; + /* + * Page table level 3 + */ + pte_addr = ((in->cr3 & ~0x1f) + ((addr >> 27) & 0x18)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; } rsvd_mask |= PG_HI_USER_MASK; - if (pdpe & (rsvd_mask | PG_NX_MASK)) { + restart_3_nolma: + pte = ptw_ldq(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { + goto do_fault; + } + if (pte & (rsvd_mask | PG_NX_MASK)) { goto do_fault_rsvd; } + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_3_nolma; + } ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK; } - pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) & - a20_mask; - pde_addr = GET_HPHYS(cs, pde_addr, MMU_DATA_STORE, NULL); - pde = x86_ldq_phys(cs, pde_addr); - if (!(pde & PG_PRESENT_MASK)) { + /* + * Page table level 2 + */ + pte_addr = ((pte & PG_ADDRESS_MASK) + + (((addr >> 21) & 0x1ff) << 3)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; + } + restart_2_pae: + pte = ptw_ldq(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } - if (pde & rsvd_mask) { + if (pte & rsvd_mask) { goto do_fault_rsvd; } - ptep &= pde ^ PG_NX_MASK; - if (pde & PG_PSE_MASK) { + if (pte & PG_PSE_MASK) { /* 2 MB page */ - *page_size = 2048 * 1024; - pte_addr = pde_addr; - pte = pde; + page_size = 2048 * 1024; + ptep &= pte ^ PG_NX_MASK; goto do_check_protect; } - /* 4 KB page */ - if (!(pde & PG_ACCESSED_MASK)) { - pde |= PG_ACCESSED_MASK; - x86_stl_phys_notdirty(cs, pde_addr, pde); + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_2_pae; + } + ptep &= pte ^ PG_NX_MASK; + + /* + * Page table level 1 + */ + pte_addr = ((pte & PG_ADDRESS_MASK) + + (((addr >> 12) & 0x1ff) << 3)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; } - pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) & - a20_mask; - pte_addr = GET_HPHYS(cs, pte_addr, MMU_DATA_STORE, NULL); - pte = x86_ldq_phys(cs, pte_addr); + pte = ptw_ldq(&pte_trans); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } @@ -174,54 +298,56 @@ static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_f } /* combine pde and pte nx, user and rw protections */ ptep &= pte ^ PG_NX_MASK; - *page_size = 4096; + page_size = 4096; } else { - uint32_t pde; - - /* page directory entry */ - pde_addr = ((cr3 & ~0xfff) + ((addr >> 20) & 0xffc)) & - a20_mask; - pde_addr = GET_HPHYS(cs, pde_addr, MMU_DATA_STORE, NULL); - pde = x86_ldl_phys(cs, pde_addr); - if (!(pde & PG_PRESENT_MASK)) { + /* + * Page table level 2 + */ + pte_addr = ((in->cr3 & ~0xfff) + ((addr >> 20) & 0xffc)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; + } + restart_2_nopae: + pte = ptw_ldl(&pte_trans); + if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } - ptep = pde | PG_NX_MASK; + ptep = pte | PG_NX_MASK; /* if PSE bit is set, then we use a 4MB page */ - if ((pde & PG_PSE_MASK) && (pg_mode & PG_MODE_PSE)) { - *page_size = 4096 * 1024; - pte_addr = pde_addr; - - /* Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved. + if ((pte & PG_PSE_MASK) && (pg_mode & PG_MODE_PSE)) { + page_size = 4096 * 1024; + /* + * Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved. * Leave bits 20-13 in place for setting accessed/dirty bits below. */ - pte = pde | ((pde & 0x1fe000LL) << (32 - 13)); + pte = (uint32_t)pte | ((pte & 0x1fe000LL) << (32 - 13)); rsvd_mask = 0x200000; goto do_check_protect_pse36; } - - if (!(pde & PG_ACCESSED_MASK)) { - pde |= PG_ACCESSED_MASK; - x86_stl_phys_notdirty(cs, pde_addr, pde); + if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) { + goto restart_2_nopae; } - /* page directory entry */ - pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & - a20_mask; - pte_addr = GET_HPHYS(cs, pte_addr, MMU_DATA_STORE, NULL); - pte = x86_ldl_phys(cs, pte_addr); + /* + * Page table level 1 + */ + pte_addr = ((pte & ~0xfffu) + ((addr >> 10) & 0xffc)) & a20_mask; + if (!ptw_translate(&pte_trans, pte_addr)) { + return false; + } + pte = ptw_ldl(&pte_trans); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; } /* combine pde and pte user and rw protections */ ptep &= pte | PG_NX_MASK; - *page_size = 4096; + page_size = 4096; rsvd_mask = 0; } do_check_protect: - rsvd_mask |= (*page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK; + rsvd_mask |= (page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK; do_check_protect_pse36: if (pte & rsvd_mask) { goto do_fault_rsvd; @@ -233,17 +359,17 @@ do_check_protect_pse36: goto do_fault_protect; } - *prot = 0; - if (mmu_idx != MMU_KSMAP_IDX || !(ptep & PG_USER_MASK)) { - *prot |= PAGE_READ; + int prot = 0; + if (in->mmu_idx != MMU_KSMAP_IDX || !(ptep & PG_USER_MASK)) { + prot |= PAGE_READ; if ((ptep & PG_RW_MASK) || !(is_user || (pg_mode & PG_MODE_WP))) { - *prot |= PAGE_WRITE; + prot |= PAGE_WRITE; } } if (!(ptep & PG_NX_MASK) && - (mmu_idx == MMU_USER_IDX || + (is_user || !((pg_mode & PG_MODE_SMEP) && (ptep & PG_USER_MASK)))) { - *prot |= PAGE_EXEC; + prot |= PAGE_EXEC; } if (ptep & PG_USER_MASK) { @@ -262,182 +388,246 @@ do_check_protect_pse36: } else if (pkr_wd && (is_user || (pg_mode & PG_MODE_WP))) { pkr_prot &= ~PAGE_WRITE; } - - *prot &= pkr_prot; - if ((pkr_prot & (1 << is_write1)) == 0) { - assert(is_write1 != 2); - error_code |= PG_ERROR_PK_MASK; - goto do_fault_protect; + if ((pkr_prot & (1 << access_type)) == 0) { + goto do_fault_pk_protect; } + prot &= pkr_prot; } - if ((*prot & (1 << is_write1)) == 0) { + if ((prot & (1 << access_type)) == 0) { goto do_fault_protect; } /* yes, it can! */ - is_dirty = is_write && !(pte & PG_DIRTY_MASK); - if (!(pte & PG_ACCESSED_MASK) || is_dirty) { - pte |= PG_ACCESSED_MASK; - if (is_dirty) { - pte |= PG_DIRTY_MASK; + { + uint32_t set = PG_ACCESSED_MASK; + if (access_type == MMU_DATA_STORE) { + set |= PG_DIRTY_MASK; + } else if (!(pte & PG_DIRTY_MASK)) { + /* + * Only set write access if already dirty... + * otherwise wait for dirty access. + */ + prot &= ~PAGE_WRITE; + } + if (!ptw_setl(&pte_trans, pte, set)) { + /* + * We can arrive here from any of 3 levels and 2 formats. + * The only safe thing is to restart the entire lookup. + */ + goto restart_all; } - x86_stl_phys_notdirty(cs, pte_addr, pte); } - if (!(pte & PG_DIRTY_MASK)) { - /* only set write access if already dirty... otherwise wait - for dirty access */ - assert(!is_write); - *prot &= ~PAGE_WRITE; - } + /* align to page_size */ + paddr = (pte & a20_mask & PG_ADDRESS_MASK & ~(page_size - 1)) + | (addr & (page_size - 1)); + + if (in->ptw_idx == MMU_NESTED_IDX) { + CPUTLBEntryFull *full; + int flags, nested_page_size; + + flags = probe_access_full(env, paddr, access_type, + MMU_NESTED_IDX, true, + &pte_trans.haddr, &full, 0); + if (unlikely(flags & TLB_INVALID_MASK)) { + err->exception_index = 0; /* unused */ + err->error_code = env->error_code; + err->cr2 = paddr; + err->stage2 = S2_GPA; + return false; + } - pte = pte & a20_mask; + /* Merge stage1 & stage2 protection bits. */ + prot &= full->prot; - /* align to page_size */ - pte &= PG_ADDRESS_MASK & ~(*page_size - 1); - page_offset = addr & (*page_size - 1); - *xlat = GET_HPHYS(cs, pte + page_offset, is_write1, prot); - return PG_ERROR_OK; + /* Re-verify resulting protection. */ + if ((prot & (1 << access_type)) == 0) { + goto do_fault_protect; + } + + /* Merge stage1 & stage2 addresses to final physical address. */ + nested_page_size = 1 << full->lg_page_size; + paddr = (full->phys_addr & ~(nested_page_size - 1)) + | (paddr & (nested_page_size - 1)); + + /* + * Use the larger of stage1 & stage2 page sizes, so that + * invalidation works. + */ + if (nested_page_size > page_size) { + page_size = nested_page_size; + } + } + + out->paddr = paddr; + out->prot = prot; + out->page_size = page_size; + return true; + int error_code; do_fault_rsvd: - error_code |= PG_ERROR_RSVD_MASK; + error_code = PG_ERROR_RSVD_MASK; + goto do_fault_cont; do_fault_protect: - error_code |= PG_ERROR_P_MASK; + error_code = PG_ERROR_P_MASK; + goto do_fault_cont; + do_fault_pk_protect: + assert(access_type != MMU_INST_FETCH); + error_code = PG_ERROR_PK_MASK | PG_ERROR_P_MASK; + goto do_fault_cont; do_fault: - error_code |= (is_write << PG_ERROR_W_BIT); - if (is_user) + error_code = 0; + do_fault_cont: + if (is_user) { error_code |= PG_ERROR_U_MASK; - if (is_write1 == 2 && - ((pg_mode & PG_MODE_NXE) || (pg_mode & PG_MODE_SMEP))) - error_code |= PG_ERROR_I_D_MASK; - return error_code; -} - -hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type, - int *prot) -{ - CPUX86State *env = &X86_CPU(cs)->env; - uint64_t exit_info_1; - int page_size; - int next_prot; - hwaddr hphys; - - if (likely(!(env->hflags2 & HF2_NPT_MASK))) { - return gphys; } - - exit_info_1 = mmu_translate(cs, gphys, NULL, env->nested_cr3, - access_type, MMU_USER_IDX, env->nested_pg_mode, - &hphys, &page_size, &next_prot); - if (exit_info_1 == PG_ERROR_OK) { - if (prot) { - *prot &= next_prot; + switch (access_type) { + case MMU_DATA_LOAD: + break; + case MMU_DATA_STORE: + error_code |= PG_ERROR_W_MASK; + break; + case MMU_INST_FETCH: + if (pg_mode & (PG_MODE_NXE | PG_MODE_SMEP)) { + error_code |= PG_ERROR_I_D_MASK; } - return hphys; + break; } + err->exception_index = EXCP0E_PAGE; + err->error_code = error_code; + err->cr2 = addr; + err->stage2 = S2_NONE; + return false; +} - x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), - gphys); - if (prot) { - exit_info_1 |= SVM_NPTEXIT_GPA; - } else { /* page table access */ +static G_NORETURN void raise_stage2(CPUX86State *env, TranslateFault *err, + uintptr_t retaddr) +{ + uint64_t exit_info_1 = err->error_code; + + switch (err->stage2) { + case S2_GPT: exit_info_1 |= SVM_NPTEXIT_GPT; + break; + case S2_GPA: + exit_info_1 |= SVM_NPTEXIT_GPA; + break; + default: + g_assert_not_reached(); } - cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, env->retaddr); + + x86_stq_phys(env_cpu(env), + env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), + err->cr2); + cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, retaddr); } -/* return value: - * -1 = cannot handle fault - * 0 = nothing more to do - * 1 = generate PF fault - */ -static int handle_mmu_fault(CPUState *cs, vaddr addr, int size, - int is_write1, int mmu_idx) +static bool get_physical_address(CPUX86State *env, vaddr addr, + MMUAccessType access_type, int mmu_idx, + TranslateResult *out, TranslateFault *err) { - X86CPU *cpu = X86_CPU(cs); - CPUX86State *env = &cpu->env; - int error_code = PG_ERROR_OK; - int pg_mode, prot, page_size; - int32_t a20_mask; - hwaddr paddr; - hwaddr vaddr; - -#if defined(DEBUG_MMU) - printf("MMU fault: addr=%" VADDR_PRIx " w=%d mmu=%d eip=" TARGET_FMT_lx "\n", - addr, is_write1, mmu_idx, env->eip); -#endif - - if (!(env->cr[0] & CR0_PG_MASK)) { - a20_mask = x86_get_a20_mask(env); - paddr = addr & a20_mask; -#ifdef TARGET_X86_64 - if (!(env->hflags & HF_LMA_MASK)) { - /* Without long mode we can only address 32bits in real mode */ - paddr = (uint32_t)paddr; + TranslateParams in; + bool use_stage2 = env->hflags2 & HF2_NPT_MASK; + + in.addr = addr; + in.access_type = access_type; + + switch (mmu_idx) { + case MMU_PHYS_IDX: + break; + + case MMU_NESTED_IDX: + if (likely(use_stage2)) { + in.cr3 = env->nested_cr3; + in.pg_mode = env->nested_pg_mode; + in.mmu_idx = MMU_USER_IDX; + in.ptw_idx = MMU_PHYS_IDX; + + if (!mmu_translate(env, &in, out, err)) { + err->stage2 = S2_GPA; + return false; + } + return true; } -#endif - prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - page_size = 4096; - } else { - pg_mode = get_pg_mode(env); - if (pg_mode & PG_MODE_LMA) { - int32_t sext; - - /* test virtual address sign extension */ - sext = (int64_t)addr >> (pg_mode & PG_MODE_LA57 ? 56 : 47); - if (sext != 0 && sext != -1) { - env->error_code = 0; - cs->exception_index = EXCP0D_GPF; - return 1; + break; + + default: + in.cr3 = env->cr[3]; + in.mmu_idx = mmu_idx; + in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX; + in.pg_mode = get_pg_mode(env); + + if (likely(in.pg_mode)) { + if (in.pg_mode & PG_MODE_LMA) { + /* test virtual address sign extension */ + int shift = in.pg_mode & PG_MODE_LA57 ? 56 : 47; + int64_t sext = (int64_t)addr >> shift; + if (sext != 0 && sext != -1) { + err->exception_index = EXCP0D_GPF; + err->error_code = 0; + err->cr2 = addr; + return false; + } } + return mmu_translate(env, &in, out, err); } - - error_code = mmu_translate(cs, addr, get_hphys, env->cr[3], is_write1, - mmu_idx, pg_mode, - &paddr, &page_size, &prot); + break; } - if (error_code == PG_ERROR_OK) { - /* Even if 4MB pages, we map only one 4KB page in the cache to - avoid filling it too fast */ - vaddr = addr & TARGET_PAGE_MASK; - paddr &= TARGET_PAGE_MASK; - - assert(prot & (1 << is_write1)); - tlb_set_page_with_attrs(cs, vaddr, paddr, cpu_get_mem_attrs(env), - prot, mmu_idx, page_size); - return 0; - } else { - if (env->intercept_exceptions & (1 << EXCP0E_PAGE)) { - /* cr2 is not modified in case of exceptions */ - x86_stq_phys(cs, - env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), - addr); - } else { - env->cr[2] = addr; - } - env->error_code = error_code; - cs->exception_index = EXCP0E_PAGE; - return 1; + /* Translation disabled. */ + out->paddr = addr & x86_get_a20_mask(env); +#ifdef TARGET_X86_64 + if (!(env->hflags & HF_LMA_MASK)) { + /* Without long mode we can only address 32bits in real mode */ + out->paddr = (uint32_t)out->paddr; } +#endif + out->prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + out->page_size = TARGET_PAGE_SIZE; + return true; } bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr) { - X86CPU *cpu = X86_CPU(cs); - CPUX86State *env = &cpu->env; - - env->retaddr = retaddr; - if (handle_mmu_fault(cs, addr, size, access_type, mmu_idx)) { - /* FIXME: On error in get_hphys we have already jumped out. */ - g_assert(!probe); - raise_exception_err_ra(env, cs->exception_index, - env->error_code, retaddr); + CPUX86State *env = cs->env_ptr; + TranslateResult out; + TranslateFault err; + + if (get_physical_address(env, addr, access_type, mmu_idx, &out, &err)) { + /* + * Even if 4MB pages, we map only one 4KB page in the cache to + * avoid filling it too fast. + */ + assert(out.prot & (1 << access_type)); + tlb_set_page_with_attrs(cs, addr & TARGET_PAGE_MASK, + out.paddr & TARGET_PAGE_MASK, + cpu_get_mem_attrs(env), + out.prot, mmu_idx, out.page_size); + return true; } - return true; + + if (probe) { + /* This will be used if recursing for stage2 translation. */ + env->error_code = err.error_code; + return false; + } + + if (err.stage2 != S2_NONE) { + raise_stage2(env, &err, retaddr); + } + + if (env->intercept_exceptions & (1 << err.exception_index)) { + /* cr2 is not modified in case of exceptions */ + x86_stq_phys(cs, env->vm_vmcb + + offsetof(struct vmcb, control.exit_info_2), + err.cr2); + } else { + env->cr[2] = err.cr2; + } + raise_exception_err_ra(env, err.exception_index, err.error_code, retaddr); } G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, diff --git a/target/i386/tcg/sysemu/svm_helper.c b/target/i386/tcg/sysemu/svm_helper.c index 2b6f450..8e88567 100644 --- a/target/i386/tcg/sysemu/svm_helper.c +++ b/target/i386/tcg/sysemu/svm_helper.c @@ -27,19 +27,19 @@ /* Secure Virtual Machine helpers */ -static inline void svm_save_seg(CPUX86State *env, hwaddr addr, - const SegmentCache *sc) +static void svm_save_seg(CPUX86State *env, int mmu_idx, hwaddr addr, + const SegmentCache *sc) { - CPUState *cs = env_cpu(env); - - x86_stw_phys(cs, addr + offsetof(struct vmcb_seg, selector), - sc->selector); - x86_stq_phys(cs, addr + offsetof(struct vmcb_seg, base), - sc->base); - x86_stl_phys(cs, addr + offsetof(struct vmcb_seg, limit), - sc->limit); - x86_stw_phys(cs, addr + offsetof(struct vmcb_seg, attrib), - ((sc->flags >> 8) & 0xff) | ((sc->flags >> 12) & 0x0f00)); + cpu_stw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, selector), + sc->selector, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, base), + sc->base, mmu_idx, 0); + cpu_stl_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, limit), + sc->limit, mmu_idx, 0); + cpu_stw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, attrib), + ((sc->flags >> 8) & 0xff) + | ((sc->flags >> 12) & 0x0f00), + mmu_idx, 0); } /* @@ -52,29 +52,36 @@ static inline void svm_canonicalization(CPUX86State *env, target_ulong *seg_base *seg_base = ((((long) *seg_base) << shift_amt) >> shift_amt); } -static inline void svm_load_seg(CPUX86State *env, hwaddr addr, - SegmentCache *sc) +static void svm_load_seg(CPUX86State *env, int mmu_idx, hwaddr addr, + SegmentCache *sc) { - CPUState *cs = env_cpu(env); unsigned int flags; - sc->selector = x86_lduw_phys(cs, - addr + offsetof(struct vmcb_seg, selector)); - sc->base = x86_ldq_phys(cs, addr + offsetof(struct vmcb_seg, base)); - sc->limit = x86_ldl_phys(cs, addr + offsetof(struct vmcb_seg, limit)); - flags = x86_lduw_phys(cs, addr + offsetof(struct vmcb_seg, attrib)); + sc->selector = + cpu_lduw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, selector), + mmu_idx, 0); + sc->base = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, base), + mmu_idx, 0); + sc->limit = + cpu_ldl_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, limit), + mmu_idx, 0); + flags = + cpu_lduw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, attrib), + mmu_idx, 0); sc->flags = ((flags & 0xff) << 8) | ((flags & 0x0f00) << 12); + svm_canonicalization(env, &sc->base); } -static inline void svm_load_seg_cache(CPUX86State *env, hwaddr addr, - int seg_reg) +static void svm_load_seg_cache(CPUX86State *env, int mmu_idx, + hwaddr addr, int seg_reg) { - SegmentCache sc1, *sc = &sc1; + SegmentCache sc; - svm_load_seg(env, addr, sc); - cpu_x86_load_seg_cache(env, seg_reg, sc->selector, - sc->base, sc->limit, sc->flags); + svm_load_seg(env, mmu_idx, addr, &sc); + cpu_x86_load_seg_cache(env, seg_reg, sc.selector, + sc.base, sc.limit, sc.flags); } static inline bool is_efer_invalid_state (CPUX86State *env) @@ -199,13 +206,17 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend) env->vm_hsave + offsetof(struct vmcb, save.rflags), cpu_compute_eflags(env)); - svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.es), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.es), &env->segs[R_ES]); - svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.cs), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.cs), &env->segs[R_CS]); - svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.ss), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.ss), &env->segs[R_SS]); - svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.ds), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.ds), &env->segs[R_DS]); x86_stq_phys(cs, env->vm_hsave + offsetof(struct vmcb, save.rip), @@ -271,6 +282,8 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend) env->hflags2 |= HF2_NPT_MASK; env->nested_pg_mode = get_pg_mode(env) & PG_MODE_SVM_MASK; + + tlb_flush_by_mmuidx(cs, 1 << MMU_NESTED_IDX); } /* enable intercepts */ @@ -323,18 +336,18 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend) save.rflags)), ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); - svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.es), - R_ES); - svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.cs), - R_CS); - svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.ss), - R_SS); - svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.ds), - R_DS); - svm_load_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.idtr), - &env->idt); - svm_load_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.gdtr), - &env->gdt); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.es), R_ES); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.cs), R_CS); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.ss), R_SS); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.ds), R_DS); + svm_load_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.idtr), &env->idt); + svm_load_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.gdtr), &env->gdt); env->eip = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.rip)); @@ -449,9 +462,8 @@ void helper_vmmcall(CPUX86State *env) void helper_vmload(CPUX86State *env, int aflag) { - CPUState *cs = env_cpu(env); + int mmu_idx = MMU_PHYS_IDX; target_ulong addr; - int prot; cpu_svm_check_intercept_param(env, SVM_EXIT_VMLOAD, 0, GETPC()); @@ -462,43 +474,52 @@ void helper_vmload(CPUX86State *env, int aflag) } if (virtual_vm_load_save_enabled(env, SVM_EXIT_VMLOAD, GETPC())) { - addr = get_hphys(cs, addr, MMU_DATA_LOAD, &prot); + mmu_idx = MMU_NESTED_IDX; } - qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmload! " TARGET_FMT_lx - "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n", - addr, x86_ldq_phys(cs, addr + offsetof(struct vmcb, - save.fs.base)), - env->segs[R_FS].base); - - svm_load_seg_cache(env, addr + offsetof(struct vmcb, save.fs), R_FS); - svm_load_seg_cache(env, addr + offsetof(struct vmcb, save.gs), R_GS); - svm_load_seg(env, addr + offsetof(struct vmcb, save.tr), &env->tr); - svm_load_seg(env, addr + offsetof(struct vmcb, save.ldtr), &env->ldt); + svm_load_seg_cache(env, mmu_idx, + addr + offsetof(struct vmcb, save.fs), R_FS); + svm_load_seg_cache(env, mmu_idx, + addr + offsetof(struct vmcb, save.gs), R_GS); + svm_load_seg(env, mmu_idx, + addr + offsetof(struct vmcb, save.tr), &env->tr); + svm_load_seg(env, mmu_idx, + addr + offsetof(struct vmcb, save.ldtr), &env->ldt); #ifdef TARGET_X86_64 - env->kernelgsbase = x86_ldq_phys(cs, addr + offsetof(struct vmcb, - save.kernel_gs_base)); - env->lstar = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.lstar)); - env->cstar = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.cstar)); - env->fmask = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.sfmask)); + env->kernelgsbase = + cpu_ldq_mmuidx_ra(env, + addr + offsetof(struct vmcb, save.kernel_gs_base), + mmu_idx, 0); + env->lstar = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.lstar), + mmu_idx, 0); + env->cstar = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.cstar), + mmu_idx, 0); + env->fmask = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sfmask), + mmu_idx, 0); svm_canonicalization(env, &env->kernelgsbase); #endif - env->star = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.star)); - env->sysenter_cs = x86_ldq_phys(cs, - addr + offsetof(struct vmcb, save.sysenter_cs)); - env->sysenter_esp = x86_ldq_phys(cs, addr + offsetof(struct vmcb, - save.sysenter_esp)); - env->sysenter_eip = x86_ldq_phys(cs, addr + offsetof(struct vmcb, - save.sysenter_eip)); - + env->star = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.star), + mmu_idx, 0); + env->sysenter_cs = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_cs), + mmu_idx, 0); + env->sysenter_esp = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_esp), + mmu_idx, 0); + env->sysenter_eip = + cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_eip), + mmu_idx, 0); } void helper_vmsave(CPUX86State *env, int aflag) { - CPUState *cs = env_cpu(env); + int mmu_idx = MMU_PHYS_IDX; target_ulong addr; - int prot; cpu_svm_check_intercept_param(env, SVM_EXIT_VMSAVE, 0, GETPC()); @@ -509,38 +530,36 @@ void helper_vmsave(CPUX86State *env, int aflag) } if (virtual_vm_load_save_enabled(env, SVM_EXIT_VMSAVE, GETPC())) { - addr = get_hphys(cs, addr, MMU_DATA_STORE, &prot); + mmu_idx = MMU_NESTED_IDX; } - qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmsave! " TARGET_FMT_lx - "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n", - addr, x86_ldq_phys(cs, - addr + offsetof(struct vmcb, save.fs.base)), - env->segs[R_FS].base); - - svm_save_seg(env, addr + offsetof(struct vmcb, save.fs), + svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.fs), &env->segs[R_FS]); - svm_save_seg(env, addr + offsetof(struct vmcb, save.gs), + svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.gs), &env->segs[R_GS]); - svm_save_seg(env, addr + offsetof(struct vmcb, save.tr), + svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.tr), &env->tr); - svm_save_seg(env, addr + offsetof(struct vmcb, save.ldtr), + svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.ldtr), &env->ldt); #ifdef TARGET_X86_64 - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.kernel_gs_base), - env->kernelgsbase); - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.lstar), env->lstar); - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.cstar), env->cstar); - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sfmask), env->fmask); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.kernel_gs_base), + env->kernelgsbase, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.lstar), + env->lstar, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.cstar), + env->cstar, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sfmask), + env->fmask, mmu_idx, 0); #endif - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.star), env->star); - x86_stq_phys(cs, - addr + offsetof(struct vmcb, save.sysenter_cs), env->sysenter_cs); - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sysenter_esp), - env->sysenter_esp); - x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sysenter_eip), - env->sysenter_eip); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.star), + env->star, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_cs), + env->sysenter_cs, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_esp), + env->sysenter_esp, mmu_idx, 0); + cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_eip), + env->sysenter_eip, mmu_idx, 0); } void helper_stgi(CPUX86State *env) @@ -720,15 +739,20 @@ void do_vmexit(CPUX86State *env) env->vm_vmcb + offsetof(struct vmcb, control.int_state), 0); } env->hflags2 &= ~HF2_NPT_MASK; + tlb_flush_by_mmuidx(cs, 1 << MMU_NESTED_IDX); /* Save the VM state in the vmcb */ - svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.es), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.es), &env->segs[R_ES]); - svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.cs), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.cs), &env->segs[R_CS]); - svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.ss), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.ss), &env->segs[R_SS]); - svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.ds), + svm_save_seg(env, MMU_PHYS_IDX, + env->vm_vmcb + offsetof(struct vmcb, save.ds), &env->segs[R_DS]); x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.gdtr.base), @@ -809,14 +833,14 @@ void do_vmexit(CPUX86State *env) ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK | VM_MASK)); - svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.es), - R_ES); - svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.cs), - R_CS); - svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.ss), - R_SS); - svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.ds), - R_DS); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.es), R_ES); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.cs), R_CS); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.ss), R_SS); + svm_load_seg_cache(env, MMU_PHYS_IDX, + env->vm_hsave + offsetof(struct vmcb, save.ds), R_DS); env->eip = x86_ldq_phys(cs, env->vm_hsave + offsetof(struct vmcb, save.rip)); diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 279a3ae..e19d5c1 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -23,6 +23,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "exec/cpu_ldst.h" #include "exec/translator.h" @@ -86,6 +87,9 @@ typedef struct DisasContext { int8_t override; /* -1 if no override, else R_CS, R_DS, etc */ uint8_t prefix; + bool has_modrm; + uint8_t modrm; + #ifndef CONFIG_USER_ONLY uint8_t cpl; /* code priv level */ uint8_t iopl; /* i/o priv level */ @@ -99,8 +103,8 @@ typedef struct DisasContext { uint8_t rex_r; uint8_t rex_x; uint8_t rex_b; - bool rex_w; #endif + bool vex_w; /* used by AVX even on 32-bit processors */ bool jmp_opt; /* use direct block chaining for direct jumps */ bool repz_opt; /* optimize jumps within repz instructions */ bool cc_op_dirty; @@ -113,6 +117,7 @@ typedef struct DisasContext { int cpuid_ext2_features; int cpuid_ext3_features; int cpuid_7_0_ebx_features; + int cpuid_7_0_ecx_features; int cpuid_xsave_features; /* TCG local temps */ @@ -124,8 +129,6 @@ typedef struct DisasContext { /* TCG local register indexes (only used inside old micro ops) */ TCGv tmp0; TCGv tmp4; - TCGv_ptr ptr0; - TCGv_ptr ptr1; TCGv_i32 tmp2_i32; TCGv_i32 tmp3_i32; TCGv_i64 tmp1_i64; @@ -177,7 +180,7 @@ typedef struct DisasContext { #ifdef TARGET_X86_64 #define REX_PREFIX(S) (((S)->prefix & PREFIX_REX) != 0) -#define REX_W(S) ((S)->rex_w) +#define REX_W(S) ((S)->vex_w) #define REX_R(S) ((S)->rex_r + 0) #define REX_X(S) ((S)->rex_x + 0) #define REX_B(S) ((S)->rex_b + 0) @@ -2277,11 +2280,11 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, } /* Compute the address, with a minimum number of TCG ops. */ -static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a) +static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a, bool is_vsib) { TCGv ea = NULL; - if (a.index >= 0) { + if (a.index >= 0 && !is_vsib) { if (a.scale == 0) { ea = cpu_regs[a.index]; } else { @@ -2314,7 +2317,7 @@ static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a) static void gen_lea_modrm(CPUX86State *env, DisasContext *s, int modrm) { AddressParts a = gen_lea_modrm_0(env, s, modrm); - TCGv ea = gen_lea_modrm_1(s, a); + TCGv ea = gen_lea_modrm_1(s, a, false); gen_lea_v_seg(s, s->aflag, ea, a.def_seg, s->override); } @@ -2327,7 +2330,8 @@ static void gen_nop_modrm(CPUX86State *env, DisasContext *s, int modrm) static void gen_bndck(CPUX86State *env, DisasContext *s, int modrm, TCGCond cond, TCGv_i64 bndv) { - TCGv ea = gen_lea_modrm_1(s, gen_lea_modrm_0(env, s, modrm)); + AddressParts a = gen_lea_modrm_0(env, s, modrm); + TCGv ea = gen_lea_modrm_1(s, a, false); tcg_gen_extu_tl_i64(s->tmp1_i64, ea); if (!CODE64(s)) { @@ -2425,6 +2429,31 @@ static inline uint32_t insn_get(CPUX86State *env, DisasContext *s, MemOp ot) return ret; } +static target_long insn_get_signed(CPUX86State *env, DisasContext *s, MemOp ot) +{ + target_long ret; + + switch (ot) { + case MO_8: + ret = (int8_t) x86_ldub_code(env, s); + break; + case MO_16: + ret = (int16_t) x86_lduw_code(env, s); + break; + case MO_32: + ret = (int32_t) x86_ldl_code(env, s); + break; +#ifdef TARGET_X86_64 + case MO_64: + ret = x86_ldq_code(env, s); + break; +#endif + default: + g_assert_not_reached(); + } + return ret; +} + static inline int insn_const_size(MemOp ot) { if (ot <= MO_32) { @@ -2868,1943 +2897,61 @@ static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align) int mem_index = s->mem_index; tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ | (align ? MO_ALIGN_16 : 0)); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0))); tcg_gen_addi_tl(s->tmp0, s->A0, 8); tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1))); } static inline void gen_sto_env_A0(DisasContext *s, int offset, bool align) { int mem_index = s->mem_index; - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0))); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0))); tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ | (align ? MO_ALIGN_16 : 0)); tcg_gen_addi_tl(s->tmp0, s->A0, 8); - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1))); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1))); tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); } -static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset) +static void gen_ldy_env_A0(DisasContext *s, int offset, bool align) { - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(0))); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(0))); - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(1))); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(1))); -} - -static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset) -{ - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); -} + int mem_index = s->mem_index; + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, + MO_LEUQ | (align ? MO_ALIGN_32 : 0)); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0))); + tcg_gen_addi_tl(s->tmp0, s->A0, 8); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1))); -static inline void gen_op_movl(DisasContext *s, int d_offset, int s_offset) -{ - tcg_gen_ld_i32(s->tmp2_i32, cpu_env, s_offset); - tcg_gen_st_i32(s->tmp2_i32, cpu_env, d_offset); + tcg_gen_addi_tl(s->tmp0, s->A0, 16); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2))); + tcg_gen_addi_tl(s->tmp0, s->A0, 24); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3))); } -static inline void gen_op_movq_env_0(DisasContext *s, int d_offset) +static void gen_sty_env_A0(DisasContext *s, int offset, bool align) { - tcg_gen_movi_i64(s->tmp1_i64, 0); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); + int mem_index = s->mem_index; + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, + MO_LEUQ | (align ? MO_ALIGN_32 : 0)); + tcg_gen_addi_tl(s->tmp0, s->A0, 8); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_addi_tl(s->tmp0, s->A0, 16); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_addi_tl(s->tmp0, s->A0, 24); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); } -#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) - -typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); -typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); -typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); -typedef void (*SSEFunc_0_epl)(TCGv_ptr env, TCGv_ptr reg, TCGv_i64 val); -typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b); -typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, - TCGv_ptr reg_c); -typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, - TCGv_i32 val); -typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val); -typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, - TCGv val); - -#define SSE_OPF_CMP (1 << 1) /* does not write for first operand */ -#define SSE_OPF_SPECIAL (1 << 3) /* magic */ -#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! instruction */ -#define SSE_OPF_MMX (1 << 5) /* MMX/integer/AVX2 instruction */ -#define SSE_OPF_SCALAR (1 << 6) /* Has SSE scalar variants */ -#define SSE_OPF_SHUF (1 << 9) /* pshufx/shufpx */ - -#define OP(op, flags, a, b, c, d) \ - {flags, {{.op = a}, {.op = b}, {.op = c}, {.op = d} } } - -#define MMX_OP(x) OP(op1, SSE_OPF_MMX, \ - gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL) - -#define SSE_FOP(name) OP(op1, SSE_OPF_SCALAR, \ - gen_helper_##name##ps##_xmm, gen_helper_##name##pd##_xmm, \ - gen_helper_##name##ss, gen_helper_##name##sd) -#define SSE_OP(sname, dname, op, flags) OP(op, flags, \ - gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL) - -typedef union SSEFuncs { - SSEFunc_0_epp op1; - SSEFunc_0_ppi op1i; - SSEFunc_0_eppt op1t; -} SSEFuncs; - -struct SSEOpHelper_table1 { - int flags; - SSEFuncs fn[4]; -}; - -#define SSE_3DNOW { SSE_OPF_3DNOW } -#define SSE_SPECIAL { SSE_OPF_SPECIAL } - -static const struct SSEOpHelper_table1 sse_op_table1[256] = { - /* 3DNow! extensions */ - [0x0e] = SSE_SPECIAL, /* femms */ - [0x0f] = SSE_3DNOW, /* pf... (sse_op_table5) */ - /* pure SSE operations */ - [0x10] = SSE_SPECIAL, /* movups, movupd, movss, movsd */ - [0x11] = SSE_SPECIAL, /* movups, movupd, movss, movsd */ - [0x12] = SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */ - [0x13] = SSE_SPECIAL, /* movlps, movlpd */ - [0x14] = SSE_OP(punpckldq, punpcklqdq, op1, 0), /* unpcklps, unpcklpd */ - [0x15] = SSE_OP(punpckhdq, punpckhqdq, op1, 0), /* unpckhps, unpckhpd */ - [0x16] = SSE_SPECIAL, /* movhps, movhpd, movshdup */ - [0x17] = SSE_SPECIAL, /* movhps, movhpd */ - - [0x28] = SSE_SPECIAL, /* movaps, movapd */ - [0x29] = SSE_SPECIAL, /* movaps, movapd */ - [0x2a] = SSE_SPECIAL, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ - [0x2b] = SSE_SPECIAL, /* movntps, movntpd, movntss, movntsd */ - [0x2c] = SSE_SPECIAL, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ - [0x2d] = SSE_SPECIAL, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ - [0x2e] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR, - gen_helper_ucomiss, gen_helper_ucomisd, NULL, NULL), - [0x2f] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR, - gen_helper_comiss, gen_helper_comisd, NULL, NULL), - [0x50] = SSE_SPECIAL, /* movmskps, movmskpd */ - [0x51] = OP(op1, SSE_OPF_SCALAR, - gen_helper_sqrtps_xmm, gen_helper_sqrtpd_xmm, - gen_helper_sqrtss, gen_helper_sqrtsd), - [0x52] = OP(op1, SSE_OPF_SCALAR, - gen_helper_rsqrtps_xmm, NULL, gen_helper_rsqrtss, NULL), - [0x53] = OP(op1, SSE_OPF_SCALAR, - gen_helper_rcpps_xmm, NULL, gen_helper_rcpss, NULL), - [0x54] = SSE_OP(pand, pand, op1, 0), /* andps, andpd */ - [0x55] = SSE_OP(pandn, pandn, op1, 0), /* andnps, andnpd */ - [0x56] = SSE_OP(por, por, op1, 0), /* orps, orpd */ - [0x57] = SSE_OP(pxor, pxor, op1, 0), /* xorps, xorpd */ - [0x58] = SSE_FOP(add), - [0x59] = SSE_FOP(mul), - [0x5a] = OP(op1, SSE_OPF_SCALAR, - gen_helper_cvtps2pd_xmm, gen_helper_cvtpd2ps_xmm, - gen_helper_cvtss2sd, gen_helper_cvtsd2ss), - [0x5b] = OP(op1, 0, - gen_helper_cvtdq2ps_xmm, gen_helper_cvtps2dq_xmm, - gen_helper_cvttps2dq_xmm, NULL), - [0x5c] = SSE_FOP(sub), - [0x5d] = SSE_FOP(min), - [0x5e] = SSE_FOP(div), - [0x5f] = SSE_FOP(max), - - [0xc2] = SSE_FOP(cmpeq), /* sse_op_table4 */ - [0xc6] = SSE_OP(shufps, shufpd, op1i, SSE_OPF_SHUF), - - /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */ - [0x38] = SSE_SPECIAL, - [0x3a] = SSE_SPECIAL, - - /* MMX ops and their SSE extensions */ - [0x60] = MMX_OP(punpcklbw), - [0x61] = MMX_OP(punpcklwd), - [0x62] = MMX_OP(punpckldq), - [0x63] = MMX_OP(packsswb), - [0x64] = MMX_OP(pcmpgtb), - [0x65] = MMX_OP(pcmpgtw), - [0x66] = MMX_OP(pcmpgtl), - [0x67] = MMX_OP(packuswb), - [0x68] = MMX_OP(punpckhbw), - [0x69] = MMX_OP(punpckhwd), - [0x6a] = MMX_OP(punpckhdq), - [0x6b] = MMX_OP(packssdw), - [0x6c] = OP(op1, SSE_OPF_MMX, - NULL, gen_helper_punpcklqdq_xmm, NULL, NULL), - [0x6d] = OP(op1, SSE_OPF_MMX, - NULL, gen_helper_punpckhqdq_xmm, NULL, NULL), - [0x6e] = SSE_SPECIAL, /* movd mm, ea */ - [0x6f] = SSE_SPECIAL, /* movq, movdqa, , movqdu */ - [0x70] = OP(op1i, SSE_OPF_SHUF | SSE_OPF_MMX, - gen_helper_pshufw_mmx, gen_helper_pshufd_xmm, - gen_helper_pshufhw_xmm, gen_helper_pshuflw_xmm), - [0x71] = SSE_SPECIAL, /* shiftw */ - [0x72] = SSE_SPECIAL, /* shiftd */ - [0x73] = SSE_SPECIAL, /* shiftq */ - [0x74] = MMX_OP(pcmpeqb), - [0x75] = MMX_OP(pcmpeqw), - [0x76] = MMX_OP(pcmpeql), - [0x77] = SSE_SPECIAL, /* emms */ - [0x78] = SSE_SPECIAL, /* extrq_i, insertq_i (sse4a) */ - [0x79] = OP(op1, 0, - NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r), - [0x7c] = OP(op1, 0, - NULL, gen_helper_haddpd_xmm, NULL, gen_helper_haddps_xmm), - [0x7d] = OP(op1, 0, - NULL, gen_helper_hsubpd_xmm, NULL, gen_helper_hsubps_xmm), - [0x7e] = SSE_SPECIAL, /* movd, movd, , movq */ - [0x7f] = SSE_SPECIAL, /* movq, movdqa, movdqu */ - [0xc4] = SSE_SPECIAL, /* pinsrw */ - [0xc5] = SSE_SPECIAL, /* pextrw */ - [0xd0] = OP(op1, 0, - NULL, gen_helper_addsubpd_xmm, NULL, gen_helper_addsubps_xmm), - [0xd1] = MMX_OP(psrlw), - [0xd2] = MMX_OP(psrld), - [0xd3] = MMX_OP(psrlq), - [0xd4] = MMX_OP(paddq), - [0xd5] = MMX_OP(pmullw), - [0xd6] = SSE_SPECIAL, - [0xd7] = SSE_SPECIAL, /* pmovmskb */ - [0xd8] = MMX_OP(psubusb), - [0xd9] = MMX_OP(psubusw), - [0xda] = MMX_OP(pminub), - [0xdb] = MMX_OP(pand), - [0xdc] = MMX_OP(paddusb), - [0xdd] = MMX_OP(paddusw), - [0xde] = MMX_OP(pmaxub), - [0xdf] = MMX_OP(pandn), - [0xe0] = MMX_OP(pavgb), - [0xe1] = MMX_OP(psraw), - [0xe2] = MMX_OP(psrad), - [0xe3] = MMX_OP(pavgw), - [0xe4] = MMX_OP(pmulhuw), - [0xe5] = MMX_OP(pmulhw), - [0xe6] = OP(op1, 0, - NULL, gen_helper_cvttpd2dq_xmm, - gen_helper_cvtdq2pd_xmm, gen_helper_cvtpd2dq_xmm), - [0xe7] = SSE_SPECIAL, /* movntq, movntq */ - [0xe8] = MMX_OP(psubsb), - [0xe9] = MMX_OP(psubsw), - [0xea] = MMX_OP(pminsw), - [0xeb] = MMX_OP(por), - [0xec] = MMX_OP(paddsb), - [0xed] = MMX_OP(paddsw), - [0xee] = MMX_OP(pmaxsw), - [0xef] = MMX_OP(pxor), - [0xf0] = SSE_SPECIAL, /* lddqu */ - [0xf1] = MMX_OP(psllw), - [0xf2] = MMX_OP(pslld), - [0xf3] = MMX_OP(psllq), - [0xf4] = MMX_OP(pmuludq), - [0xf5] = MMX_OP(pmaddwd), - [0xf6] = MMX_OP(psadbw), - [0xf7] = OP(op1t, SSE_OPF_MMX, - gen_helper_maskmov_mmx, gen_helper_maskmov_xmm, NULL, NULL), - [0xf8] = MMX_OP(psubb), - [0xf9] = MMX_OP(psubw), - [0xfa] = MMX_OP(psubl), - [0xfb] = MMX_OP(psubq), - [0xfc] = MMX_OP(paddb), - [0xfd] = MMX_OP(paddw), - [0xfe] = MMX_OP(paddl), -}; -#undef MMX_OP -#undef OP -#undef SSE_FOP -#undef SSE_OP -#undef SSE_SPECIAL - -#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } - -static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = { - [0 + 2] = MMX_OP2(psrlw), - [0 + 4] = MMX_OP2(psraw), - [0 + 6] = MMX_OP2(psllw), - [8 + 2] = MMX_OP2(psrld), - [8 + 4] = MMX_OP2(psrad), - [8 + 6] = MMX_OP2(pslld), - [16 + 2] = MMX_OP2(psrlq), - [16 + 3] = { NULL, gen_helper_psrldq_xmm }, - [16 + 6] = MMX_OP2(psllq), - [16 + 7] = { NULL, gen_helper_pslldq_xmm }, -}; - -static const SSEFunc_0_epi sse_op_table3ai[] = { - gen_helper_cvtsi2ss, - gen_helper_cvtsi2sd -}; - -#ifdef TARGET_X86_64 -static const SSEFunc_0_epl sse_op_table3aq[] = { - gen_helper_cvtsq2ss, - gen_helper_cvtsq2sd -}; -#endif - -static const SSEFunc_i_ep sse_op_table3bi[] = { - gen_helper_cvttss2si, - gen_helper_cvtss2si, - gen_helper_cvttsd2si, - gen_helper_cvtsd2si -}; - -#ifdef TARGET_X86_64 -static const SSEFunc_l_ep sse_op_table3bq[] = { - gen_helper_cvttss2sq, - gen_helper_cvtss2sq, - gen_helper_cvttsd2sq, - gen_helper_cvtsd2sq -}; -#endif - -#define SSE_CMP(x) { \ - gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \ - gen_helper_ ## x ## ss, gen_helper_ ## x ## sd} -static const SSEFunc_0_epp sse_op_table4[8][4] = { - SSE_CMP(cmpeq), - SSE_CMP(cmplt), - SSE_CMP(cmple), - SSE_CMP(cmpunord), - SSE_CMP(cmpneq), - SSE_CMP(cmpnlt), - SSE_CMP(cmpnle), - SSE_CMP(cmpord), -}; -#undef SSE_CMP - -static const SSEFunc_0_epp sse_op_table5[256] = { - [0x0c] = gen_helper_pi2fw, - [0x0d] = gen_helper_pi2fd, - [0x1c] = gen_helper_pf2iw, - [0x1d] = gen_helper_pf2id, - [0x8a] = gen_helper_pfnacc, - [0x8e] = gen_helper_pfpnacc, - [0x90] = gen_helper_pfcmpge, - [0x94] = gen_helper_pfmin, - [0x96] = gen_helper_pfrcp, - [0x97] = gen_helper_pfrsqrt, - [0x9a] = gen_helper_pfsub, - [0x9e] = gen_helper_pfadd, - [0xa0] = gen_helper_pfcmpgt, - [0xa4] = gen_helper_pfmax, - [0xa6] = gen_helper_movq, /* pfrcpit1; no need to actually increase precision */ - [0xa7] = gen_helper_movq, /* pfrsqit1 */ - [0xaa] = gen_helper_pfsubr, - [0xae] = gen_helper_pfacc, - [0xb0] = gen_helper_pfcmpeq, - [0xb4] = gen_helper_pfmul, - [0xb6] = gen_helper_movq, /* pfrcpit2 */ - [0xb7] = gen_helper_pmulhrw_mmx, - [0xbb] = gen_helper_pswapd, - [0xbf] = gen_helper_pavgb_mmx, -}; - -struct SSEOpHelper_table6 { - SSEFuncs fn[2]; - uint32_t ext_mask; - int flags; -}; - -struct SSEOpHelper_table7 { - union { - SSEFunc_0_eppi op1; - } fn[2]; - uint32_t ext_mask; - int flags; -}; - -#define gen_helper_special_xmm NULL - -#define OP(name, op, flags, ext, mmx_name) \ - {{{.op = mmx_name}, {.op = gen_helper_ ## name ## _xmm} }, \ - CPUID_EXT_ ## ext, flags} -#define BINARY_OP_MMX(name, ext) \ - OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) -#define BINARY_OP(name, ext, flags) \ - OP(name, op1, flags, ext, NULL) -#define UNARY_OP_MMX(name, ext) \ - OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) -#define UNARY_OP(name, ext, flags) \ - OP(name, op1, flags, ext, NULL) -#define BLENDV_OP(name, ext, flags) OP(name, op1, 0, ext, NULL) -#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP, ext, NULL) -#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL) - -/* prefix [66] 0f 38 */ -static const struct SSEOpHelper_table6 sse_op_table6[256] = { - [0x00] = BINARY_OP_MMX(pshufb, SSSE3), - [0x01] = BINARY_OP_MMX(phaddw, SSSE3), - [0x02] = BINARY_OP_MMX(phaddd, SSSE3), - [0x03] = BINARY_OP_MMX(phaddsw, SSSE3), - [0x04] = BINARY_OP_MMX(pmaddubsw, SSSE3), - [0x05] = BINARY_OP_MMX(phsubw, SSSE3), - [0x06] = BINARY_OP_MMX(phsubd, SSSE3), - [0x07] = BINARY_OP_MMX(phsubsw, SSSE3), - [0x08] = BINARY_OP_MMX(psignb, SSSE3), - [0x09] = BINARY_OP_MMX(psignw, SSSE3), - [0x0a] = BINARY_OP_MMX(psignd, SSSE3), - [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), - [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), - [0x14] = BLENDV_OP(blendvps, SSE41, 0), - [0x15] = BLENDV_OP(blendvpd, SSE41, 0), - [0x17] = CMP_OP(ptest, SSE41), - [0x1c] = UNARY_OP_MMX(pabsb, SSSE3), - [0x1d] = UNARY_OP_MMX(pabsw, SSSE3), - [0x1e] = UNARY_OP_MMX(pabsd, SSSE3), - [0x20] = UNARY_OP(pmovsxbw, SSE41, SSE_OPF_MMX), - [0x21] = UNARY_OP(pmovsxbd, SSE41, SSE_OPF_MMX), - [0x22] = UNARY_OP(pmovsxbq, SSE41, SSE_OPF_MMX), - [0x23] = UNARY_OP(pmovsxwd, SSE41, SSE_OPF_MMX), - [0x24] = UNARY_OP(pmovsxwq, SSE41, SSE_OPF_MMX), - [0x25] = UNARY_OP(pmovsxdq, SSE41, SSE_OPF_MMX), - [0x28] = BINARY_OP(pmuldq, SSE41, SSE_OPF_MMX), - [0x29] = BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX), - [0x2a] = SPECIAL_OP(SSE41), /* movntdqa */ - [0x2b] = BINARY_OP(packusdw, SSE41, SSE_OPF_MMX), - [0x30] = UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX), - [0x31] = UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX), - [0x32] = UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX), - [0x33] = UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX), - [0x34] = UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX), - [0x35] = UNARY_OP(pmovzxdq, SSE41, SSE_OPF_MMX), - [0x37] = BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX), - [0x38] = BINARY_OP(pminsb, SSE41, SSE_OPF_MMX), - [0x39] = BINARY_OP(pminsd, SSE41, SSE_OPF_MMX), - [0x3a] = BINARY_OP(pminuw, SSE41, SSE_OPF_MMX), - [0x3b] = BINARY_OP(pminud, SSE41, SSE_OPF_MMX), - [0x3c] = BINARY_OP(pmaxsb, SSE41, SSE_OPF_MMX), - [0x3d] = BINARY_OP(pmaxsd, SSE41, SSE_OPF_MMX), - [0x3e] = BINARY_OP(pmaxuw, SSE41, SSE_OPF_MMX), - [0x3f] = BINARY_OP(pmaxud, SSE41, SSE_OPF_MMX), - [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), - [0x41] = UNARY_OP(phminposuw, SSE41, 0), - [0xdb] = UNARY_OP(aesimc, AES, 0), - [0xdc] = BINARY_OP(aesenc, AES, 0), - [0xdd] = BINARY_OP(aesenclast, AES, 0), - [0xde] = BINARY_OP(aesdec, AES, 0), - [0xdf] = BINARY_OP(aesdeclast, AES, 0), -}; - -/* prefix [66] 0f 3a */ -static const struct SSEOpHelper_table7 sse_op_table7[256] = { - [0x08] = UNARY_OP(roundps, SSE41, 0), - [0x09] = UNARY_OP(roundpd, SSE41, 0), - [0x0a] = UNARY_OP(roundss, SSE41, SSE_OPF_SCALAR), - [0x0b] = UNARY_OP(roundsd, SSE41, SSE_OPF_SCALAR), - [0x0c] = BINARY_OP(blendps, SSE41, 0), - [0x0d] = BINARY_OP(blendpd, SSE41, 0), - [0x0e] = BINARY_OP(pblendw, SSE41, SSE_OPF_MMX), - [0x0f] = BINARY_OP_MMX(palignr, SSSE3), - [0x14] = SPECIAL_OP(SSE41), /* pextrb */ - [0x15] = SPECIAL_OP(SSE41), /* pextrw */ - [0x16] = SPECIAL_OP(SSE41), /* pextrd/pextrq */ - [0x17] = SPECIAL_OP(SSE41), /* extractps */ - [0x20] = SPECIAL_OP(SSE41), /* pinsrb */ - [0x21] = SPECIAL_OP(SSE41), /* insertps */ - [0x22] = SPECIAL_OP(SSE41), /* pinsrd/pinsrq */ - [0x40] = BINARY_OP(dpps, SSE41, 0), - [0x41] = BINARY_OP(dppd, SSE41, 0), - [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX), - [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0), - [0x60] = CMP_OP(pcmpestrm, SSE42), - [0x61] = CMP_OP(pcmpestri, SSE42), - [0x62] = CMP_OP(pcmpistrm, SSE42), - [0x63] = CMP_OP(pcmpistri, SSE42), - [0xdf] = UNARY_OP(aeskeygenassist, AES, 0), -}; - -#undef OP -#undef BINARY_OP_MMX -#undef BINARY_OP -#undef UNARY_OP_MMX -#undef UNARY_OP -#undef BLENDV_OP -#undef SPECIAL_OP - -/* VEX prefix not allowed */ -#define CHECK_NO_VEX(s) do { \ - if (s->prefix & PREFIX_VEX) \ - goto illegal_op; \ - } while (0) - -static void gen_sse(CPUX86State *env, DisasContext *s, int b) -{ - int b1, op1_offset, op2_offset, is_xmm, val; - int modrm, mod, rm, reg; - int sse_op_flags; - SSEFuncs sse_op_fn; - const struct SSEOpHelper_table6 *op6; - const struct SSEOpHelper_table7 *op7; - MemOp ot; - - b &= 0xff; - if (s->prefix & PREFIX_DATA) - b1 = 1; - else if (s->prefix & PREFIX_REPZ) - b1 = 2; - else if (s->prefix & PREFIX_REPNZ) - b1 = 3; - else - b1 = 0; - sse_op_flags = sse_op_table1[b].flags; - sse_op_fn = sse_op_table1[b].fn[b1]; - if ((sse_op_flags & (SSE_OPF_SPECIAL | SSE_OPF_3DNOW)) == 0 - && !sse_op_fn.op1) { - goto unknown_op; - } - if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) { - is_xmm = 1; - } else { - if (b1 == 0) { - /* MMX case */ - is_xmm = 0; - } else { - is_xmm = 1; - } - } - if (sse_op_flags & SSE_OPF_3DNOW) { - if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { - goto illegal_op; - } - } - /* simple MMX/SSE operation */ - if (s->flags & HF_TS_MASK) { - gen_exception(s, EXCP07_PREX); - return; - } - if (s->flags & HF_EM_MASK) { - illegal_op: - gen_illegal_opcode(s); - return; - } - if (is_xmm - && !(s->flags & HF_OSFXSR_MASK) - && (b != 0x38 && b != 0x3a)) { - goto unknown_op; - } - if (b == 0x0e) { - if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { - /* If we were fully decoding this we might use illegal_op. */ - goto unknown_op; - } - /* femms */ - gen_helper_emms(cpu_env); - return; - } - if (b == 0x77) { - /* emms */ - gen_helper_emms(cpu_env); - return; - } - /* prepare MMX state (XXX: optimize by storing fptt and fptags in - the static cpu state) */ - if (!is_xmm) { - gen_helper_enter_mmx(cpu_env); - } - - modrm = x86_ldub_code(env, s); - reg = ((modrm >> 3) & 7); - if (is_xmm) { - reg |= REX_R(s); - } - mod = (modrm >> 6) & 3; - if (sse_op_flags & SSE_OPF_SPECIAL) { - b |= (b1 << 8); - switch(b) { - case 0x0e7: /* movntq */ - CHECK_NO_VEX(s); - if (mod == 3) { - goto illegal_op; - } - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); - break; - case 0x1e7: /* movntdq */ - case 0x02b: /* movntps */ - case 0x12b: /* movntpd */ - if (mod == 3) - goto illegal_op; - gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, ZMM_OFFSET(reg), true); - break; - case 0x3f0: /* lddqu */ - if (mod == 3) - goto illegal_op; - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, ZMM_OFFSET(reg), false); - break; - case 0x22b: /* movntss */ - case 0x32b: /* movntsd */ - if (mod == 3) - goto illegal_op; - gen_lea_modrm(env, s, modrm); - if (b1 & 1) { - gen_stq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(0))); - gen_op_st_v(s, MO_32, s->T0, s->A0); - } - break; - case 0x6e: /* movd mm, ea */ - CHECK_NO_VEX(s); -#ifdef TARGET_X86_64 - if (s->dflag == MO_64) { - gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); - tcg_gen_st_tl(s->T0, cpu_env, - offsetof(CPUX86State, fpregs[reg].mmx)); - } else -#endif - { - gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx)); - tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); - gen_helper_movl_mm_T0_mmx(s->ptr0, s->tmp2_i32); - } - break; - case 0x16e: /* movd xmm, ea */ -#ifdef TARGET_X86_64 - if (s->dflag == MO_64) { - gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); - gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0); - } else -#endif - { - gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); - tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); - gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32); - } - break; - case 0x6f: /* movq mm, ea */ - CHECK_NO_VEX(s); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); - } else { - rm = (modrm & 7); - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, - offsetof(CPUX86State,fpregs[rm].mmx)); - tcg_gen_st_i64(s->tmp1_i64, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx)); - } - break; - case 0x010: /* movups */ - case 0x110: /* movupd */ - case 0x028: /* movaps */ - case 0x128: /* movapd */ - case 0x16f: /* movdqa xmm, ea */ - case 0x26f: /* movdqu xmm, ea */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, ZMM_OFFSET(reg), - /* movaps, movapd, movdqa */ - b == 0x028 || b == 0x128 || b == 0x16f); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm)); - } - break; - case 0x210: /* movss xmm, ea */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_op_ld_v(s, MO_32, s->T0, s->A0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); - tcg_gen_movi_tl(s->T0, 0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1))); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2))); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3))); - } else { - rm = (modrm & 7) | REX_B(s); - tcg_gen_ld_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0))); - tcg_gen_st_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); - } - break; - case 0x310: /* movsd xmm, ea */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - tcg_gen_movi_tl(s->T0, 0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2))); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3))); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); - } - break; - case 0x012: /* movlps */ - case 0x112: /* movlpd */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - /* movhlps */ - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(1))); - } - break; - case 0x212: /* movsldup */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, ZMM_OFFSET(reg), true); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(0))); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(2))); - } - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0))); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(2))); - break; - case 0x312: /* movddup */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); - } - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); - break; - case 0x016: /* movhps */ - case 0x116: /* movhpd */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(1))); - } else { - /* movlhps */ - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); - } - break; - case 0x216: /* movshdup */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, ZMM_OFFSET(reg), true); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(1))); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(3))); - } - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(1))); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(3))); - break; - case 0x178: - case 0x378: - CHECK_NO_VEX(s); - { - int bit_index, field_length; - - if (b1 == 1 && reg != 0) - goto illegal_op; - field_length = x86_ldub_code(env, s) & 0x3F; - bit_index = x86_ldub_code(env, s) & 0x3F; - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); - if (b1 == 1) - gen_helper_extrq_i(cpu_env, s->ptr0, - tcg_const_i32(bit_index), - tcg_const_i32(field_length)); - else { - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - op2_offset = offsetof(CPUX86State, xmm_t0); - gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_D(0))); - } else { - rm = (modrm & 7) | REX_B(s); - op2_offset = ZMM_OFFSET(rm); - } - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - gen_helper_insertq_i(cpu_env, s->ptr0, s->ptr1, - tcg_const_i32(bit_index), - tcg_const_i32(field_length)); - } - } - break; - case 0x7e: /* movd ea, mm */ - CHECK_NO_VEX(s); -#ifdef TARGET_X86_64 - if (s->dflag == MO_64) { - tcg_gen_ld_i64(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx)); - gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 1); - } else -#endif - { - tcg_gen_ld32u_tl(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx.MMX_L(0))); - gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 1); - } - break; - case 0x17e: /* movd ea, xmm */ -#ifdef TARGET_X86_64 - if (s->dflag == MO_64) { - tcg_gen_ld_i64(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); - gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 1); - } else -#endif - { - tcg_gen_ld32u_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0))); - gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 1); - } - break; - case 0x27e: /* movq xmm, ea */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); - } - gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); - break; - case 0x7f: /* movq ea, mm */ - CHECK_NO_VEX(s); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); - } else { - rm = (modrm & 7); - gen_op_movq(s, offsetof(CPUX86State, fpregs[rm].mmx), - offsetof(CPUX86State,fpregs[reg].mmx)); - } - break; - case 0x011: /* movups */ - case 0x111: /* movupd */ - case 0x029: /* movaps */ - case 0x129: /* movapd */ - case 0x17f: /* movdqa ea, xmm */ - case 0x27f: /* movdqu ea, xmm */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, ZMM_OFFSET(reg), - /* movaps, movapd, movdqa */ - b == 0x029 || b == 0x129 || b == 0x17f); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movo(s, ZMM_OFFSET(rm), ZMM_OFFSET(reg)); - } - break; - case 0x211: /* movss ea, xmm */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - tcg_gen_ld32u_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); - gen_op_st_v(s, MO_32, s->T0, s->A0); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0))); - } - break; - case 0x311: /* movsd ea, xmm */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); - } - break; - case 0x013: /* movlps */ - case 0x113: /* movlpd */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - goto illegal_op; - } - break; - case 0x017: /* movhps */ - case 0x117: /* movhpd */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(1))); - } else { - goto illegal_op; - } - break; - case 0x71: /* shift mm, im */ - case 0x72: - case 0x73: - case 0x171: /* shift xmm, im */ - case 0x172: - case 0x173: - val = x86_ldub_code(env, s); - if (is_xmm) { - tcg_gen_movi_tl(s->T0, val); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_t0.ZMM_L(0))); - tcg_gen_movi_tl(s->T0, 0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_t0.ZMM_L(1))); - op1_offset = offsetof(CPUX86State,xmm_t0); - } else { - CHECK_NO_VEX(s); - tcg_gen_movi_tl(s->T0, val); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, mmx_t0.MMX_L(0))); - tcg_gen_movi_tl(s->T0, 0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, mmx_t0.MMX_L(1))); - op1_offset = offsetof(CPUX86State,mmx_t0); - } - assert(b1 < 2); - SSEFunc_0_epp fn = sse_op_table2[((b - 1) & 3) * 8 + - (((modrm >> 3)) & 7)][b1]; - if (!fn) { - goto unknown_op; - } - if (is_xmm) { - rm = (modrm & 7) | REX_B(s); - op2_offset = ZMM_OFFSET(rm); - } else { - rm = (modrm & 7); - op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op1_offset); - fn(cpu_env, s->ptr0, s->ptr1); - break; - case 0x050: /* movmskps */ - rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); - gen_helper_movmskps_xmm(s->tmp2_i32, cpu_env, s->ptr0); - tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); - break; - case 0x150: /* movmskpd */ - rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); - gen_helper_movmskpd_xmm(s->tmp2_i32, cpu_env, s->ptr0); - tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); - break; - case 0x02a: /* cvtpi2ps */ - case 0x12a: /* cvtpi2pd */ - CHECK_NO_VEX(s); - gen_helper_enter_mmx(cpu_env); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - op2_offset = offsetof(CPUX86State,mmx_t0); - gen_ldq_env_A0(s, op2_offset); - } else { - rm = (modrm & 7); - op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); - } - op1_offset = ZMM_OFFSET(reg); - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - switch(b >> 8) { - case 0x0: - gen_helper_cvtpi2ps(cpu_env, s->ptr0, s->ptr1); - break; - default: - case 0x1: - gen_helper_cvtpi2pd(cpu_env, s->ptr0, s->ptr1); - break; - } - break; - case 0x22a: /* cvtsi2ss */ - case 0x32a: /* cvtsi2sd */ - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - op1_offset = ZMM_OFFSET(reg); - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - if (ot == MO_32) { - SSEFunc_0_epi sse_fn_epi = sse_op_table3ai[(b >> 8) & 1]; - tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); - sse_fn_epi(cpu_env, s->ptr0, s->tmp2_i32); - } else { -#ifdef TARGET_X86_64 - SSEFunc_0_epl sse_fn_epl = sse_op_table3aq[(b >> 8) & 1]; - sse_fn_epl(cpu_env, s->ptr0, s->T0); -#else - goto illegal_op; -#endif - } - break; - case 0x02c: /* cvttps2pi */ - case 0x12c: /* cvttpd2pi */ - case 0x02d: /* cvtps2pi */ - case 0x12d: /* cvtpd2pi */ - CHECK_NO_VEX(s); - gen_helper_enter_mmx(cpu_env); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - op2_offset = offsetof(CPUX86State,xmm_t0); - /* FIXME: should be 64-bit access if b1 == 0. */ - gen_ldo_env_A0(s, op2_offset, !!b1); - } else { - rm = (modrm & 7) | REX_B(s); - op2_offset = ZMM_OFFSET(rm); - } - op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx); - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - switch(b) { - case 0x02c: - gen_helper_cvttps2pi(cpu_env, s->ptr0, s->ptr1); - break; - case 0x12c: - gen_helper_cvttpd2pi(cpu_env, s->ptr0, s->ptr1); - break; - case 0x02d: - gen_helper_cvtps2pi(cpu_env, s->ptr0, s->ptr1); - break; - case 0x12d: - gen_helper_cvtpd2pi(cpu_env, s->ptr0, s->ptr1); - break; - } - break; - case 0x22c: /* cvttss2si */ - case 0x32c: /* cvttsd2si */ - case 0x22d: /* cvtss2si */ - case 0x32d: /* cvtsd2si */ - ot = mo_64_32(s->dflag); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - if ((b >> 8) & 1) { - gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_Q(0))); - } else { - gen_op_ld_v(s, MO_32, s->T0, s->A0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_t0.ZMM_L(0))); - } - op2_offset = offsetof(CPUX86State,xmm_t0); - } else { - rm = (modrm & 7) | REX_B(s); - op2_offset = ZMM_OFFSET(rm); - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); - if (ot == MO_32) { - SSEFunc_i_ep sse_fn_i_ep = - sse_op_table3bi[((b >> 7) & 2) | (b & 1)]; - sse_fn_i_ep(s->tmp2_i32, cpu_env, s->ptr0); - tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32); - } else { -#ifdef TARGET_X86_64 - SSEFunc_l_ep sse_fn_l_ep = - sse_op_table3bq[((b >> 7) & 2) | (b & 1)]; - sse_fn_l_ep(s->T0, cpu_env, s->ptr0); -#else - goto illegal_op; -#endif - } - gen_op_mov_reg_v(s, ot, reg, s->T0); - break; - case 0xc4: /* pinsrw */ - case 0x1c4: - s->rip_offset = 1; - gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0); - val = x86_ldub_code(env, s); - if (b1) { - val &= 7; - tcg_gen_st16_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_W(val))); - } else { - CHECK_NO_VEX(s); - val &= 3; - tcg_gen_st16_tl(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val))); - } - break; - case 0xc5: /* pextrw */ - case 0x1c5: - if (mod != 3) - goto illegal_op; - ot = mo_64_32(s->dflag); - val = x86_ldub_code(env, s); - if (b1) { - val &= 7; - rm = (modrm & 7) | REX_B(s); - tcg_gen_ld16u_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm].ZMM_W(val))); - } else { - val &= 3; - rm = (modrm & 7); - tcg_gen_ld16u_tl(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[rm].mmx.MMX_W(val))); - } - reg = ((modrm >> 3) & 7) | REX_R(s); - gen_op_mov_reg_v(s, ot, reg, s->T0); - break; - case 0x1d6: /* movq ea, xmm */ - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - gen_stq_env_A0(s, offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(0))); - } else { - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); - gen_op_movq_env_0(s, - offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(1))); - } - break; - case 0x2d6: /* movq2dq */ - CHECK_NO_VEX(s); - gen_helper_enter_mmx(cpu_env); - rm = (modrm & 7); - gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State,fpregs[rm].mmx)); - gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); - break; - case 0x3d6: /* movdq2q */ - CHECK_NO_VEX(s); - gen_helper_enter_mmx(cpu_env); - rm = (modrm & 7) | REX_B(s); - gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); - break; - case 0xd7: /* pmovmskb */ - case 0x1d7: - if (mod != 3) - goto illegal_op; - if (b1) { - rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); - gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0); - } else { - CHECK_NO_VEX(s); - rm = (modrm & 7); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State, fpregs[rm].mmx)); - gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr0); - } - reg = ((modrm >> 3) & 7) | REX_R(s); - tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); - break; - - case 0x138: - case 0x038: - b = modrm; - if ((b & 0xf0) == 0xf0) { - goto do_0f_38_fx; - } - modrm = x86_ldub_code(env, s); - rm = modrm & 7; - reg = ((modrm >> 3) & 7) | REX_R(s); - mod = (modrm >> 6) & 3; - - assert(b1 < 2); - op6 = &sse_op_table6[b]; - if (op6->ext_mask == 0) { - goto unknown_op; - } - if (!(s->cpuid_ext_features & op6->ext_mask)) { - goto illegal_op; - } - - if (b1) { - op1_offset = ZMM_OFFSET(reg); - if (mod == 3) { - op2_offset = ZMM_OFFSET(rm | REX_B(s)); - } else { - op2_offset = offsetof(CPUX86State,xmm_t0); - gen_lea_modrm(env, s, modrm); - switch (b) { - case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ - case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ - case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ - gen_ldq_env_A0(s, op2_offset + - offsetof(ZMMReg, ZMM_Q(0))); - break; - case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ - case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ - tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, - s->mem_index, MO_LEUL); - tcg_gen_st_i32(s->tmp2_i32, cpu_env, op2_offset + - offsetof(ZMMReg, ZMM_L(0))); - break; - case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ - tcg_gen_qemu_ld_tl(s->tmp0, s->A0, - s->mem_index, MO_LEUW); - tcg_gen_st16_tl(s->tmp0, cpu_env, op2_offset + - offsetof(ZMMReg, ZMM_W(0))); - break; - case 0x2a: /* movntdqa */ - gen_ldo_env_A0(s, op1_offset, true); - return; - default: - gen_ldo_env_A0(s, op2_offset, true); - } - } - if (!op6->fn[b1].op1) { - goto illegal_op; - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - op6->fn[b1].op1(cpu_env, s->ptr0, s->ptr1); - } else { - CHECK_NO_VEX(s); - if ((op6->flags & SSE_OPF_MMX) == 0) { - goto unknown_op; - } - op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); - if (mod == 3) { - op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); - } else { - op2_offset = offsetof(CPUX86State,mmx_t0); - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, op2_offset); - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - op6->fn[0].op1(cpu_env, s->ptr0, s->ptr1); - } - - if (op6->flags & SSE_OPF_CMP) { - set_cc_op(s, CC_OP_EFLAGS); - } - break; - - case 0x238: - case 0x338: - do_0f_38_fx: - /* Various integer extensions at 0f 38 f[0-f]. */ - b = modrm | (b1 << 8); - modrm = x86_ldub_code(env, s); - reg = ((modrm >> 3) & 7) | REX_R(s); - - switch (b) { - case 0x3f0: /* crc32 Gd,Eb */ - case 0x3f1: /* crc32 Gd,Ey */ - do_crc32: - CHECK_NO_VEX(s); - if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) { - goto illegal_op; - } - if ((b & 0xff) == 0xf0) { - ot = MO_8; - } else if (s->dflag != MO_64) { - ot = (s->prefix & PREFIX_DATA ? MO_16 : MO_32); - } else { - ot = MO_64; - } - - tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[reg]); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - gen_helper_crc32(s->T0, s->tmp2_i32, - s->T0, tcg_const_i32(8 << ot)); - - ot = mo_64_32(s->dflag); - gen_op_mov_reg_v(s, ot, reg, s->T0); - break; - - case 0x1f0: /* crc32 or movbe */ - case 0x1f1: - CHECK_NO_VEX(s); - /* For these insns, the f3 prefix is supposed to have priority - over the 66 prefix, but that's not what we implement above - setting b1. */ - if (s->prefix & PREFIX_REPNZ) { - goto do_crc32; - } - /* FALLTHRU */ - case 0x0f0: /* movbe Gy,My */ - case 0x0f1: /* movbe My,Gy */ - CHECK_NO_VEX(s); - if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) { - goto illegal_op; - } - if (s->dflag != MO_64) { - ot = (s->prefix & PREFIX_DATA ? MO_16 : MO_32); - } else { - ot = MO_64; - } - - gen_lea_modrm(env, s, modrm); - if ((b & 1) == 0) { - tcg_gen_qemu_ld_tl(s->T0, s->A0, - s->mem_index, ot | MO_BE); - gen_op_mov_reg_v(s, ot, reg, s->T0); - } else { - tcg_gen_qemu_st_tl(cpu_regs[reg], s->A0, - s->mem_index, ot | MO_BE); - } - break; - - case 0x0f2: /* andn Gy, By, Ey */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - tcg_gen_andc_tl(s->T0, s->T0, cpu_regs[s->vex_v]); - gen_op_mov_reg_v(s, ot, reg, s->T0); - gen_op_update1_cc(s); - set_cc_op(s, CC_OP_LOGICB + ot); - break; - - case 0x0f7: /* bextr Gy, Ey, By */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - { - TCGv bound, zero; - - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - /* Extract START, and shift the operand. - Shifts larger than operand size get zeros. */ - tcg_gen_ext8u_tl(s->A0, cpu_regs[s->vex_v]); - tcg_gen_shr_tl(s->T0, s->T0, s->A0); - - bound = tcg_const_tl(ot == MO_64 ? 63 : 31); - zero = tcg_const_tl(0); - tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, - s->T0, zero); - tcg_temp_free(zero); - - /* Extract the LEN into a mask. Lengths larger than - operand size get all ones. */ - tcg_gen_extract_tl(s->A0, cpu_regs[s->vex_v], 8, 8); - tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->A0, bound, - s->A0, bound); - tcg_temp_free(bound); - tcg_gen_movi_tl(s->T1, 1); - tcg_gen_shl_tl(s->T1, s->T1, s->A0); - tcg_gen_subi_tl(s->T1, s->T1, 1); - tcg_gen_and_tl(s->T0, s->T0, s->T1); - - gen_op_mov_reg_v(s, ot, reg, s->T0); - gen_op_update1_cc(s); - set_cc_op(s, CC_OP_LOGICB + ot); - } - break; - - case 0x0f5: /* bzhi Gy, Ey, By */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - tcg_gen_ext8u_tl(s->T1, cpu_regs[s->vex_v]); - { - TCGv bound = tcg_const_tl(ot == MO_64 ? 63 : 31); - /* Note that since we're using BMILG (in order to get O - cleared) we need to store the inverse into C. */ - tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src, - s->T1, bound); - tcg_gen_movcond_tl(TCG_COND_GT, s->T1, s->T1, - bound, bound, s->T1); - tcg_temp_free(bound); - } - tcg_gen_movi_tl(s->A0, -1); - tcg_gen_shl_tl(s->A0, s->A0, s->T1); - tcg_gen_andc_tl(s->T0, s->T0, s->A0); - gen_op_mov_reg_v(s, ot, reg, s->T0); - gen_op_update1_cc(s); - set_cc_op(s, CC_OP_BMILGB + ot); - break; - - case 0x3f6: /* mulx By, Gy, rdx, Ey */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - switch (ot) { - default: - tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); - tcg_gen_trunc_tl_i32(s->tmp3_i32, cpu_regs[R_EDX]); - tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32, - s->tmp2_i32, s->tmp3_i32); - tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32); - tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp3_i32); - break; -#ifdef TARGET_X86_64 - case MO_64: - tcg_gen_mulu2_i64(s->T0, s->T1, - s->T0, cpu_regs[R_EDX]); - tcg_gen_mov_i64(cpu_regs[s->vex_v], s->T0); - tcg_gen_mov_i64(cpu_regs[reg], s->T1); - break; -#endif - } - break; - - case 0x3f5: /* pdep Gy, By, Ey */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - /* Note that by zero-extending the source operand, we - automatically handle zero-extending the result. */ - if (ot == MO_64) { - tcg_gen_mov_tl(s->T1, cpu_regs[s->vex_v]); - } else { - tcg_gen_ext32u_tl(s->T1, cpu_regs[s->vex_v]); - } - gen_helper_pdep(cpu_regs[reg], s->T1, s->T0); - break; - - case 0x2f5: /* pext Gy, By, Ey */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - /* Note that by zero-extending the source operand, we - automatically handle zero-extending the result. */ - if (ot == MO_64) { - tcg_gen_mov_tl(s->T1, cpu_regs[s->vex_v]); - } else { - tcg_gen_ext32u_tl(s->T1, cpu_regs[s->vex_v]); - } - gen_helper_pext(cpu_regs[reg], s->T1, s->T0); - break; - - case 0x1f6: /* adcx Gy, Ey */ - case 0x2f6: /* adox Gy, Ey */ - CHECK_NO_VEX(s); - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) { - goto illegal_op; - } else { - TCGv carry_in, carry_out, zero; - int end_op; - - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - - /* Re-use the carry-out from a previous round. */ - carry_in = NULL; - carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2); - switch (s->cc_op) { - case CC_OP_ADCX: - if (b == 0x1f6) { - carry_in = cpu_cc_dst; - end_op = CC_OP_ADCX; - } else { - end_op = CC_OP_ADCOX; - } - break; - case CC_OP_ADOX: - if (b == 0x1f6) { - end_op = CC_OP_ADCOX; - } else { - carry_in = cpu_cc_src2; - end_op = CC_OP_ADOX; - } - break; - case CC_OP_ADCOX: - end_op = CC_OP_ADCOX; - carry_in = carry_out; - break; - default: - end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX); - break; - } - /* If we can't reuse carry-out, get it out of EFLAGS. */ - if (!carry_in) { - if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) { - gen_compute_eflags(s); - } - carry_in = s->tmp0; - tcg_gen_extract_tl(carry_in, cpu_cc_src, - ctz32(b == 0x1f6 ? CC_C : CC_O), 1); - } - - switch (ot) { -#ifdef TARGET_X86_64 - case MO_32: - /* If we know TL is 64-bit, and we want a 32-bit - result, just do everything in 64-bit arithmetic. */ - tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]); - tcg_gen_ext32u_i64(s->T0, s->T0); - tcg_gen_add_i64(s->T0, s->T0, cpu_regs[reg]); - tcg_gen_add_i64(s->T0, s->T0, carry_in); - tcg_gen_ext32u_i64(cpu_regs[reg], s->T0); - tcg_gen_shri_i64(carry_out, s->T0, 32); - break; -#endif - default: - /* Otherwise compute the carry-out in two steps. */ - zero = tcg_const_tl(0); - tcg_gen_add2_tl(s->T0, carry_out, - s->T0, zero, - carry_in, zero); - tcg_gen_add2_tl(cpu_regs[reg], carry_out, - cpu_regs[reg], carry_out, - s->T0, zero); - tcg_temp_free(zero); - break; - } - set_cc_op(s, end_op); - } - break; - - case 0x1f7: /* shlx Gy, Ey, By */ - case 0x2f7: /* sarx Gy, Ey, By */ - case 0x3f7: /* shrx Gy, Ey, By */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - if (ot == MO_64) { - tcg_gen_andi_tl(s->T1, cpu_regs[s->vex_v], 63); - } else { - tcg_gen_andi_tl(s->T1, cpu_regs[s->vex_v], 31); - } - if (b == 0x1f7) { - tcg_gen_shl_tl(s->T0, s->T0, s->T1); - } else if (b == 0x2f7) { - if (ot != MO_64) { - tcg_gen_ext32s_tl(s->T0, s->T0); - } - tcg_gen_sar_tl(s->T0, s->T0, s->T1); - } else { - if (ot != MO_64) { - tcg_gen_ext32u_tl(s->T0, s->T0); - } - tcg_gen_shr_tl(s->T0, s->T0, s->T1); - } - gen_op_mov_reg_v(s, ot, reg, s->T0); - break; - - case 0x0f3: - case 0x1f3: - case 0x2f3: - case 0x3f3: /* Group 17 */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - - tcg_gen_mov_tl(cpu_cc_src, s->T0); - switch (reg & 7) { - case 1: /* blsr By,Ey */ - tcg_gen_subi_tl(s->T1, s->T0, 1); - tcg_gen_and_tl(s->T0, s->T0, s->T1); - break; - case 2: /* blsmsk By,Ey */ - tcg_gen_subi_tl(s->T1, s->T0, 1); - tcg_gen_xor_tl(s->T0, s->T0, s->T1); - break; - case 3: /* blsi By, Ey */ - tcg_gen_neg_tl(s->T1, s->T0); - tcg_gen_and_tl(s->T0, s->T0, s->T1); - break; - default: - goto unknown_op; - } - tcg_gen_mov_tl(cpu_cc_dst, s->T0); - gen_op_mov_reg_v(s, ot, s->vex_v, s->T0); - set_cc_op(s, CC_OP_BMILGB + ot); - break; - - default: - goto unknown_op; - } - break; - - case 0x03a: - case 0x13a: - b = modrm; - modrm = x86_ldub_code(env, s); - rm = modrm & 7; - reg = ((modrm >> 3) & 7) | REX_R(s); - mod = (modrm >> 6) & 3; - - assert(b1 < 2); - op7 = &sse_op_table7[b]; - if (op7->ext_mask == 0) { - goto unknown_op; - } - if (!(s->cpuid_ext_features & op7->ext_mask)) { - goto illegal_op; - } - - s->rip_offset = 1; - - if (op7->flags & SSE_OPF_SPECIAL) { - /* None of the "special" ops are valid on mmx registers */ - if (b1 == 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - rm = (modrm & 7) | REX_B(s); - if (mod != 3) - gen_lea_modrm(env, s, modrm); - reg = ((modrm >> 3) & 7) | REX_R(s); - val = x86_ldub_code(env, s); - switch (b) { - case 0x14: /* pextrb */ - tcg_gen_ld8u_tl(s->T0, cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_B(val & 15))); - if (mod == 3) { - gen_op_mov_reg_v(s, ot, rm, s->T0); - } else { - tcg_gen_qemu_st_tl(s->T0, s->A0, - s->mem_index, MO_UB); - } - break; - case 0x15: /* pextrw */ - tcg_gen_ld16u_tl(s->T0, cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_W(val & 7))); - if (mod == 3) { - gen_op_mov_reg_v(s, ot, rm, s->T0); - } else { - tcg_gen_qemu_st_tl(s->T0, s->A0, - s->mem_index, MO_LEUW); - } - break; - case 0x16: - if (ot == MO_32) { /* pextrd */ - tcg_gen_ld_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(val & 3))); - if (mod == 3) { - tcg_gen_extu_i32_tl(cpu_regs[rm], s->tmp2_i32); - } else { - tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, - s->mem_index, MO_LEUL); - } - } else { /* pextrq */ -#ifdef TARGET_X86_64 - tcg_gen_ld_i64(s->tmp1_i64, cpu_env, - offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(val & 1))); - if (mod == 3) { - tcg_gen_mov_i64(cpu_regs[rm], s->tmp1_i64); - } else { - tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, - s->mem_index, MO_LEUQ); - } -#else - goto illegal_op; -#endif - } - break; - case 0x17: /* extractps */ - tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(val & 3))); - if (mod == 3) { - gen_op_mov_reg_v(s, ot, rm, s->T0); - } else { - tcg_gen_qemu_st_tl(s->T0, s->A0, - s->mem_index, MO_LEUL); - } - break; - case 0x20: /* pinsrb */ - if (mod == 3) { - gen_op_mov_v_reg(s, MO_32, s->T0, rm); - } else { - tcg_gen_qemu_ld_tl(s->T0, s->A0, - s->mem_index, MO_UB); - } - tcg_gen_st8_tl(s->T0, cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_B(val & 15))); - break; - case 0x21: /* insertps */ - if (mod == 3) { - tcg_gen_ld_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State,xmm_regs[rm] - .ZMM_L((val >> 6) & 3))); - } else { - tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, - s->mem_index, MO_LEUL); - } - tcg_gen_st_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State,xmm_regs[reg] - .ZMM_L((val >> 4) & 3))); - if ((val >> 0) & 1) - tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), - cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(0))); - if ((val >> 1) & 1) - tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), - cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(1))); - if ((val >> 2) & 1) - tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), - cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(2))); - if ((val >> 3) & 1) - tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), - cpu_env, offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(3))); - break; - case 0x22: - if (ot == MO_32) { /* pinsrd */ - if (mod == 3) { - tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[rm]); - } else { - tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, - s->mem_index, MO_LEUL); - } - tcg_gen_st_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State, - xmm_regs[reg].ZMM_L(val & 3))); - } else { /* pinsrq */ -#ifdef TARGET_X86_64 - if (mod == 3) { - gen_op_mov_v_reg(s, ot, s->tmp1_i64, rm); - } else { - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, - s->mem_index, MO_LEUQ); - } - tcg_gen_st_i64(s->tmp1_i64, cpu_env, - offsetof(CPUX86State, - xmm_regs[reg].ZMM_Q(val & 1))); -#else - goto illegal_op; -#endif - } - break; - } - return; - } - - if (b1 == 0) { - CHECK_NO_VEX(s); - /* MMX */ - if ((op7->flags & SSE_OPF_MMX) == 0) { - goto illegal_op; - } - op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); - if (mod == 3) { - op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); - } else { - op2_offset = offsetof(CPUX86State,mmx_t0); - gen_lea_modrm(env, s, modrm); - gen_ldq_env_A0(s, op2_offset); - } - val = x86_ldub_code(env, s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - - /* We only actually have one MMX instuction (palignr) */ - assert(b == 0x0f); - - op7->fn[0].op1(cpu_env, s->ptr0, s->ptr1, - tcg_const_i32(val)); - break; - } - - /* SSE */ - op1_offset = ZMM_OFFSET(reg); - if (mod == 3) { - op2_offset = ZMM_OFFSET(rm | REX_B(s)); - } else { - op2_offset = offsetof(CPUX86State, xmm_t0); - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, op2_offset, true); - } - - val = x86_ldub_code(env, s); - if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ - set_cc_op(s, CC_OP_EFLAGS); - - if (s->dflag == MO_64) { - /* The helper must use entire 64-bit gp registers */ - val |= 1 << 8; - } - } - - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - op7->fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val)); - if (op7->flags & SSE_OPF_CMP) { - set_cc_op(s, CC_OP_EFLAGS); - } - break; - - case 0x33a: - /* Various integer extensions at 0f 3a f[0-f]. */ - b = modrm | (b1 << 8); - modrm = x86_ldub_code(env, s); - reg = ((modrm >> 3) & 7) | REX_R(s); - - switch (b) { - case 0x3f0: /* rorx Gy,Ey, Ib */ - if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2) - || !(s->prefix & PREFIX_VEX) - || s->vex_l != 0) { - goto illegal_op; - } - ot = mo_64_32(s->dflag); - gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - b = x86_ldub_code(env, s); - if (ot == MO_64) { - tcg_gen_rotri_tl(s->T0, s->T0, b & 63); - } else { - tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); - tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b & 31); - tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32); - } - gen_op_mov_reg_v(s, ot, reg, s->T0); - break; - - default: - goto unknown_op; - } - break; - - default: - unknown_op: - gen_unknown_opcode(env, s); - return; - } - } else { - /* generic MMX or SSE operation */ - switch(b) { - case 0x70: /* pshufx insn */ - case 0xc6: /* pshufx insn */ - case 0xc2: /* compare insns */ - s->rip_offset = 1; - break; - default: - break; - } - if (is_xmm) { - op1_offset = ZMM_OFFSET(reg); - if (mod != 3) { - int sz = 4; - - gen_lea_modrm(env, s, modrm); - op2_offset = offsetof(CPUX86State, xmm_t0); - - if (sse_op_flags & SSE_OPF_SCALAR) { - if (sse_op_flags & SSE_OPF_CMP) { - /* ucomis[sd], comis[sd] */ - if (b1 == 0) { - sz = 2; - } else { - sz = 3; - } - } else { - /* Most sse scalar operations. */ - if (b1 == 2) { - sz = 2; - } else if (b1 == 3) { - sz = 3; - } - } - } - - switch (sz) { - case 2: - /* 32 bit access */ - gen_op_ld_v(s, MO_32, s->T0, s->A0); - tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State, xmm_t0.ZMM_L(0))); - break; - case 3: - /* 64 bit access */ - gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_D(0))); - break; - default: - /* 128 bit access */ - gen_ldo_env_A0(s, op2_offset, true); - break; - } - } else { - rm = (modrm & 7) | REX_B(s); - op2_offset = ZMM_OFFSET(rm); - } - } else { - CHECK_NO_VEX(s); - op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); - if (mod != 3) { - gen_lea_modrm(env, s, modrm); - op2_offset = offsetof(CPUX86State,mmx_t0); - gen_ldq_env_A0(s, op2_offset); - } else { - rm = (modrm & 7); - op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); - } - if (sse_op_flags & SSE_OPF_3DNOW) { - /* 3DNow! data insns */ - val = x86_ldub_code(env, s); - SSEFunc_0_epp op_3dnow = sse_op_table5[val]; - if (!op_3dnow) { - goto unknown_op; - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - op_3dnow(cpu_env, s->ptr0, s->ptr1); - return; - } - } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - if (sse_op_flags & SSE_OPF_SHUF) { - val = x86_ldub_code(env, s); - sse_op_fn.op1i(s->ptr0, s->ptr1, tcg_const_i32(val)); - } else if (b == 0xf7) { - /* maskmov : we must prepare A0 */ - if (mod != 3) { - goto illegal_op; - } - tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]); - gen_extu(s->aflag, s->A0); - gen_add_A0_ds_seg(s); - sse_op_fn.op1t(cpu_env, s->ptr0, s->ptr1, s->A0); - } else if (b == 0xc2) { - /* compare insns, bits 7:3 (7:5 for AVX) are ignored */ - val = x86_ldub_code(env, s) & 7; - sse_op_table4[val][b1](cpu_env, s->ptr0, s->ptr1); - } else { - sse_op_fn.op1(cpu_env, s->ptr0, s->ptr1); - } - - if (sse_op_flags & SSE_OPF_CMP) { - set_cc_op(s, CC_OP_EFLAGS); - } - } -} +#include "decode-new.h" +#include "emit.c.inc" +#include "decode-new.c.inc" /* convert one instruction. s->base.is_jmp is set if the translation must be stopped. Return the next pc value */ @@ -4817,11 +2964,11 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) int modrm, reg, rm, mod, op, opreg, val; bool orig_cc_op_dirty = s->cc_op_dirty; CCOp orig_cc_op = s->cc_op; + target_ulong orig_pc_save = s->pc_save; s->pc = s->base.pc_next; s->override = -1; #ifdef TARGET_X86_64 - s->rex_w = false; s->rex_r = 0; s->rex_x = 0; s->rex_b = 0; @@ -4829,6 +2976,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) s->rip_offset = 0; /* for relative ip address */ s->vex_l = 0; s->vex_v = 0; + s->vex_w = false; switch (sigsetjmp(s->jmpbuf, 0)) { case 0: break; @@ -4838,8 +2986,15 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 2: /* Restore state that may affect the next instruction. */ s->pc = s->base.pc_next; + /* + * TODO: These save/restore can be removed after the table-based + * decoder is complete; we will be decoding the insn completely + * before any code generation that might affect these variables. + */ s->cc_op_dirty = orig_cc_op_dirty; s->cc_op = orig_cc_op; + s->pc_save = orig_pc_save; + /* END TODO */ s->base.num_insns--; tcg_remove_ops_after(s->prev_insn_end); s->base.is_jmp = DISAS_TOO_MANY; @@ -4851,9 +3006,15 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) prefixes = 0; next_byte: + s->prefix = prefixes; b = x86_ldub_code(env, s); /* Collect prefixes. */ switch (b) { + default: + break; + case 0x0f: + b = x86_ldub_code(env, s) + 0x100; + break; case 0xf3: prefixes |= PREFIX_REPZ; prefixes &= ~PREFIX_REPNZ; @@ -4894,7 +3055,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) if (CODE64(s)) { /* REX prefix */ prefixes |= PREFIX_REX; - s->rex_w = (b >> 3) & 1; + s->vex_w = (b >> 3) & 1; s->rex_r = (b & 0x4) << 1; s->rex_x = (b & 0x2) << 2; s->rex_b = (b & 0x1) << 3; @@ -4904,58 +3065,17 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) #endif case 0xc5: /* 2-byte VEX */ case 0xc4: /* 3-byte VEX */ - /* VEX prefixes cannot be used except in 32-bit mode. - Otherwise the instruction is LES or LDS. */ if (CODE32(s) && !VM86(s)) { - static const int pp_prefix[4] = { - 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ - }; - int vex3, vex2 = x86_ldub_code(env, s); + int vex2 = x86_ldub_code(env, s); + s->pc--; /* rewind the advance_pc() x86_ldub_code() did */ if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) { /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b, otherwise the instruction is LES or LDS. */ - s->pc--; /* rewind the advance_pc() x86_ldub_code() did */ break; } - - /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */ - if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ - | PREFIX_LOCK | PREFIX_DATA | PREFIX_REX)) { - goto illegal_op; - } -#ifdef TARGET_X86_64 - s->rex_r = (~vex2 >> 4) & 8; -#endif - if (b == 0xc5) { - /* 2-byte VEX prefix: RVVVVlpp, implied 0f leading opcode byte */ - vex3 = vex2; - b = x86_ldub_code(env, s) | 0x100; - } else { - /* 3-byte VEX prefix: RXBmmmmm wVVVVlpp */ - vex3 = x86_ldub_code(env, s); -#ifdef TARGET_X86_64 - s->rex_x = (~vex2 >> 3) & 8; - s->rex_b = (~vex2 >> 2) & 8; - s->rex_w = (vex3 >> 7) & 1; -#endif - switch (vex2 & 0x1f) { - case 0x01: /* Implied 0f leading opcode bytes. */ - b = x86_ldub_code(env, s) | 0x100; - break; - case 0x02: /* Implied 0f 38 leading opcode bytes. */ - b = 0x138; - break; - case 0x03: /* Implied 0f 3a leading opcode bytes. */ - b = 0x13a; - break; - default: /* Reserved for future use. */ - goto unknown_op; - } - } - s->vex_v = (~vex3 >> 3) & 0xf; - s->vex_l = (vex3 >> 2) & 1; - prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX; + disas_insn_new(s, cpu, b); + return s->pc; } break; } @@ -4988,14 +3108,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) s->dflag = dflag; /* now check op code */ - reswitch: - switch(b) { - case 0x0f: - /**************************/ - /* extended op code */ - b = x86_ldub_code(env, s) | 0x100; - goto reswitch; - + switch (b) { /**************************/ /* arith & logic */ case 0x00 ... 0x05: @@ -5931,7 +4044,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) reg = ((modrm >> 3) & 7) | REX_R(s); { AddressParts a = gen_lea_modrm_0(env, s, modrm); - TCGv ea = gen_lea_modrm_1(s, a); + TCGv ea = gen_lea_modrm_1(s, a, false); gen_lea_v_seg(s, s->aflag, ea, -1, -1); gen_op_mov_reg_v(s, dflag, reg, s->A0); } @@ -6154,7 +4267,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) if (mod != 3) { /* memory op */ AddressParts a = gen_lea_modrm_0(env, s, modrm); - TCGv ea = gen_lea_modrm_1(s, a); + TCGv ea = gen_lea_modrm_1(s, a, false); TCGv last_addr = tcg_temp_new(); bool update_fdp = true; @@ -7149,7 +5262,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) gen_exts(ot, s->T1); tcg_gen_sari_tl(s->tmp0, s->T1, 3 + ot); tcg_gen_shli_tl(s->tmp0, s->tmp0, ot); - tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a), s->tmp0); + tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a, false), s->tmp0); gen_lea_v_seg(s, s->aflag, s->A0, a.def_seg, s->override); if (!(s->prefix & PREFIX_LOCK)) { gen_op_ld_v(s, ot, s->T0, s->A0); @@ -8198,7 +6311,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) /* rip-relative generates #ud */ goto illegal_op; } - tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, a)); + tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, a, false)); if (!CODE64(s)) { tcg_gen_ext32u_tl(s->A0, s->A0); } @@ -8622,11 +6735,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) set_cc_op(s, CC_OP_POPCNT); break; - case 0x10e ... 0x10f: - /* 3DNow! instructions, ignore prefixes */ - s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); - /* fall through */ - case 0x110 ... 0x117: + case 0x10e ... 0x117: case 0x128 ... 0x12f: case 0x138 ... 0x13a: case 0x150 ... 0x179: @@ -8634,7 +6743,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 0x1c2: case 0x1c4 ... 0x1c6: case 0x1d0 ... 0x1fe: - gen_sse(env, s, b); + disas_insn_new(s, cpu, b); break; default: goto unknown_op; @@ -8780,6 +6889,7 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu) dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX]; dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX]; dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX]; + dc->cpuid_7_0_ecx_features = env->features[FEAT_7_0_ECX]; dc->cpuid_xsave_features = env->features[FEAT_XSAVE]; dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) || (flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK))); @@ -8799,8 +6909,6 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu) dc->tmp2_i32 = tcg_temp_new_i32(); dc->tmp3_i32 = tcg_temp_new_i32(); dc->tmp4 = tcg_temp_new(); - dc->ptr0 = tcg_temp_new_ptr(); - dc->ptr1 = tcg_temp_new_ptr(); dc->cc_srcT = tcg_temp_local_new(); } diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target index 3273aa8..81831ca 100644 --- a/tests/tcg/i386/Makefile.target +++ b/tests/tcg/i386/Makefile.target @@ -107,7 +107,7 @@ run-test-mmx: QEMU_OPTS += -cpu max run-plugin-test-mmx: QEMU_OPTS += -cpu max test-mmx: test-mmx.h -test-avx: CFLAGS += -masm=intel -O -I. +test-avx: CFLAGS += -mavx -masm=intel -O -I. run-test-avx: QEMU_OPTS += -cpu max run-plugin-test-avx: QEMU_OPTS += -cpu max test-avx: test-avx.h diff --git a/tests/tcg/i386/test-avx.c b/tests/tcg/i386/test-avx.c index 23c170d..953e290 100644 --- a/tests/tcg/i386/test-avx.c +++ b/tests/tcg/i386/test-avx.c @@ -6,18 +6,18 @@ typedef void (*testfn)(void); typedef struct { - uint64_t q0, q1; -} __attribute__((aligned(16))) v2di; + uint64_t q0, q1, q2, q3; +} __attribute__((aligned(32))) v4di; typedef struct { uint64_t mm[8]; - v2di xmm[16]; + v4di ymm[16]; uint64_t r[16]; uint64_t flags; uint32_t ff; uint64_t pad; - v2di mem[4]; - v2di mem0[4]; + v4di mem[4]; + v4di mem0[4]; } reg_state; typedef struct { @@ -31,20 +31,20 @@ reg_state initI; reg_state initF32; reg_state initF64; -static void dump_xmm(const char *name, int n, const v2di *r, int ff) +static void dump_ymm(const char *name, int n, const v4di *r, int ff) { - printf("%s%d = %016lx %016lx\n", - name, n, r->q1, r->q0); + printf("%s%d = %016lx %016lx %016lx %016lx\n", + name, n, r->q3, r->q2, r->q1, r->q0); if (ff == 64) { - double v[2]; + double v[4]; memcpy(v, r, sizeof(v)); - printf(" %16g %16g\n", - v[1], v[0]); + printf(" %16g %16g %16g %16g\n", + v[3], v[2], v[1], v[0]); } else if (ff == 32) { - float v[4]; + float v[8]; memcpy(v, r, sizeof(v)); - printf(" %8g %8g %8g %8g\n", - v[3], v[2], v[1], v[0]); + printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n", + v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]); } } @@ -53,10 +53,10 @@ static void dump_regs(reg_state *s) int i; for (i = 0; i < 16; i++) { - dump_xmm("xmm", i, &s->xmm[i], 0); + dump_ymm("ymm", i, &s->ymm[i], 0); } for (i = 0; i < 4; i++) { - dump_xmm("mem", i, &s->mem0[i], 0); + dump_ymm("mem", i, &s->mem0[i], 0); } } @@ -74,13 +74,13 @@ static void compare_state(const reg_state *a, const reg_state *b) } } for (i = 0; i < 16; i++) { - if (memcmp(&a->xmm[i], &b->xmm[i], 16)) { - dump_xmm("xmm", i, &b->xmm[i], a->ff); + if (memcmp(&a->ymm[i], &b->ymm[i], 32)) { + dump_ymm("ymm", i, &b->ymm[i], a->ff); } } for (i = 0; i < 4; i++) { - if (memcmp(&a->mem0[i], &a->mem[i], 16)) { - dump_xmm("mem", i, &a->mem[i], a->ff); + if (memcmp(&a->mem0[i], &a->mem[i], 32)) { + dump_ymm("mem", i, &a->mem[i], a->ff); } } if (a->flags != b->flags) { @@ -89,9 +89,9 @@ static void compare_state(const reg_state *a, const reg_state *b) } #define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t" -#define LOADXMM(r, o) "movdqa " #r ", " #o "[%0]\n\t" +#define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t" #define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t" -#define STOREXMM(r, o) "movdqa " #o "[%1], " #r "\n\t" +#define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t" #define MMREG(F) \ F(mm0, 0x00) \ F(mm1, 0x08) \ @@ -101,39 +101,39 @@ static void compare_state(const reg_state *a, const reg_state *b) F(mm5, 0x28) \ F(mm6, 0x30) \ F(mm7, 0x38) -#define XMMREG(F) \ - F(xmm0, 0x040) \ - F(xmm1, 0x050) \ - F(xmm2, 0x060) \ - F(xmm3, 0x070) \ - F(xmm4, 0x080) \ - F(xmm5, 0x090) \ - F(xmm6, 0x0a0) \ - F(xmm7, 0x0b0) \ - F(xmm8, 0x0c0) \ - F(xmm9, 0x0d0) \ - F(xmm10, 0x0e0) \ - F(xmm11, 0x0f0) \ - F(xmm12, 0x100) \ - F(xmm13, 0x110) \ - F(xmm14, 0x120) \ - F(xmm15, 0x130) +#define YMMREG(F) \ + F(ymm0, 0x040) \ + F(ymm1, 0x060) \ + F(ymm2, 0x080) \ + F(ymm3, 0x0a0) \ + F(ymm4, 0x0c0) \ + F(ymm5, 0x0e0) \ + F(ymm6, 0x100) \ + F(ymm7, 0x120) \ + F(ymm8, 0x140) \ + F(ymm9, 0x160) \ + F(ymm10, 0x180) \ + F(ymm11, 0x1a0) \ + F(ymm12, 0x1c0) \ + F(ymm13, 0x1e0) \ + F(ymm14, 0x200) \ + F(ymm15, 0x220) #define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t" #define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t" #define REG(F) \ - F(rbx, 0x148) \ - F(rcx, 0x150) \ - F(rdx, 0x158) \ - F(rsi, 0x160) \ - F(rdi, 0x168) \ - F(r8, 0x180) \ - F(r9, 0x188) \ - F(r10, 0x190) \ - F(r11, 0x198) \ - F(r12, 0x1a0) \ - F(r13, 0x1a8) \ - F(r14, 0x1b0) \ - F(r15, 0x1b8) \ + F(rbx, 0x248) \ + F(rcx, 0x250) \ + F(rdx, 0x258) \ + F(rsi, 0x260) \ + F(rdi, 0x268) \ + F(r8, 0x280) \ + F(r9, 0x288) \ + F(r10, 0x290) \ + F(r11, 0x298) \ + F(r12, 0x2a0) \ + F(r13, 0x2a8) \ + F(r14, 0x2b0) \ + F(r15, 0x2b8) \ static void run_test(const TestDef *t) { @@ -143,7 +143,7 @@ static void run_test(const TestDef *t) printf("%5d %s\n", t->n, t->s); asm volatile( MMREG(LOADMM) - XMMREG(LOADXMM) + YMMREG(LOADYMM) "sub rsp, 128\n\t" "push rax\n\t" "push rbx\n\t" @@ -156,26 +156,26 @@ static void run_test(const TestDef *t) "pop rbx\n\t" "shr rbx, 8\n\t" "shl rbx, 8\n\t" - "mov rcx, 0x1c0[rax]\n\t" + "mov rcx, 0x2c0[rax]\n\t" "and rcx, 0xff\n\t" "or rbx, rcx\n\t" "push rbx\n\t" "popf\n\t" REG(LOADREG) - "mov rax, 0x140[rax]\n\t" + "mov rax, 0x240[rax]\n\t" "call [rsp]\n\t" "mov [rsp], rax\n\t" "mov rax, 8[rsp]\n\t" REG(STOREREG) "mov rbx, [rsp]\n\t" - "mov 0x140[rax], rbx\n\t" + "mov 0x240[rax], rbx\n\t" "mov rbx, 0\n\t" - "mov 0x170[rax], rbx\n\t" - "mov 0x178[rax], rbx\n\t" + "mov 0x270[rax], rbx\n\t" + "mov 0x278[rax], rbx\n\t" "pushf\n\t" "pop rbx\n\t" "and rbx, 0xff\n\t" - "mov 0x1c0[rax], rbx\n\t" + "mov 0x2c0[rax], rbx\n\t" "add rsp, 16\n\t" "pop rdx\n\t" "pop rcx\n\t" @@ -183,15 +183,15 @@ static void run_test(const TestDef *t) "pop rax\n\t" "add rsp, 128\n\t" MMREG(STOREMM) - XMMREG(STOREXMM) + YMMREG(STOREYMM) : : "r"(init), "r"(&result), "r"(t->fn) : "memory", "cc", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15" ); compare_state(init, &result); } @@ -223,22 +223,30 @@ static void run_all(void) float val_f32[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5, 8.3}; double val_f64[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5}; -v2di val_i64[] = { - {0x3d6b3b6a9e4118f2lu, 0x355ae76d2774d78clu}, - {0xd851c54a56bf1f29lu, 0x4a84d1d50bf4c4fflu}, - {0x5826475e2c5fd799lu, 0xfd32edc01243f5e9lu}, +v4di val_i64[] = { + {0x3d6b3b6a9e4118f2lu, 0x355ae76d2774d78clu, + 0xac3ff76c4daa4b28lu, 0xe7fabd204cb54083lu}, + {0xd851c54a56bf1f29lu, 0x4a84d1d50bf4c4fflu, + 0x56621e553d52b56clu, 0xd0069553da8f584alu}, + {0x5826475e2c5fd799lu, 0xfd32edc01243f5e9lu, + 0x738ba2c66d3fe126lu, 0x5707219c6e6c26b4lu}, }; -v2di deadbeef = {0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull}; -v2di indexq = {0x000000000000001full, 0x000000000000008full}; -v2di indexd = {0x00000002000000efull, 0xfffffff500000010ull}; +v4di deadbeef = {0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull, + 0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull}; +v4di indexq = {0x000000000000001full, 0x000000000000008full, + 0xffffffffffffffffull, 0xffffffffffffff5full}; +v4di indexd = {0x00000002000000efull, 0xfffffff500000010ull, + 0x0000000afffffff0ull, 0x000000000000000eull}; -void init_f32reg(v2di *r) +v4di gather_mem[0x20]; + +void init_f32reg(v4di *r) { static int n; - float v[4]; + float v[8]; int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < 8; i++) { v[i] = val_f32[n++]; if (n == ARRAY_LEN(val_f32)) { n = 0; @@ -247,12 +255,12 @@ void init_f32reg(v2di *r) memcpy(r, v, sizeof(*r)); } -void init_f64reg(v2di *r) +void init_f64reg(v4di *r) { static int n; - double v[2]; + double v[4]; int i; - for (i = 0; i < 2; i++) { + for (i = 0; i < 4; i++) { v[i] = val_f64[n++]; if (n == ARRAY_LEN(val_f64)) { n = 0; @@ -261,13 +269,15 @@ void init_f64reg(v2di *r) memcpy(r, v, sizeof(*r)); } -void init_intreg(v2di *r) +void init_intreg(v4di *r) { static uint64_t mask; static int n; r->q0 = val_i64[n].q0 ^ mask; r->q1 = val_i64[n].q1 ^ mask; + r->q2 = val_i64[n].q2 ^ mask; + r->q3 = val_i64[n].q3 ^ mask; n++; if (n == ARRAY_LEN(val_i64)) { n = 0; @@ -280,46 +290,53 @@ static void init_all(reg_state *s) int i; s->r[3] = (uint64_t)&s->mem[0]; /* rdx */ + s->r[4] = (uint64_t)&gather_mem[ARRAY_LEN(gather_mem) / 2]; /* rsi */ s->r[5] = (uint64_t)&s->mem[2]; /* rdi */ s->flags = 2; - for (i = 0; i < 8; i++) { - s->xmm[i] = deadbeef; + for (i = 0; i < 16; i++) { + s->ymm[i] = deadbeef; } - s->xmm[13] = indexd; - s->xmm[14] = indexq; - for (i = 0; i < 2; i++) { + s->ymm[13] = indexd; + s->ymm[14] = indexq; + for (i = 0; i < 4; i++) { s->mem0[i] = deadbeef; } } int main(int argc, char *argv[]) { + int i; + init_all(&initI); - init_intreg(&initI.xmm[10]); - init_intreg(&initI.xmm[11]); - init_intreg(&initI.xmm[12]); + init_intreg(&initI.ymm[10]); + init_intreg(&initI.ymm[11]); + init_intreg(&initI.ymm[12]); init_intreg(&initI.mem0[1]); printf("Int:\n"); dump_regs(&initI); init_all(&initF32); - init_f32reg(&initF32.xmm[10]); - init_f32reg(&initF32.xmm[11]); - init_f32reg(&initF32.xmm[12]); + init_f32reg(&initF32.ymm[10]); + init_f32reg(&initF32.ymm[11]); + init_f32reg(&initF32.ymm[12]); init_f32reg(&initF32.mem0[1]); initF32.ff = 32; printf("F32:\n"); dump_regs(&initF32); init_all(&initF64); - init_f64reg(&initF64.xmm[10]); - init_f64reg(&initF64.xmm[11]); - init_f64reg(&initF64.xmm[12]); + init_f64reg(&initF64.ymm[10]); + init_f64reg(&initF64.ymm[11]); + init_f64reg(&initF64.ymm[12]); init_f64reg(&initF64.mem0[1]); initF64.ff = 64; printf("F64:\n"); dump_regs(&initF64); + for (i = 0; i < ARRAY_LEN(gather_mem); i++) { + init_intreg(&gather_mem[i]); + } + if (argc > 1) { int n = atoi(argv[1]); run_test(&test_table[n]); diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py index e16a3d8..0298232 100755 --- a/tests/tcg/i386/test-avx.py +++ b/tests/tcg/i386/test-avx.py @@ -8,6 +8,7 @@ from fnmatch import fnmatch archs = [ "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2", + "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX", ] ignore = set(["FISTTP", @@ -42,7 +43,7 @@ imask = { 'vROUND[PS][SD]': 0x7, 'vSHUFPD': 0x0f, 'vSHUFPS': 0xff, - 'vAESKEYGENASSIST': 0, + 'vAESKEYGENASSIST': 0xff, 'VEXTRACT[FI]128': 0x01, 'VINSERT[FI]128': 0x01, 'VPBLENDD': 0xff, @@ -85,7 +86,7 @@ def mem_w(w): else: raise Exception() - return t + " PTR 16[rdx]" + return t + " PTR 32[rdx]" class XMMArg(): isxmm = True |