about summary refs log tree commit diff
diff options
context:
space:
mode:
author Stefan Hajnoczi <stefanha@redhat.com> 2022-10-18 11:14:31 -0400
committer Stefan Hajnoczi <stefanha@redhat.com> 2022-10-18 11:14:31 -0400
commit 214a8da23651f2472b296b3293e619fd58d9e212 (patch)
tree 467317b595f2d19b68121d4ef5f843817c7783bb
parent 2c65091fd9d387b8dca8115dbdd9c3c61f658a9e (diff)
parent 653fad2497bed71d938827299cb9ac38ac333f9b (diff)
download qemu-214a8da23651f2472b296b3293e619fd58d9e212.zip
qemu-214a8da23651f2472b296b3293e619fd58d9e212.tar.gz
qemu-214a8da23651f2472b296b3293e619fd58d9e212.tar.bz2
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging
* configure: don't enable firmware for targets that are not built
* configure: don't use strings(1)
* scsi, target/i386: switch from device_legacy_reset() to device_cold_reset()
* target/i386: AVX support for TCG
* target/i386: fix SynIC SINT assertion failure on guest reset
* target/i386: Use atomic operations for pte updates and other cleanups
* tests/tcg: extend SSE tests to AVX
* virtio-scsi: send "REPORTED LUNS CHANGED" sense data upon disk hotplug events

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmNOlOcUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroNuvwgAj/Z5pI9KU33XiWKFR3bZf2lHh21P
# xmTzNtPmnP1WHDY1DNug/UB+BLg3c+carpTf5n3B8aKI4X3FfxGSJvYlXy4BONFD
# XqYMH3OZB5GaR8Wza9trNYjDs/9hOZus/0R6Hqdl/T38PlMjf8mmayULJIGdcFcJ
# WJvITVntbcCwwbpyJbRC5BNigG8ZXTNRoKBgtFVGz6Ox+n0YydwKX5qU5J7xRfCU
# lW41LjZ0Fk5lonH16+xuS4WD5EyrNt8cMKCGsxnyxhI7nehe/OGnYr9l+xZJclrh
# inQlSwJv0IpUJcrGCI4Xugwux4Z7ZXv3JQ37FzsdZcv/ZXpGonXMeXNJ9A==
# =o6x7
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 18 Oct 2022 07:58:31 EDT
# gpg: using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg: issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83

* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (53 commits)
  target/i386: remove old SSE decoder
  target/i386: move 3DNow to the new decoder
  tests/tcg: extend SSE tests to AVX
  target/i386: Enable AVX cpuid bits when using TCG
  target/i386: implement VLDMXCSR/VSTMXCSR
  target/i386: implement XSAVE and XRSTOR of AVX registers
  target/i386: reimplement 0x0f 0x28-0x2f, add AVX
  target/i386: reimplement 0x0f 0x10-0x17, add AVX
  target/i386: reimplement 0x0f 0xc2, 0xc4-0xc6, add AVX
  target/i386: reimplement 0x0f 0x38, add AVX
  target/i386: Use tcg gvec ops for pmovmskb
  target/i386: reimplement 0x0f 0x3a, add AVX
  target/i386: clarify (un)signedness of immediates from 0F3Ah opcodes
  target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, add AVX
  target/i386: reimplement 0x0f 0x70-0x77, add AVX
  target/i386: reimplement 0x0f 0x78-0x7f, add AVX
  target/i386: reimplement 0x0f 0x50-0x5f, add AVX
  target/i386: reimplement 0x0f 0xd8-0xdf, 0xe8-0xef, 0xf8-0xff, add AVX
  target/i386: reimplement 0x0f 0x60-0x6f, add AVX
  target/i386: Introduce 256-bit vector helpers
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rwxr-xr-x configure | 55
-rw-r--r-- hw/i386/microvm.c | 4
-rw-r--r-- hw/i386/pc.c | 5
-rw-r--r-- hw/scsi/esp.c | 2
-rw-r--r-- hw/scsi/lsi53c895a.c | 4
-rw-r--r-- hw/scsi/megasas.c | 2
-rw-r--r-- hw/scsi/mptsas.c | 8
-rw-r--r-- hw/scsi/scsi-bus.c | 18
-rw-r--r-- hw/scsi/spapr_vscsi.c | 2
-rw-r--r-- hw/scsi/virtio-scsi.c | 8
-rw-r--r-- hw/scsi/vmw_pvscsi.c | 6
-rw-r--r-- include/hw/scsi/scsi.h | 1
-rw-r--r-- target/i386/cpu-param.h | 2
-rw-r--r-- target/i386/cpu.c | 23
-rw-r--r-- target/i386/cpu.h | 68
-rw-r--r-- target/i386/helper.c | 12
-rw-r--r-- target/i386/helper.h | 3
-rw-r--r-- target/i386/kvm/hyperv.c | 4
-rw-r--r-- target/i386/kvm/kvm.c | 24
-rw-r--r-- target/i386/kvm/kvm_i386.h | 1
-rw-r--r-- target/i386/ops_sse.h | 698
-rw-r--r-- target/i386/ops_sse_header.h | 351
-rw-r--r-- target/i386/tcg/decode-new.c.inc | 1795
-rw-r--r-- target/i386/tcg/decode-new.h | 249
-rw-r--r-- target/i386/tcg/emit.c.inc | 2234
-rw-r--r-- target/i386/tcg/fpu_helper.c | 88
-rw-r--r-- target/i386/tcg/sysemu/excp_helper.c | 712
-rw-r--r-- target/i386/tcg/sysemu/svm_helper.c | 234
-rw-r--r-- target/i386/tcg/translate.c | 2092
-rw-r--r-- tests/tcg/i386/Makefile.target | 2
-rw-r--r-- tests/tcg/i386/test-avx.c | 199
-rwxr-xr-x tests/tcg/i386/test-avx.py | 5
32 files changed, 5980 insertions, 2931 deletions
diff --git a/configure b/configure
index 45ee6f4..81561be 100755
--- a/configure
+++ b/configure
@@ -1423,30 +1423,31 @@ if test "$tcg" = "enabled"; then
git_submodules="$git_submodules tests/fp/berkeley-softfloat-3"
fi
-# ---
+##########################################
# big/little endian test
cat > $TMPC << EOF
-#include <stdio.h>
-short big_endian[] = { 0x4269, 0x4765, 0x4e64, 0x4961, 0x4e00, 0, };
-short little_endian[] = { 0x694c, 0x7454, 0x654c, 0x6e45, 0x6944, 0x6e41, 0, };
-int main(int argc, char *argv[])
-{
- return printf("%s %s\n", (char *)big_endian, (char *)little_endian);
-}
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# error LITTLE
+#endif
+int main(void) { return 0; }
EOF
-if compile_prog ; then
- if strings -a $TMPE | grep -q BiGeNdIaN ; then
- bigendian="yes"
- elif strings -a $TMPE | grep -q LiTtLeEnDiAn ; then
- bigendian="no"
- else
- echo big/little test failed
- exit 1
- fi
+if ! compile_prog ; then
+ bigendian="no"
else
+ cat > $TMPC << EOF
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# error BIG
+#endif
+int main(void) { return 0; }
+EOF
+
+ if ! compile_prog ; then
+ bigendian="yes"
+ else
echo big/little test failed
exit 1
+ fi
fi
##########################################
@@ -1841,6 +1842,16 @@ compute_target_variable() {
fi
}
+have_target() {
+ for i; do
+ case " $target_list " in
+ *" $i "*) return 0;;
+ *) ;;
+ esac
+ done
+ return 1
+}
+
# probe_target_compiler TARGET
#
# Look for a compiler for the given target, either native or cross.
@@ -2261,8 +2272,9 @@ echo "# Automatically generated by configure - do not modify" > Makefile.prereqs
# Mac OS X ships with a broken assembler
roms=
-if test "$targetos" != "darwin" && test "$targetos" != "sunos" && \
- test "$targetos" != "haiku" && test "$softmmu" = yes && \
+if have_target i386-softmmu x86_64-softmmu && \
+ test "$targetos" != "darwin" && test "$targetos" != "sunos" && \
+ test "$targetos" != "haiku" && \
probe_target_compiler i386-softmmu; then
roms="pc-bios/optionrom"
config_mak=pc-bios/optionrom/config.mak
@@ -2271,7 +2283,8 @@ if test "$targetos" != "darwin" && test "$targetos" != "sunos" && \
write_target_makefile >> $config_mak
fi
-if test "$softmmu" = yes && probe_target_compiler ppc-softmmu; then
+if have_target ppc-softmmu ppc64-softmmu && \
+ probe_target_compiler ppc-softmmu; then
roms="$roms pc-bios/vof"
config_mak=pc-bios/vof/config.mak
echo "# Automatically generated by configure - do not modify" > $config_mak
@@ -2281,7 +2294,7 @@ fi
# Only build s390-ccw bios if the compiler has -march=z900 or -march=z10
# (which is the lowest architecture level that Clang supports)
-if test "$softmmu" = yes && probe_target_compiler s390x-softmmu; then
+if have_target s390x-softmmu && probe_target_compiler s390x-softmmu; then
write_c_skeleton
do_compiler "$target_cc" $target_cc_cflags -march=z900 -o $TMPO -c $TMPC
has_z900=$?
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 7fe8cce..52f9aa9 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -485,9 +485,7 @@ static void microvm_machine_reset(MachineState *machine)
CPU_FOREACH(cs) {
cpu = X86_CPU(cs);
- if (cpu->apic_state) {
- device_legacy_reset(cpu->apic_state);
- }
+ x86_cpu_after_reset(cpu);
}
}
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 566accf..768982a 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -92,6 +92,7 @@
#include "hw/virtio/virtio-mem-pci.h"
#include "hw/mem/memory-device.h"
#include "sysemu/replay.h"
+#include "target/i386/cpu.h"
#include "qapi/qmp/qerror.h"
#include "e820_memory_layout.h"
#include "fw_cfg.h"
@@ -1859,9 +1860,7 @@ static void pc_machine_reset(MachineState *machine)
CPU_FOREACH(cs) {
cpu = X86_CPU(cs);
- if (cpu->apic_state) {
- device_legacy_reset(cpu->apic_state);
- }
+ x86_cpu_after_reset(cpu);
}
}
diff --git a/hw/scsi/esp.c b/hw/scsi/esp.c
index 2ff18ce..e5b281e 100644
--- a/hw/scsi/esp.c
+++ b/hw/scsi/esp.c
@@ -941,7 +941,7 @@ static void esp_soft_reset(ESPState *s)
static void esp_bus_reset(ESPState *s)
{
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
}
static void parent_esp_reset(ESPState *s, int irq, int level)
diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index 05a43ec..5097964 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -1868,7 +1868,7 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val)
}
if (val & LSI_SCNTL1_RST) {
if (!(s->sstat0 & LSI_SSTAT0_RST)) {
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
s->sstat0 |= LSI_SSTAT0_RST;
lsi_script_scsi_interrupt(s, LSI_SIST0_RST, 0);
}
@@ -1926,7 +1926,7 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val)
lsi_execute_script(s);
}
if (val & LSI_ISTAT0_SRST) {
- qdev_reset_all(DEVICE(s));
+ device_cold_reset(DEVICE(s));
}
break;
case 0x16: /* MBOX0 */
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index 7082456..9cbbb16 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -1484,7 +1484,7 @@ static int megasas_cluster_reset_ld(MegasasState *s, MegasasCmd *cmd)
MegasasCmd *tmp_cmd = &s->frames[i];
if (tmp_cmd->req && tmp_cmd->req->dev->id == target_id) {
SCSIDevice *d = tmp_cmd->req->dev;
- qdev_reset_all(&d->qdev);
+ device_cold_reset(&d->qdev);
}
}
return MFI_STAT_OK;
diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c
index a90c254..c485da7 100644
--- a/hw/scsi/mptsas.c
+++ b/hw/scsi/mptsas.c
@@ -522,7 +522,7 @@ reply_maybe_async:
reply.ResponseCode = MPI_SCSITASKMGMT_RSP_TM_INVALID_LUN;
goto out;
}
- qdev_reset_all(&sdev->qdev);
+ device_cold_reset(&sdev->qdev);
break;
case MPI_SCSITASKMGMT_TASKTYPE_TARGET_RESET:
@@ -538,13 +538,13 @@ reply_maybe_async:
QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) {
sdev = SCSI_DEVICE(kid->child);
if (sdev->channel == 0 && sdev->id == req->TargetID) {
- qdev_reset_all(kid->child);
+ device_cold_reset(kid->child);
}
}
break;
case MPI_SCSITASKMGMT_TASKTYPE_RESET_BUS:
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
break;
default:
@@ -807,7 +807,7 @@ static void mptsas_soft_reset(MPTSASState *s)
s->intr_mask = MPI_HIM_DIM | MPI_HIM_RIM;
mptsas_update_interrupt(s);
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
s->intr_status = 0;
s->intr_mask = save_mask;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 4403717..ceceafb 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1616,6 +1616,24 @@ static int scsi_ua_precedence(SCSISense sense)
return (sense.asc << 8) | sense.ascq;
}
+void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense)
+{
+ int prec1, prec2;
+ if (sense.key != UNIT_ATTENTION) {
+ return;
+ }
+
+ /*
+ * Override a pre-existing unit attention condition, except for a more
+ * important reset condition.
+ */
+ prec1 = scsi_ua_precedence(bus->unit_attention);
+ prec2 = scsi_ua_precedence(sense);
+ if (prec2 < prec1) {
+ bus->unit_attention = sense;
+ }
+}
+
void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense)
{
int prec1, prec2;
diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c
index 0a8cbf5..5bbbef6 100644
--- a/hw/scsi/spapr_vscsi.c
+++ b/hw/scsi/spapr_vscsi.c
@@ -865,7 +865,7 @@ static int vscsi_process_tsk_mgmt(VSCSIState *s, vscsi_req *req)
break;
}
- qdev_reset_all(&d->qdev);
+ device_cold_reset(&d->qdev);
break;
case SRP_TSK_ABORT_TASK_SET:
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 41f2a56..6f6e2e3 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -365,7 +365,7 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
goto incorrect_lun;
}
s->resetting++;
- qdev_reset_all(&d->qdev);
+ device_cold_reset(&d->qdev);
s->resetting--;
break;
@@ -417,7 +417,7 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
SCSIDevice *d1 = SCSI_DEVICE(kid->child);
if (d1->channel == 0 && d1->id == target) {
- qdev_reset_all(&d1->qdev);
+ device_cold_reset(&d1->qdev);
}
}
rcu_read_unlock();
@@ -831,7 +831,7 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
assert(!s->dataplane_started);
s->resetting++;
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
s->resetting--;
vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE;
@@ -956,6 +956,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
virtio_scsi_push_event(s, sd,
VIRTIO_SCSI_T_TRANSPORT_RESET,
VIRTIO_SCSI_EVT_RESET_RESCAN);
+ scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
virtio_scsi_release(s);
}
}
@@ -973,6 +974,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
virtio_scsi_push_event(s, sd,
VIRTIO_SCSI_T_TRANSPORT_RESET,
VIRTIO_SCSI_EVT_RESET_REMOVED);
+ scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
virtio_scsi_release(s);
}
diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c
index 91e2f85..fa76696 100644
--- a/hw/scsi/vmw_pvscsi.c
+++ b/hw/scsi/vmw_pvscsi.c
@@ -445,7 +445,7 @@ static void
pvscsi_reset_adapter(PVSCSIState *s)
{
s->resetting++;
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
s->resetting--;
pvscsi_process_completion_queue(s);
assert(QTAILQ_EMPTY(&s->pending_queue));
@@ -880,7 +880,7 @@ pvscsi_on_cmd_reset_device(PVSCSIState *s)
if (sdev != NULL) {
s->resetting++;
- device_legacy_reset(&sdev->qdev);
+ device_cold_reset(&sdev->qdev);
s->resetting--;
return PVSCSI_COMMAND_PROCESSING_SUCCEEDED;
}
@@ -894,7 +894,7 @@ pvscsi_on_cmd_reset_bus(PVSCSIState *s)
trace_pvscsi_on_cmd_arrived("PVSCSI_CMD_RESET_BUS");
s->resetting++;
- qbus_reset_all(BUS(&s->bus));
+ bus_cold_reset(BUS(&s->bus));
s->resetting--;
return PVSCSI_COMMAND_PROCESSING_SUCCEEDED;
}
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 0011034..3b1b3d2 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -186,6 +186,7 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
BlockdevOnError rerror,
BlockdevOnError werror,
const char *serial, Error **errp);
+void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense);
void scsi_bus_legacy_handle_cmdline(SCSIBus *bus);
void scsi_legacy_handle_cmdline(void);
diff --git a/target/i386/cpu-param.h b/target/i386/cpu-param.h
index 1e79389..f579b16 100644
--- a/target/i386/cpu-param.h
+++ b/target/i386/cpu-param.h
@@ -23,7 +23,7 @@
# define TARGET_VIRT_ADDR_SPACE_BITS 32
#endif
#define TARGET_PAGE_BITS 12
-#define NB_MMU_MODES 3
+#define NB_MMU_MODES 5
#ifndef CONFIG_USER_ONLY
# define TARGET_TB_PCREL 1
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 8a11470..0ebd610 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,12 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
- CPUID_EXT_RDRAND)
+ CPUID_EXT_RDRAND | CPUID_EXT_AVX)
/* missing:
CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
- CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AVX,
+ CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER,
CPUID_EXT_F16C */
#ifdef TARGET_X86_64
@@ -653,14 +653,14 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT | \
CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE | \
- CPUID_7_0_EBX_ERMS)
+ CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2)
/* missing:
- CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
+ CPUID_7_0_EBX_HLE,
CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
CPUID_7_0_EBX_RDSEED */
#define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \
/* CPUID_7_0_ECX_OSPKE is dynamic */ \
- CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS)
+ CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS | CPUID_7_0_ECX_VAES)
#define TCG_7_0_EDX_FEATURES 0
#define TCG_7_1_EAX_FEATURES 0
#define TCG_APM_FEATURES 0
@@ -6035,6 +6035,19 @@ static void x86_cpu_reset(DeviceState *dev)
#endif
}
+void x86_cpu_after_reset(X86CPU *cpu)
+{
+#ifndef CONFIG_USER_ONLY
+ if (kvm_enabled()) {
+ kvm_arch_after_reset_vcpu(cpu);
+ }
+
+ if (cpu->apic_state) {
+ device_cold_reset(cpu->apic_state);
+ }
+#endif
+}
+
static void mce_init(X86CPU *cpu)
{
CPUX86State *cenv = &cpu->env;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7edf5df..dad2b2d 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -169,6 +169,7 @@ typedef enum X86Seg {
#define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */
#define HF_MPX_IU_SHIFT 26 /* BND registers in-use */
#define HF_UMIP_SHIFT 27 /* CR4.UMIP */
+#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */
#define HF_CPL_MASK (3 << HF_CPL_SHIFT)
#define HF_INHIBIT_IRQ_MASK (1 << HF_INHIBIT_IRQ_SHIFT)
@@ -195,6 +196,7 @@ typedef enum X86Seg {
#define HF_MPX_EN_MASK (1 << HF_MPX_EN_SHIFT)
#define HF_MPX_IU_MASK (1 << HF_MPX_IU_SHIFT)
#define HF_UMIP_MASK (1 << HF_UMIP_SHIFT)
+#define HF_AVX_EN_MASK (1 << HF_AVX_EN_SHIFT)
/* hflags2 */
@@ -1233,18 +1235,34 @@ typedef struct SegmentCache {
uint32_t flags;
} SegmentCache;
-#define MMREG_UNION(n, bits) \
- union n { \
- uint8_t _b_##n[(bits)/8]; \
- uint16_t _w_##n[(bits)/16]; \
- uint32_t _l_##n[(bits)/32]; \
- uint64_t _q_##n[(bits)/64]; \
- float32 _s_##n[(bits)/32]; \
- float64 _d_##n[(bits)/64]; \
- }
-
-typedef MMREG_UNION(ZMMReg, 512) ZMMReg;
-typedef MMREG_UNION(MMXReg, 64) MMXReg;
+typedef union MMXReg {
+ uint8_t _b_MMXReg[64 / 8];
+ uint16_t _w_MMXReg[64 / 16];
+ uint32_t _l_MMXReg[64 / 32];
+ uint64_t _q_MMXReg[64 / 64];
+ float32 _s_MMXReg[64 / 32];
+ float64 _d_MMXReg[64 / 64];
+} MMXReg;
+
+typedef union XMMReg {
+ uint64_t _q_XMMReg[128 / 64];
+} XMMReg;
+
+typedef union YMMReg {
+ uint64_t _q_YMMReg[256 / 64];
+ XMMReg _x_YMMReg[256 / 128];
+} YMMReg;
+
+typedef union ZMMReg {
+ uint8_t _b_ZMMReg[512 / 8];
+ uint16_t _w_ZMMReg[512 / 16];
+ uint32_t _l_ZMMReg[512 / 32];
+ uint64_t _q_ZMMReg[512 / 64];
+ float32 _s_ZMMReg[512 / 32];
+ float64 _d_ZMMReg[512 / 64];
+ XMMReg _x_ZMMReg[512 / 128];
+ YMMReg _y_ZMMReg[512 / 256];
+} ZMMReg;
typedef struct BNDReg {
uint64_t lb;
@@ -1267,6 +1285,13 @@ typedef struct BNDCSReg {
#define ZMM_S(n) _s_ZMMReg[15 - (n)]
#define ZMM_Q(n) _q_ZMMReg[7 - (n)]
#define ZMM_D(n) _d_ZMMReg[7 - (n)]
+#define ZMM_X(n) _x_ZMMReg[3 - (n)]
+#define ZMM_Y(n) _y_ZMMReg[1 - (n)]
+
+#define XMM_Q(n) _q_XMMReg[1 - (n)]
+
+#define YMM_Q(n) _q_YMMReg[3 - (n)]
+#define YMM_X(n) _x_YMMReg[1 - (n)]
#define MMX_B(n) _b_MMXReg[7 - (n)]
#define MMX_W(n) _w_MMXReg[3 - (n)]
@@ -1279,6 +1304,13 @@ typedef struct BNDCSReg {
#define ZMM_S(n) _s_ZMMReg[n]
#define ZMM_Q(n) _q_ZMMReg[n]
#define ZMM_D(n) _d_ZMMReg[n]
+#define ZMM_X(n) _x_ZMMReg[n]
+#define ZMM_Y(n) _y_ZMMReg[n]
+
+#define XMM_Q(n) _q_XMMReg[n]
+
+#define YMM_Q(n) _q_YMMReg[n]
+#define YMM_X(n) _x_YMMReg[n]
#define MMX_B(n) _b_MMXReg[n]
#define MMX_W(n) _w_MMXReg[n]
@@ -1556,8 +1588,8 @@ typedef struct CPUArchState {
float_status mmx_status; /* for 3DNow! float ops */
float_status sse_status;
uint32_t mxcsr;
- ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32];
- ZMMReg xmm_t0;
+ ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32] QEMU_ALIGNED(16);
+ ZMMReg xmm_t0 QEMU_ALIGNED(16);
MMXReg mmx_t0;
uint64_t opmask_regs[NB_OPMASK_REGS];
@@ -2082,6 +2114,8 @@ typedef struct PropValue {
} PropValue;
void x86_cpu_apply_props(X86CPU *cpu, PropValue *props);
+void x86_cpu_after_reset(X86CPU *cpu);
+
uint32_t cpu_x86_virtual_addr_width(CPUX86State *env);
/* cpu.c other functions (cpuid) */
@@ -2094,6 +2128,7 @@ void host_cpuid(uint32_t function, uint32_t count,
/* helper.c */
void x86_cpu_set_a20(X86CPU *cpu, int a20_state);
+void cpu_sync_avx_hflag(CPUX86State *env);
#ifndef CONFIG_USER_ONLY
static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs)
@@ -2147,6 +2182,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
#define MMU_KSMAP_IDX 0
#define MMU_USER_IDX 1
#define MMU_KNOSMAP_IDX 2
+#define MMU_NESTED_IDX 3
+#define MMU_PHYS_IDX 4
+
static inline int cpu_mmu_index(CPUX86State *env, bool ifetch)
{
return (env->hflags & HF_CPL_MASK) == 3 ? MMU_USER_IDX :
@@ -2382,8 +2420,6 @@ static inline bool ctl_has_irq(CPUX86State *env)
return (env->int_ctl & V_IRQ_MASK) && (int_prio >= tpr);
}
-hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
- int *prot);
#if defined(TARGET_X86_64) && \
defined(CONFIG_USER_ONLY) && \
defined(CONFIG_LINUX)
diff --git a/target/i386/helper.c b/target/i386/helper.c
index b954ccd..b62a1e4 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -29,6 +29,17 @@
#endif
#include "qemu/log.h"
+void cpu_sync_avx_hflag(CPUX86State *env)
+{
+ if ((env->cr[4] & CR4_OSXSAVE_MASK)
+ && (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK))
+ == (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) {
+ env->hflags |= HF_AVX_EN_MASK;
+ } else{
+ env->hflags &= ~HF_AVX_EN_MASK;
+ }
+}
+
void cpu_sync_bndcs_hflags(CPUX86State *env)
{
uint32_t hflags = env->hflags;
@@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
env->hflags = hflags;
cpu_sync_bndcs_hflags(env);
+ cpu_sync_avx_hflag(env);
}
#if !defined(CONFIG_USER_ONLY)
diff --git a/target/i386/helper.h b/target/i386/helper.h
index 39a3c24..88143b2 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -212,12 +212,13 @@ DEF_HELPER_2(ldmxcsr, void, env, i32)
DEF_HELPER_1(update_mxcsr, void, env)
DEF_HELPER_1(enter_mmx, void, env)
DEF_HELPER_1(emms, void, env)
-DEF_HELPER_3(movq, void, env, ptr, ptr)
#define SHIFT 0
#include "ops_sse_header.h"
#define SHIFT 1
#include "ops_sse_header.h"
+#define SHIFT 2
+#include "ops_sse_header.h"
DEF_HELPER_3(rclb, tl, env, tl, tl)
DEF_HELPER_3(rclw, tl, env, tl, tl)
diff --git a/target/i386/kvm/hyperv.c b/target/i386/kvm/hyperv.c
index 9026ef3..e3ac978 100644
--- a/target/i386/kvm/hyperv.c
+++ b/target/i386/kvm/hyperv.c
@@ -23,6 +23,10 @@ int hyperv_x86_synic_add(X86CPU *cpu)
return 0;
}
+/*
+ * All devices possibly using SynIC have to be reset before calling this to let
+ * them remove their SINT routes first.
+ */
void hyperv_x86_synic_reset(X86CPU *cpu)
{
hyperv_synic_reset(CPU(cpu));
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index bed6c00..dac100c 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2203,20 +2203,30 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
env->mp_state = KVM_MP_STATE_RUNNABLE;
}
+ /* enabled by default */
+ env->poll_control_msr = 1;
+
+ kvm_init_nested_state(env);
+
+ sev_es_set_reset_vector(CPU(cpu));
+}
+
+void kvm_arch_after_reset_vcpu(X86CPU *cpu)
+{
+ CPUX86State *env = &cpu->env;
+ int i;
+
+ /*
+ * Reset SynIC after all other devices have been reset to let them remove
+ * their SINT routes first.
+ */
if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
- int i;
for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
}
hyperv_x86_synic_reset(cpu);
}
- /* enabled by default */
- env->poll_control_msr = 1;
-
- kvm_init_nested_state(env);
-
- sev_es_set_reset_vector(CPU(cpu));
}
void kvm_arch_do_init_vcpu(X86CPU *cpu)
diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
index 2ed586c..b7c38ba 100644
--- a/target/i386/kvm/kvm_i386.h
+++ b/target/i386/kvm/kvm_i386.h
@@ -38,6 +38,7 @@ bool kvm_has_adjust_clock_stable(void);
bool kvm_has_exception_payload(void);
void kvm_synchronize_all_tsc(void);
void kvm_arch_reset_vcpu(X86CPU *cs);
+void kvm_arch_after_reset_vcpu(X86CPU *cpu);
void kvm_arch_do_init_vcpu(X86CPU *cs);
void kvm_put_apicbase(X86CPU *cpu, uint64_t value);
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 7bf8bb9..d35fc15 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -35,7 +35,11 @@
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
+#if SHIFT == 1
#define SUFFIX _xmm
+#else
+#define SUFFIX _ymm
+#endif
#endif
#define LANE_WIDTH (SHIFT ? 16 : 8)
@@ -48,9 +52,8 @@
#define FPSLL(x, c) ((x) << shift)
#endif
-void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 15) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -64,9 +67,8 @@ void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 15) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -80,9 +82,8 @@ void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 15) {
shift = 15;
@@ -94,9 +95,8 @@ void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 31) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -110,9 +110,8 @@ void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 31) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -126,9 +125,8 @@ void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 31) {
shift = 31;
@@ -140,9 +138,8 @@ void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 63) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -156,9 +153,8 @@ void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift;
if (c->Q(0) > 63) {
for (int i = 0; i < 1 << SHIFT; i++) {
@@ -173,9 +169,8 @@ void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
#if SHIFT >= 1
-void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift, i, j;
shift = c->L(0);
@@ -192,9 +187,8 @@ void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
}
-void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
- Reg *s = d;
int shift, i, j;
shift = c->L(0);
@@ -222,9 +216,8 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
}
#define SSE_HELPER_2(name, elem, num, F) \
- void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
int n = num; \
for (int i = 0; i < n; i++) { \
d->elem(i) = F(v->elem(i), s->elem(i)); \
@@ -304,17 +297,6 @@ static inline int satsw(int x)
#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
-#define FAND(a, b) ((a) & (b))
-#define FANDN(a, b) ((~(a)) & (b))
-#define FOR(a, b) ((a) | (b))
-#define FXOR(a, b) ((a) ^ (b))
-
-#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
-#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
-#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
-#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)
-
-#define FMULLW(a, b) ((a) * (b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) ((a) * (b) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)
@@ -322,58 +304,24 @@ static inline int satsw(int x)
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
#endif
-SSE_HELPER_B(helper_paddb, FADD)
-SSE_HELPER_W(helper_paddw, FADD)
-SSE_HELPER_L(helper_paddl, FADD)
-SSE_HELPER_Q(helper_paddq, FADD)
-
-SSE_HELPER_B(helper_psubb, FSUB)
-SSE_HELPER_W(helper_psubw, FSUB)
-SSE_HELPER_L(helper_psubl, FSUB)
-SSE_HELPER_Q(helper_psubq, FSUB)
-
-SSE_HELPER_B(helper_paddusb, FADDUB)
-SSE_HELPER_B(helper_paddsb, FADDSB)
-SSE_HELPER_B(helper_psubusb, FSUBUB)
-SSE_HELPER_B(helper_psubsb, FSUBSB)
-
-SSE_HELPER_W(helper_paddusw, FADDUW)
-SSE_HELPER_W(helper_paddsw, FADDSW)
-SSE_HELPER_W(helper_psubusw, FSUBUW)
-SSE_HELPER_W(helper_psubsw, FSUBSW)
-
-SSE_HELPER_B(helper_pminub, FMINUB)
-SSE_HELPER_B(helper_pmaxub, FMAXUB)
-
-SSE_HELPER_W(helper_pminsw, FMINSW)
-SSE_HELPER_W(helper_pmaxsw, FMAXSW)
-
-SSE_HELPER_Q(helper_pand, FAND)
-SSE_HELPER_Q(helper_pandn, FANDN)
-SSE_HELPER_Q(helper_por, FOR)
-SSE_HELPER_Q(helper_pxor, FXOR)
-
-SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
-SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
-SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
-SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
-SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
+SSE_HELPER_W(helper_pmulhuw, FMULHUW)
+SSE_HELPER_W(helper_pmulhw, FMULHW)
-SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
-SSE_HELPER_W(helper_pmulhrw, FMULHRW)
+void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ d->W(0) = FMULHRW(d->W(0), s->W(0));
+ d->W(1) = FMULHRW(d->W(1), s->W(1));
+ d->W(2) = FMULHRW(d->W(2), s->W(2));
+ d->W(3) = FMULHRW(d->W(3), s->W(3));
+}
#endif
-SSE_HELPER_W(helper_pmulhuw, FMULHUW)
-SSE_HELPER_W(helper_pmulhw, FMULHW)
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
-void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < (1 << SHIFT); i++) {
@@ -381,9 +329,8 @@ void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < (2 << SHIFT); i++) {
@@ -402,10 +349,8 @@ static inline int abs1(int a)
}
}
#endif
-
-void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < (1 << SHIFT); i++) {
@@ -436,29 +381,6 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
#endif
-void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
-{
- int i;
-
- d->L(0) = val;
- d->L(1) = 0;
- for (i = 1; i < (1 << SHIFT); i++) {
- d->Q(i) = 0;
- }
-}
-
-#ifdef TARGET_X86_64
-void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
-{
- int i;
-
- d->Q(0) = val;
- for (i = 1; i < (1 << SHIFT); i++) {
- d->Q(i) = 0;
- }
-}
-#endif
-
#define SHUFFLE4(F, a, b, offset) do { \
r0 = a->F((order & 3) + offset); \
r1 = a->F(((order >> 2) & 3) + offset); \
@@ -478,9 +400,8 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
SHUFFLE4(W, s, s, 0);
}
#else
-void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
{
- Reg *v = d;
uint32_t r0, r1, r2, r3;
int i;
@@ -489,9 +410,8 @@ void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
}
}
-void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
{
- Reg *v = d;
uint64_t r0, r1;
int i;
@@ -543,9 +463,8 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
#define SSE_HELPER_P(name, F) \
void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
int i; \
for (i = 0; i < 2 << SHIFT; i++) { \
d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i)); \
@@ -553,9 +472,8 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
} \
\
void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
int i; \
for (i = 0; i < 1 << SHIFT; i++) { \
d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \
@@ -567,16 +485,22 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
#define SSE_HELPER_S(name, F) \
SSE_HELPER_P(name, F) \
\
- void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\
+ void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
{ \
- Reg *v = d; \
+ int i; \
d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \
+ for (i = 1; i < 2 << SHIFT; i++) { \
+ d->ZMM_L(i) = v->ZMM_L(i); \
+ } \
} \
\
- void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\
+ void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
{ \
- Reg *v = d; \
+ int i; \
d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \
+ for (i = 1; i < 1 << SHIFT; i++) { \
+ d->ZMM_Q(i) = v->ZMM_Q(i); \
+ } \
}
#else
@@ -623,14 +547,22 @@ void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
#if SHIFT == 1
-void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s)
+void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
+ int i;
d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
+ for (i = 1; i < 2 << SHIFT; i++) {
+ d->ZMM_L(i) = v->ZMM_L(i);
+ }
}
-void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s)
+void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
+ int i;
d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
+ for (i = 1; i < 1 << SHIFT; i++) {
+ d->ZMM_Q(i) = v->ZMM_Q(i);
+ }
}
#endif
@@ -655,14 +587,22 @@ void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
#if SHIFT == 1
-void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
+void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
+ int i;
d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
+ for (i = 1; i < 1 << SHIFT; i++) {
+ d->ZMM_Q(i) = v->ZMM_Q(i);
+ }
}
-void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
+void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
+ int i;
d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
+ for (i = 1; i < 2 << SHIFT; i++) {
+ d->ZMM_L(i) = v->ZMM_L(i);
+ }
}
#endif
@@ -882,13 +822,17 @@ void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
}
#if SHIFT == 1
-void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
uint8_t old_flags = get_float_exception_flags(&env->sse_status);
+ int i;
d->ZMM_S(0) = float32_div(float32_one,
float32_sqrt(s->ZMM_S(0), &env->sse_status),
&env->sse_status);
set_float_exception_flags(old_flags, &env->sse_status);
+ for (i = 1; i < 2 << SHIFT; i++) {
+ d->ZMM_L(i) = v->ZMM_L(i);
+ }
}
#endif
@@ -903,10 +847,14 @@ void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
}
#if SHIFT == 1
-void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
uint8_t old_flags = get_float_exception_flags(&env->sse_status);
+ int i;
d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
+ for (i = 1; i < 2 << SHIFT; i++) {
+ d->ZMM_L(i) = v->ZMM_L(i);
+ }
set_float_exception_flags(old_flags, &env->sse_status);
}
#endif
@@ -958,9 +906,8 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int len
#endif
#define SSE_HELPER_HPS(name, F) \
-void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
float32 r[2 << SHIFT]; \
int i, j, k; \
for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
@@ -980,9 +927,8 @@ SSE_HELPER_HPS(haddps, float32_add)
SSE_HELPER_HPS(hsubps, float32_sub)
#define SSE_HELPER_HPD(name, F) \
-void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
float64 r[1 << SHIFT]; \
int i, j, k; \
for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
@@ -1001,9 +947,8 @@ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
SSE_HELPER_HPD(haddpd, float64_add)
SSE_HELPER_HPD(hsubpd, float64_sub)
-void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < 2 << SHIFT; i += 2) {
d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
@@ -1011,9 +956,8 @@ void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < 1 << SHIFT; i += 2) {
d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
@@ -1023,9 +967,8 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
#define SSE_HELPER_CMP_P(name, F, C) \
void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
int i; \
for (i = 0; i < 2 << SHIFT; i++) { \
d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \
@@ -1033,9 +976,8 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
} \
\
void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
int i; \
for (i = 0; i < 1 << SHIFT; i++) { \
d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \
@@ -1045,22 +987,39 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
#if SHIFT == 1
#define SSE_HELPER_CMP(name, F, C) \
SSE_HELPER_CMP_P(name, F, C) \
- void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \
+ void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
+ int i; \
d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \
+ for (i = 1; i < 2 << SHIFT; i++) { \
+ d->ZMM_L(i) = v->ZMM_L(i); \
+ } \
} \
\
- void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \
+ void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
+ int i; \
d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \
+ for (i = 1; i < 1 << SHIFT; i++) { \
+ d->ZMM_Q(i) = v->ZMM_Q(i); \
+ } \
}
+static inline bool FPU_EQU(FloatRelation x)
+{
+ return (x == float_relation_equal || x == float_relation_unordered);
+}
+static inline bool FPU_GE(FloatRelation x)
+{
+ return (x == float_relation_equal || x == float_relation_greater);
+}
#define FPU_EQ(x) (x == float_relation_equal)
#define FPU_LT(x) (x == float_relation_less)
#define FPU_LE(x) (x <= float_relation_equal)
+#define FPU_GT(x) (x == float_relation_greater)
#define FPU_UNORD(x) (x == float_relation_unordered)
+/* We must make sure we evaluate the argument in case it is a signalling NAN */
+#define FPU_FALSE(x) (x == float_relation_equal && 0)
#define FPU_CMPQ(size, a, b) \
float ## size ## _compare_quiet(a, b, &env->sse_status)
@@ -1080,6 +1039,33 @@ SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
+SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
+SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
+SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
+SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE)
+SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
+SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
+SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
+SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE)
+
+SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
+SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
+SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
+SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD)
+SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
+SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
+SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
+SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)
+
+SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
+SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
+SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
+SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE)
+SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
+SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
+SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
+SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE)
+
#undef SSE_HELPER_CMP
#if SHIFT == 1
@@ -1156,32 +1142,10 @@ uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
#endif
-uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
-{
- uint32_t val;
- int i;
-
- val = 0;
- for (i = 0; i < (1 << SHIFT); i++) {
- uint8_t byte = 0;
- byte |= (s->B(8 * i + 0) >> 7);
- byte |= (s->B(8 * i + 1) >> 6) & 0x02;
- byte |= (s->B(8 * i + 2) >> 5) & 0x04;
- byte |= (s->B(8 * i + 3) >> 4) & 0x08;
- byte |= (s->B(8 * i + 4) >> 3) & 0x10;
- byte |= (s->B(8 * i + 5) >> 2) & 0x20;
- byte |= (s->B(8 * i + 6) >> 1) & 0x40;
- byte |= (s->B(8 * i + 7)) & 0x80;
- val |= byte << (8 * i);
- }
- return val;
-}
-
#define PACK_HELPER_B(name, F) \
void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint8_t r[PACK_WIDTH * 2]; \
int j, k; \
for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
@@ -1200,9 +1164,8 @@ void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
PACK_HELPER_B(sswb, satsb)
PACK_HELPER_B(uswb, satub)
-void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
uint16_t r[PACK_WIDTH];
int j, k;
@@ -1222,9 +1185,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
#define UNPCK_OP(base_name, base) \
\
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint8_t r[PACK_WIDTH * 2]; \
int j, i; \
\
@@ -1241,9 +1203,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint16_t r[PACK_WIDTH]; \
int j, i; \
\
@@ -1260,9 +1221,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint32_t r[PACK_WIDTH / 2]; \
int j, i; \
\
@@ -1280,9 +1240,8 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
\
XMM_ONLY( \
void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
- CPUX86State *env, Reg *d, Reg *s) \
+ CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint64_t r[2]; \
int i; \
\
@@ -1453,9 +1412,8 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
#endif
/* SSSE3 op helpers */
-void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
#if SHIFT == 0
uint8_t r[8];
@@ -1480,9 +1438,8 @@ void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
#define SSE_HELPER_HW(name, F) \
-void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint16_t r[4 << SHIFT]; \
int i, j, k; \
for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
@@ -1499,9 +1456,8 @@ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
}
#define SSE_HELPER_HL(name, F) \
-void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{ \
- Reg *v = d; \
uint32_t r[2 << SHIFT]; \
int i, j, k; \
for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
@@ -1527,9 +1483,8 @@ SSE_HELPER_HL(phsubd, FSUB)
#undef SSE_HELPER_HW
#undef SSE_HELPER_HL
-void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < 4 << SHIFT; i++) {
d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
@@ -1537,13 +1492,6 @@ void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x)
-#define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x)
-#define FABSL(x) (x > INT32_MAX ? -(int32_t)x : x)
-SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB)
-SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW)
-SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL)
-
#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
@@ -1554,19 +1502,18 @@ SSE_HELPER_B(helper_psignb, FSIGNB)
SSE_HELPER_W(helper_psignw, FSIGNW)
SSE_HELPER_L(helper_psignd, FSIGNL)
-void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
- int32_t shift)
+void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
+ uint32_t imm)
{
- Reg *v = d;
int i;
/* XXX could be checked during translation */
- if (shift >= (SHIFT ? 32 : 16)) {
+ if (imm >= (SHIFT ? 32 : 16)) {
for (i = 0; i < (1 << SHIFT); i++) {
d->Q(i) = 0;
}
} else {
- shift <<= 3;
+ int shift = imm * 8;
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
d->Q(0) = SHR(s->Q(0), shift - 0) |
@@ -1594,10 +1541,9 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#if SHIFT >= 1
#define SSE_HELPER_V(name, elem, num, F) \
- void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \
+ Reg *m) \
{ \
- Reg *v = d; \
- Reg *m = &env->xmm_regs[0]; \
int i; \
for (i = 0; i < num; i++) { \
d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \
@@ -1605,10 +1551,9 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
#define SSE_HELPER_I(name, elem, num, F) \
- void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, \
+ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \
uint32_t imm) \
{ \
- Reg *v = d; \
int i; \
for (i = 0; i < num; i++) { \
int j = i & 7; \
@@ -1636,6 +1581,10 @@ void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
}
+#define FMOVSLDUP(i) s->L((i) & ~1)
+#define FMOVSHDUP(i) s->L((i) | 1)
+#define FMOVDLDUP(i) s->Q((i) & ~1)
+
#define SSE_HELPER_F(name, elem, num, F) \
void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
{ \
@@ -1658,11 +1607,13 @@ SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
+SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
+SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
+SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif
-void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
int i;
for (i = 0; i < 1 << SHIFT; i++) {
@@ -1670,12 +1621,8 @@ void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-#define FCMPEQQ(d, s) (d == s ? -1 : 0)
-SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
-
-void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
- Reg *v = d;
uint16_t r[8];
int i, j, k;
@@ -1694,22 +1641,6 @@ void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
-#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
-#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
-#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
-SSE_HELPER_B(helper_pminsb, FMINSB)
-SSE_HELPER_L(helper_pminsd, FMINSD)
-SSE_HELPER_W(helper_pminuw, MIN)
-SSE_HELPER_L(helper_pminud, MIN)
-SSE_HELPER_B(helper_pmaxsb, FMAXSB)
-SSE_HELPER_L(helper_pmaxsd, FMAXSD)
-SSE_HELPER_W(helper_pmaxuw, MAX)
-SSE_HELPER_L(helper_pmaxud, MAX)
-
-#define FMULLD(d, s) ((int32_t)d * (int32_t)s)
-SSE_HELPER_L(helper_pmulld, FMULLD)
-
#if SHIFT == 1
void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
@@ -1819,11 +1750,12 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
#if SHIFT == 1
-void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint32_t mode)
{
uint8_t old_flags = get_float_exception_flags(&env->sse_status);
signed char prev_rounding_mode;
+ int i;
prev_rounding_mode = env->sse_status.float_rounding_mode;
if (!(mode & (1 << 2))) {
@@ -1844,6 +1776,9 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
+ for (i = 1; i < 2 << SHIFT; i++) {
+ d->ZMM_L(i) = v->ZMM_L(i);
+ }
if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
@@ -1853,11 +1788,12 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
env->sse_status.float_rounding_mode = prev_rounding_mode;
}
-void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint32_t mode)
{
uint8_t old_flags = get_float_exception_flags(&env->sse_status);
signed char prev_rounding_mode;
+ int i;
prev_rounding_mode = env->sse_status.float_rounding_mode;
if (!(mode & (1 << 2))) {
@@ -1878,6 +1814,9 @@ void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
+ for (i = 1; i < 1 << SHIFT; i++) {
+ d->ZMM_Q(i) = v->ZMM_Q(i);
+ }
if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
@@ -1893,10 +1832,9 @@ SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
-void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint32_t mask)
{
- Reg *v = d;
float32 prod1, prod2, temp2, temp3, temp4;
int i;
@@ -1939,9 +1877,8 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#if SHIFT == 1
/* Oddly, there is no ymm version of dppd */
void glue(helper_dppd, SUFFIX)(CPUX86State *env,
- Reg *d, Reg *s, uint32_t mask)
+ Reg *d, Reg *v, Reg *s, uint32_t mask)
{
- Reg *v = d;
float64 prod1, prod2, temp2;
if (mask & (1 << 4)) {
@@ -1960,10 +1897,9 @@ void glue(helper_dppd, SUFFIX)(CPUX86State *env,
}
#endif
-void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint32_t offset)
{
- Reg *v = d;
int i, j;
uint16_t r[8];
@@ -1985,9 +1921,6 @@ void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
/* SSE4.2 op helpers */
-#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
-SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
-
#if SHIFT == 1
static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
{
@@ -2043,7 +1976,7 @@ static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
}
static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
- int8_t ctrl, int valids, int validd)
+ uint8_t ctrl, int valids, int validd)
{
unsigned int res = 0;
int v;
@@ -2236,10 +2169,9 @@ static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
}
#endif
-void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
uint32_t ctrl)
{
- Reg *v = d;
uint64_t a, b;
int i;
@@ -2250,10 +2182,10 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
}
}
-void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
int i;
- Reg st = *d;
+ Reg st = *v;
Reg rk = *s;
for (i = 0 ; i < 2 << SHIFT ; i++) {
@@ -2265,10 +2197,10 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
int i;
- Reg st = *d;
+ Reg st = *v;
Reg rk = *s;
for (i = 0; i < 8 << SHIFT; i++) {
@@ -2276,10 +2208,10 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
int i;
- Reg st = *d;
+ Reg st = *v;
Reg rk = *s;
for (i = 0 ; i < 2 << SHIFT ; i++) {
@@ -2291,10 +2223,10 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
}
}
-void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
int i;
- Reg st = *d;
+ Reg st = *v;
Reg rk = *s;
for (i = 0; i < 8 << SHIFT; i++) {
@@ -2332,8 +2264,290 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#endif
#endif
+#if SHIFT >= 1
+void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ uint64_t r0, r1;
+ int i;
+
+ for (i = 0; i < 1 << SHIFT; i += 2) {
+ r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
+ r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
+ d->Q(i) = r0;
+ d->Q(i+1) = r1;
+ }
+}
+
+void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ uint32_t r0, r1, r2, r3;
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ r0 = v->L(i + (s->L(i) & 3));
+ r1 = v->L(i + (s->L(i+1) & 3));
+ r2 = v->L(i + (s->L(i+2) & 3));
+ r3 = v->L(i + (s->L(i+3) & 3));
+ d->L(i) = r0;
+ d->L(i+1) = r1;
+ d->L(i+2) = r2;
+ d->L(i+3) = r3;
+ }
+}
+
+void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
+{
+ uint64_t r0, r1;
+ int i;
+
+ for (i = 0; i < 1 << SHIFT; i += 2) {
+ r0 = s->Q(i + ((order >> 0) & 1));
+ r1 = s->Q(i + ((order >> 1) & 1));
+ d->Q(i) = r0;
+ d->Q(i+1) = r1;
+
+ order >>= 2;
+ }
+}
+
+void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
+{
+ uint32_t r0, r1, r2, r3;
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ r0 = s->L(i + ((order >> 0) & 3));
+ r1 = s->L(i + ((order >> 2) & 3));
+ r2 = s->L(i + ((order >> 4) & 3));
+ r3 = s->L(i + ((order >> 6) & 3));
+ d->L(i) = r0;
+ d->L(i+1) = r1;
+ d->L(i+2) = r2;
+ d->L(i+3) = r3;
+ }
+}
+
+#if SHIFT == 1
+#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
+#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
+#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31))
+#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
+#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
+#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
+#endif
+
+SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
+SSE_HELPER_L(helper_vpsravd, FPSRAVD)
+SSE_HELPER_L(helper_vpsllvd, FPSLLVD)
+
+SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
+SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
+SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
+
+void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ uint32_t zf = 0, cf = 0;
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i++) {
+ zf |= (s->L(i) & d->L(i));
+ cf |= (s->L(i) & ~d->L(i));
+ }
+ CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
+}
+
+void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ uint64_t zf = 0, cf = 0;
+ int i;
+
+ for (i = 0; i < 1 << SHIFT; i++) {
+ zf |= (s->Q(i) & d->Q(i));
+ cf |= (s->Q(i) & ~d->Q(i));
+ }
+ CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
+}
+
+void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
+ }
+ }
+}
+
+void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
+ }
+ }
+}
+
+void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
+ }
+}
+
+void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ int i;
+
+ for (i = 0; i < (1 << SHIFT); i++) {
+ d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
+ }
+}
+
+void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ target_ulong addr = a0
+ + ((target_ulong)(int32_t)s->L(i) << scale);
+ d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+ }
+ v->L(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ target_ulong addr = a0
+ + ((target_ulong)(int32_t)s->L(i) << scale);
+ d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
+ }
+ v->Q(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ target_ulong addr = a0
+ + ((target_ulong)(int64_t)s->Q(i) << scale);
+ d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+ }
+ v->L(i) = 0;
+ }
+ for (i /= 2; i < 1 << SHIFT; i++) {
+ d->Q(i) = 0;
+ v->Q(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ target_ulong addr = a0
+ + ((target_ulong)(int64_t)s->Q(i) << scale);
+ d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
+ }
+ v->Q(i) = 0;
+ }
+}
+#endif
+
+#if SHIFT >= 2
+void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
+{
+ uint64_t r0, r1, r2, r3;
+
+ switch (order & 3) {
+ case 0:
+ r0 = v->Q(0);
+ r1 = v->Q(1);
+ break;
+ case 1:
+ r0 = v->Q(2);
+ r1 = v->Q(3);
+ break;
+ case 2:
+ r0 = s->Q(0);
+ r1 = s->Q(1);
+ break;
+ case 3:
+ r0 = s->Q(2);
+ r1 = s->Q(3);
+ break;
+ }
+ switch ((order >> 4) & 3) {
+ case 0:
+ r2 = v->Q(0);
+ r3 = v->Q(1);
+ break;
+ case 1:
+ r2 = v->Q(2);
+ r3 = v->Q(3);
+ break;
+ case 2:
+ r2 = s->Q(0);
+ r3 = s->Q(1);
+ break;
+ case 3:
+ r2 = s->Q(2);
+ r3 = s->Q(3);
+ break;
+ }
+ d->Q(0) = r0;
+ d->Q(1) = r1;
+ d->Q(2) = r2;
+ d->Q(3) = r3;
+}
+
+void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
+{
+ uint64_t r0, r1, r2, r3;
+ r0 = s->Q(order & 3);
+ r1 = s->Q((order >> 2) & 3);
+ r2 = s->Q((order >> 4) & 3);
+ r3 = s->Q((order >> 6) & 3);
+ d->Q(0) = r0;
+ d->Q(1) = r1;
+ d->Q(2) = r2;
+ d->Q(3) = r3;
+}
+
+void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
+{
+ uint32_t r[8];
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ r[i] = s->L(v->L(i) & 7);
+ }
+ for (i = 0; i < 8; i++) {
+ d->L(i) = r[i];
+ }
+}
+#endif
+
#undef SSE_HELPER_S
+#undef LANE_WIDTH
#undef SHIFT
#undef XMM_ONLY
#undef Reg
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 400b24c0..2f1f811 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -21,7 +21,11 @@
#define SUFFIX _mmx
#else
#define Reg ZMMReg
+#if SHIFT == 1
#define SUFFIX _xmm
+#else
+#define SUFFIX _ymm
+#endif
#endif
#define dh_alias_Reg ptr
@@ -34,74 +38,34 @@
#define dh_typecode_ZMMReg dh_typecode_ptr
#define dh_typecode_MMXReg dh_typecode_ptr
-DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psllw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psrld, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psrad, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg)
-
-#if SHIFT == 1
-DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(psrlw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psraw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psllw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psrld, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psrad, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pslld, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psrlq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psllq, SUFFIX), void, env, Reg, Reg, Reg)
+
+#if SHIFT >= 1
+DEF_HELPER_4(glue(psrldq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pslldq, SUFFIX), void, env, Reg, Reg, Reg)
#endif
#define SSE_HELPER_B(name, F)\
- DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
+ DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg)
#define SSE_HELPER_W(name, F)\
- DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
+ DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg)
#define SSE_HELPER_L(name, F)\
- DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
+ DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg)
#define SSE_HELPER_Q(name, F)\
- DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
-
-SSE_HELPER_B(paddb, FADD)
-SSE_HELPER_W(paddw, FADD)
-SSE_HELPER_L(paddl, FADD)
-SSE_HELPER_Q(paddq, FADD)
-
-SSE_HELPER_B(psubb, FSUB)
-SSE_HELPER_W(psubw, FSUB)
-SSE_HELPER_L(psubl, FSUB)
-SSE_HELPER_Q(psubq, FSUB)
-
-SSE_HELPER_B(paddusb, FADDUB)
-SSE_HELPER_B(paddsb, FADDSB)
-SSE_HELPER_B(psubusb, FSUBUB)
-SSE_HELPER_B(psubsb, FSUBSB)
-
-SSE_HELPER_W(paddusw, FADDUW)
-SSE_HELPER_W(paddsw, FADDSW)
-SSE_HELPER_W(psubusw, FSUBUW)
-SSE_HELPER_W(psubsw, FSUBSW)
-
-SSE_HELPER_B(pminub, FMINUB)
-SSE_HELPER_B(pmaxub, FMAXUB)
-
-SSE_HELPER_W(pminsw, FMINSW)
-SSE_HELPER_W(pmaxsw, FMAXSW)
-
-SSE_HELPER_Q(pand, FAND)
-SSE_HELPER_Q(pandn, FANDN)
-SSE_HELPER_Q(por, FOR)
-SSE_HELPER_Q(pxor, FXOR)
-
-SSE_HELPER_B(pcmpgtb, FCMPGTB)
-SSE_HELPER_W(pcmpgtw, FCMPGTW)
-SSE_HELPER_L(pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(pcmpeqb, FCMPEQ)
-SSE_HELPER_W(pcmpeqw, FCMPEQ)
-SSE_HELPER_L(pcmpeql, FCMPEQ)
+ DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg)
-SSE_HELPER_W(pmullw, FMULLW)
#if SHIFT == 0
-SSE_HELPER_W(pmulhrw, FMULHRW)
+DEF_HELPER_3(glue(pmulhrw, SUFFIX), void, env, Reg, Reg)
#endif
SSE_HELPER_W(pmulhuw, FMULHUW)
SSE_HELPER_W(pmulhw, FMULHW)
@@ -109,51 +73,74 @@ SSE_HELPER_W(pmulhw, FMULHW)
SSE_HELPER_B(pavgb, FAVG)
SSE_HELPER_W(pavgw, FAVG)
-DEF_HELPER_3(glue(pmuludq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaddwd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(pmuludq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pmaddwd, SUFFIX), void, env, Reg, Reg, Reg)
-DEF_HELPER_3(glue(psadbw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(psadbw, SUFFIX), void, env, Reg, Reg, Reg)
+#if SHIFT < 2
DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
-DEF_HELPER_2(glue(movl_mm_T0, SUFFIX), void, Reg, i32)
-#ifdef TARGET_X86_64
-DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64)
#endif
#if SHIFT == 0
DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int)
#else
-DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int)
DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int)
DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int)
DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
#endif
-#if SHIFT == 1
+#if SHIFT >= 1
/* FPU ops */
/* XXX: not accurate */
-#define SSE_HELPER_S(name, F) \
- DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \
- DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
+/*
+ * Declare the packed helpers <name>ps and <name>pd (glued with SUFFIX),
+ * each taking env plus three Reg arguments.
+ */
+#define SSE_HELPER_P4(name) \
+ DEF_HELPER_4(glue(name ## ps, SUFFIX), void, env, Reg, Reg, Reg) \
+ DEF_HELPER_4(glue(name ## pd, SUFFIX), void, env, Reg, Reg, Reg)
+
+/*
+ * Same as SSE_HELPER_P4, but the helpers take env plus two Reg arguments.
+ * Any extra macro arguments are accepted and ignored.
+ */
+#define SSE_HELPER_P3(name, ...) \
+ DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
+ DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg)
+
+/*
+ * SSE_HELPER_S4/S3 additionally declare the scalar helpers <name>ss and
+ * <name>sd (no SUFFIX) when SHIFT == 1; for any other SHIFT they only
+ * declare the packed helpers.  Note that the scalar helpers always take
+ * env plus three Reg arguments, even in the S3 variant whose packed
+ * helpers take only two.
+ */
+#if SHIFT == 1
+#define SSE_HELPER_S4(name) \
+ SSE_HELPER_P4(name) \
+ DEF_HELPER_4(name ## ss, void, env, Reg, Reg, Reg) \
+ DEF_HELPER_4(name ## sd, void, env, Reg, Reg, Reg)
+#define SSE_HELPER_S3(name) \
+ SSE_HELPER_P3(name) \
+ DEF_HELPER_4(name ## ss, void, env, Reg, Reg, Reg) \
+ DEF_HELPER_4(name ## sd, void, env, Reg, Reg, Reg)
+#else
+#define SSE_HELPER_S4(name, ...) SSE_HELPER_P4(name)
+#define SSE_HELPER_S3(name, ...) SSE_HELPER_P3(name)
+#endif
+
+DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, int)
+DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, int)
-SSE_HELPER_S(add, FPU_ADD)
-SSE_HELPER_S(sub, FPU_SUB)
-SSE_HELPER_S(mul, FPU_MUL)
-SSE_HELPER_S(div, FPU_DIV)
-SSE_HELPER_S(min, FPU_MIN)
-SSE_HELPER_S(max, FPU_MAX)
-SSE_HELPER_S(sqrt, FPU_SQRT)
+SSE_HELPER_S4(add)
+SSE_HELPER_S4(sub)
+SSE_HELPER_S4(mul)
+SSE_HELPER_S4(div)
+SSE_HELPER_S4(min)
+SSE_HELPER_S4(max)
+SSE_HELPER_S3(sqrt)
DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(cvtss2sd, void, env, Reg, Reg)
-DEF_HELPER_3(cvtsd2ss, void, env, Reg, Reg)
DEF_HELPER_3(glue(cvtdq2ps, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(cvtdq2pd, SUFFIX), void, env, Reg, Reg)
+
+DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
+DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
+
+DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
+DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
+
+#if SHIFT == 1
+DEF_HELPER_4(cvtss2sd, void, env, Reg, Reg, Reg)
+DEF_HELPER_4(cvtsd2ss, void, env, Reg, Reg, Reg)
DEF_HELPER_3(cvtpi2ps, void, env, ZMMReg, MMXReg)
DEF_HELPER_3(cvtpi2pd, void, env, ZMMReg, MMXReg)
DEF_HELPER_3(cvtsi2ss, void, env, ZMMReg, i32)
@@ -164,8 +151,6 @@ DEF_HELPER_3(cvtsq2ss, void, env, ZMMReg, i64)
DEF_HELPER_3(cvtsq2sd, void, env, ZMMReg, i64)
#endif
-DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
DEF_HELPER_3(cvtps2pi, void, env, MMXReg, ZMMReg)
DEF_HELPER_3(cvtpd2pi, void, env, MMXReg, ZMMReg)
DEF_HELPER_2(cvtss2si, s32, env, ZMMReg)
@@ -175,8 +160,6 @@ DEF_HELPER_2(cvtss2sq, s64, env, ZMMReg)
DEF_HELPER_2(cvtsd2sq, s64, env, ZMMReg)
#endif
-DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg)
DEF_HELPER_3(cvttps2pi, void, env, MMXReg, ZMMReg)
DEF_HELPER_3(cvttpd2pi, void, env, MMXReg, ZMMReg)
DEF_HELPER_2(cvttss2si, s32, env, ZMMReg)
@@ -185,27 +168,25 @@ DEF_HELPER_2(cvttsd2si, s32, env, ZMMReg)
DEF_HELPER_2(cvttss2sq, s64, env, ZMMReg)
DEF_HELPER_2(cvttsd2sq, s64, env, ZMMReg)
#endif
+#endif
DEF_HELPER_3(glue(rsqrtps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(rsqrtss, void, env, ZMMReg, ZMMReg)
DEF_HELPER_3(glue(rcpps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(rcpss, void, env, ZMMReg, ZMMReg)
+
+#if SHIFT == 1
+DEF_HELPER_4(rsqrtss, void, env, ZMMReg, ZMMReg, ZMMReg)
+DEF_HELPER_4(rcpss, void, env, ZMMReg, ZMMReg, ZMMReg)
DEF_HELPER_3(extrq_r, void, env, ZMMReg, ZMMReg)
DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int)
DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg)
DEF_HELPER_5(insertq_i, void, env, ZMMReg, ZMMReg, int, int)
-DEF_HELPER_3(glue(haddps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(haddpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(hsubps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(hsubpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(addsubps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(addsubpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-
-#define SSE_HELPER_CMP(name, F, C) \
- DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \
- DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
+#endif
+
+SSE_HELPER_P4(hadd)
+SSE_HELPER_P4(hsub)
+SSE_HELPER_P4(addsub)
+
+/*
+ * Declare the helpers for one comparison predicate.  Only "name" is used
+ * here; the F and C arguments are ignored by this definition.
+ */
+#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_S4(name)
SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
@@ -216,29 +197,58 @@ SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
+SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
+SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
+SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
+SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE)
+SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
+SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
+SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
+SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE)
+
+SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
+SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
+SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
+SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD)
+SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
+SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
+SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
+SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)
+
+SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
+SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
+SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
+SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE)
+SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
+SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
+SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
+SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE)
+
+#if SHIFT == 1
DEF_HELPER_3(ucomiss, void, env, Reg, Reg)
DEF_HELPER_3(comiss, void, env, Reg, Reg)
DEF_HELPER_3(ucomisd, void, env, Reg, Reg)
DEF_HELPER_3(comisd, void, env, Reg, Reg)
+#endif
+
DEF_HELPER_2(glue(movmskps, SUFFIX), i32, env, Reg)
DEF_HELPER_2(glue(movmskpd, SUFFIX), i32, env, Reg)
#endif
-DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg)
-DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg)
-#define UNPCK_OP(base_name, base) \
- DEF_HELPER_3(glue(punpck ## base_name ## bw, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(glue(punpck ## base_name ## wd, SUFFIX), void, env, Reg, Reg) \
- DEF_HELPER_3(glue(punpck ## base_name ## dq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(packsswb, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(packuswb, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(packssdw, SUFFIX), void, env, Reg, Reg, Reg)
+/*
+ * Declare the bw, wd and dq variants of punpck<name> (glued with SUFFIX),
+ * each taking env plus three Reg arguments.  The "base" argument is not
+ * used by this definition.
+ */
+#define UNPCK_OP(name, base) \
+ DEF_HELPER_4(glue(punpck ## name ## bw, SUFFIX), void, env, Reg, Reg, Reg) \
+ DEF_HELPER_4(glue(punpck ## name ## wd, SUFFIX), void, env, Reg, Reg, Reg) \
+ DEF_HELPER_4(glue(punpck ## name ## dq, SUFFIX), void, env, Reg, Reg, Reg)
UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
-#if SHIFT == 1
-DEF_HELPER_3(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg)
+#if SHIFT >= 1
+DEF_HELPER_4(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg, Reg)
#endif
/* 3DNow! float ops */
@@ -265,28 +275,25 @@ DEF_HELPER_3(pswapd, void, env, MMXReg, MMXReg)
#endif
/* SSSE3 op helpers */
-DEF_HELPER_3(glue(phaddw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(phaddd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(phaddsw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(phsubw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(phsubd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(phsubsw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pabsb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pabsw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pabsd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pshufb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psignb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psignw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(psignd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_4(glue(palignr, SUFFIX), void, env, Reg, Reg, s32)
+DEF_HELPER_4(glue(phaddw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(phaddd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(phaddsw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(phsubw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(phsubd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(phsubsw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(pshufb, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psignb, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psignw, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(psignd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_5(glue(palignr, SUFFIX), void, env, Reg, Reg, Reg, i32)
/* SSE4.1 op helpers */
-#if SHIFT == 1
-DEF_HELPER_3(glue(pblendvb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(blendvps, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(blendvpd, SUFFIX), void, env, Reg, Reg)
+#if SHIFT >= 1
+DEF_HELPER_5(glue(pblendvb, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(blendvps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(blendvpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
DEF_HELPER_3(glue(ptest, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovsxbw, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovsxbd, SUFFIX), void, env, Reg, Reg)
@@ -300,34 +307,32 @@ DEF_HELPER_3(glue(pmovzxbq, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxwd, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxwq, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxdq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmuldq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(packusdw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pminsb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pminsd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pminuw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pminud, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaxsb, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaxsd, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaxuw, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmaxud, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(pmulld, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsldup, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovshdup, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovdldup, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(pmuldq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(packusdw, SUFFIX), void, env, Reg, Reg, Reg)
+#if SHIFT == 1
DEF_HELPER_3(glue(phminposuw, SUFFIX), void, env, Reg, Reg)
+#endif
DEF_HELPER_4(glue(roundps, SUFFIX), void, env, Reg, Reg, i32)
DEF_HELPER_4(glue(roundpd, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(roundss, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(roundsd, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(blendps, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(blendpd, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(pblendw, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(dpps, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(dppd, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, i32)
+#if SHIFT == 1
+DEF_HELPER_5(roundss_xmm, void, env, Reg, Reg, Reg, i32)
+DEF_HELPER_5(roundsd_xmm, void, env, Reg, Reg, Reg, i32)
+#endif
+DEF_HELPER_5(glue(blendps, SUFFIX), void, env, Reg, Reg, Reg, i32)
+DEF_HELPER_5(glue(blendpd, SUFFIX), void, env, Reg, Reg, Reg, i32)
+DEF_HELPER_5(glue(pblendw, SUFFIX), void, env, Reg, Reg, Reg, i32)
+DEF_HELPER_5(glue(dpps, SUFFIX), void, env, Reg, Reg, Reg, i32)
+#if SHIFT == 1
+DEF_HELPER_5(glue(dppd, SUFFIX), void, env, Reg, Reg, Reg, i32)
+#endif
+DEF_HELPER_5(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, Reg, i32)
#endif
/* SSE4.2 op helpers */
#if SHIFT == 1
-DEF_HELPER_3(glue(pcmpgtq, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_4(glue(pcmpestri, SUFFIX), void, env, Reg, Reg, i32)
DEF_HELPER_4(glue(pcmpestrm, SUFFIX), void, env, Reg, Reg, i32)
DEF_HELPER_4(glue(pcmpistri, SUFFIX), void, env, Reg, Reg, i32)
@@ -336,14 +341,45 @@ DEF_HELPER_3(crc32, tl, i32, tl, i32)
#endif
/* AES-NI op helpers */
+#if SHIFT >= 1
+DEF_HELPER_4(glue(aesdec, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(aesdeclast, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(aesenc, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(aesenclast, SUFFIX), void, env, Reg, Reg, Reg)
#if SHIFT == 1
-DEF_HELPER_3(glue(aesdec, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(aesdeclast, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(aesenc, SUFFIX), void, env, Reg, Reg)
-DEF_HELPER_3(glue(aesenclast, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(aesimc, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32)
-DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32)
+#endif
+DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32)
+#endif
+
+/* AVX helpers */
+#if SHIFT >= 1
+DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_3(glue(vpermilpd_imm, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(vpermilps_imm, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_6(glue(vpgatherdd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherdq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherqd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherqq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+#if SHIFT == 2
+DEF_HELPER_3(vpermd_ymm, void, Reg, Reg, Reg)
+DEF_HELPER_4(vpermdq_ymm, void, Reg, Reg, Reg, i32)
+DEF_HELPER_3(vpermq_ymm, void, Reg, Reg, i32)
+#endif
#endif
#undef SHIFT
@@ -354,6 +390,9 @@ DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32)
#undef SSE_HELPER_W
#undef SSE_HELPER_L
#undef SSE_HELPER_Q
-#undef SSE_HELPER_S
+#undef SSE_HELPER_S3
+#undef SSE_HELPER_S4
+#undef SSE_HELPER_P3
+#undef SSE_HELPER_P4
#undef SSE_HELPER_CMP
#undef UNPCK_OP
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
new file mode 100644
index 0000000..8e1eb9d
--- /dev/null
+++ b/target/i386/tcg/decode-new.c.inc
@@ -0,0 +1,1795 @@
+/*
+ * New-style decoder for i386 instructions
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * The decoder is mostly based on tables copied from the Intel SDM. As
+ * a result, most operand load and writeback is done entirely in common
+ * table-driven code using the same operand type (X86_TYPE_*) and
+ * size (X86_SIZE_*) codes used in the manual.
+ *
+ * The main difference is that the V, U and W types are extended to
+ * cover MMX as well; if an instruction is like
+ *
+ * por Pq, Qq
+ * 66 por Vx, Hx, Wx
+ *
+ * only the second row is included and the instruction is marked as a
+ * valid MMX instruction. If there is no prefix, the MMX flag directs the
+ * decoder to rewrite the V/U/H/W types to P/N/P/Q and to change "x" to
+ * "q".
+ *
+ * In addition, the ss/ps/sd/pd types are sometimes mushed together as "x"
+ * if the difference is expressed via prefixes. Individual instructions
+ * are separated by prefix in the generator functions.
+ *
+ * There are a couple of cases in which instructions (e.g. MOVD) write the
+ * whole XMM or MM register but are listed incorrectly in the manual
+ * as "d" or "q". These have to be fixed for the decoder to work correctly.
+ */
+
+/* Zero entry; note that the expansion already ends with a comma. */
+#define X86_OP_NONE { 0 },
+
+/*
+ * Table entry that is resolved further by a decode_<op> callback:
+ * .decode is set to decode_<op> and .is_decode to true.  The three
+ * operand slots take a type (pasted onto X86_TYPE_) and a size (pasted
+ * onto X86_SIZE_).  Any remaining arguments are appended verbatim as
+ * additional designated initializers.
+ */
+#define X86_OP_GROUP3(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) { \
+ .decode = glue(decode_, op), \
+ .op0 = glue(X86_TYPE_, op0_), \
+ .s0 = glue(X86_SIZE_, s0_), \
+ .op1 = glue(X86_TYPE_, op1_), \
+ .s1 = glue(X86_SIZE_, s1_), \
+ .op2 = glue(X86_TYPE_, op2_), \
+ .s2 = glue(X86_SIZE_, s2_), \
+ .is_decode = true, \
+ ## __VA_ARGS__ \
+}
+
+/*
+ * Two-operand form: op1 gets type "2op" with the size of op0, and the
+ * second operand given by the caller is stored in the op2 slot.
+ */
+#define X86_OP_GROUP2(op, op0, s0, op1, s1, ...) \
+ X86_OP_GROUP3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
+/* Form without operands: every type and size is None. */
+#define X86_OP_GROUP0(op, ...) \
+ X86_OP_GROUP3(op, None, None, None, None, None, None, ## __VA_ARGS__)
+
+/*
+ * Table entry for an instruction that is emitted directly: .gen is set
+ * to gen_<op>.  Operand types and sizes are pasted onto X86_TYPE_ and
+ * X86_SIZE_; any remaining arguments are appended verbatim as additional
+ * designated initializers.
+ */
+#define X86_OP_ENTRY3(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) { \
+ .gen = glue(gen_, op), \
+ .op0 = glue(X86_TYPE_, op0_), \
+ .s0 = glue(X86_SIZE_, s0_), \
+ .op1 = glue(X86_TYPE_, op1_), \
+ .s1 = glue(X86_SIZE_, s1_), \
+ .op2 = glue(X86_TYPE_, op2_), \
+ .s2 = glue(X86_SIZE_, s2_), \
+ ## __VA_ARGS__ \
+}
+
+/* Like X86_OP_ENTRY3, plus a fourth operand of type I and size b. */
+#define X86_OP_ENTRY4(op, op0_, s0_, op1_, s1_, op2_, s2_, ...) \
+ X86_OP_ENTRY3(op, op0_, s0_, op1_, s1_, op2_, s2_, \
+ .op3 = X86_TYPE_I, .s3 = X86_SIZE_b, \
+ ## __VA_ARGS__)
+
+/*
+ * Shorthands:
+ *   ENTRY2 - op1 gets type "2op" with the size of op0; the second operand
+ *            given by the caller is stored in the op2 slot
+ *   ENTRYw - only the op0 slot is filled
+ *   ENTRYr - only the op2 slot is filled
+ *   ENTRY0 - no operands
+ */
+#define X86_OP_ENTRY2(op, op0, s0, op1, s1, ...) \
+ X86_OP_ENTRY3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
+#define X86_OP_ENTRYw(op, op0, s0, ...) \
+ X86_OP_ENTRY3(op, op0, s0, None, None, None, None, ## __VA_ARGS__)
+#define X86_OP_ENTRYr(op, op0, s0, ...) \
+ X86_OP_ENTRY3(op, None, None, None, None, op0, s0, ## __VA_ARGS__)
+#define X86_OP_ENTRY0(op, ...) \
+ X86_OP_ENTRY3(op, None, None, None, None, None, None, ## __VA_ARGS__)
+
+/*
+ * Attribute shorthands for the trailing arguments of the X86_OP_* macros.
+ * Each one expands to a designated initializer that already ends with a
+ * comma, so several can be written next to each other without separators.
+ *
+ * i64, o64, xchg, mmx, zext0, zext2 and avx_movx all assign the same
+ * .special field.
+ */
+#define cpuid(feat) .cpuid = X86_FEAT_##feat,
+#define i64 .special = X86_SPECIAL_i64,
+#define o64 .special = X86_SPECIAL_o64,
+#define xchg .special = X86_SPECIAL_Locked,
+#define mmx .special = X86_SPECIAL_MMX,
+#define zext0 .special = X86_SPECIAL_ZExtOp0,
+#define zext2 .special = X86_SPECIAL_ZExtOp2,
+#define avx_movx .special = X86_SPECIAL_AVXExtMov,
+
+/*
+ * vexN stores N in .vex_class; the _rep3 and _unal variants additionally
+ * set .vex_special.
+ */
+#define vex1 .vex_class = 1,
+#define vex1_rep3 .vex_class = 1, .vex_special = X86_VEX_REPScalar,
+#define vex2 .vex_class = 2,
+#define vex2_rep3 .vex_class = 2, .vex_special = X86_VEX_REPScalar,
+#define vex3 .vex_class = 3,
+#define vex4 .vex_class = 4,
+#define vex4_unal .vex_class = 4, .vex_special = X86_VEX_SSEUnaligned,
+#define vex5 .vex_class = 5,
+#define vex6 .vex_class = 6,
+#define vex7 .vex_class = 7,
+#define vex8 .vex_class = 8,
+#define vex11 .vex_class = 11,
+#define vex12 .vex_class = 12,
+#define vex13 .vex_class = 13,
+
+/* Also assigns .vex_special, like the _rep3 and _unal variants above. */
+#define avx2_256 .vex_special = X86_VEX_AVX2_256,
+
+/*
+ * Bits used in .valid_prefix.  P_00 is bit 0; the others are 1 shifted
+ * left by the corresponding PREFIX_* constant.
+ * NOTE(review): this assumes the PREFIX_* values are non-zero and small
+ * enough for the shifted bits to fit in .valid_prefix - confirm against
+ * their definitions.
+ */
+#define P_00 1
+#define P_66 (1 << PREFIX_DATA)
+#define P_F3 (1 << PREFIX_REPZ)
+#define P_F2 (1 << PREFIX_REPNZ)
+
+/* Shorthands that store a combination of the masks above in .valid_prefix. */
+#define p_00 .valid_prefix = P_00,
+#define p_66 .valid_prefix = P_66,
+#define p_f3 .valid_prefix = P_F3,
+#define p_f2 .valid_prefix = P_F2,
+#define p_00_66 .valid_prefix = P_00 | P_66,
+#define p_00_f3 .valid_prefix = P_00 | P_F3,
+#define p_66_f2 .valid_prefix = P_66 | P_F2,
+#define p_00_66_f3 .valid_prefix = P_00 | P_66 | P_F3,
+#define p_66_f3_f2 .valid_prefix = P_66 | P_F3 | P_F2,
+#define p_00_66_f3_f2 .valid_prefix = P_00 | P_66 | P_F3 | P_F2,
+
+/*
+ * Return the ModRM byte of the instruction being decoded.  The byte is
+ * read from the code stream the first time it is requested and cached in
+ * the DisasContext for later calls.
+ */
+static uint8_t get_modrm(DisasContext *s, CPUX86State *env)
+{
+    if (s->has_modrm) {
+        return s->modrm;
+    }
+
+    s->modrm = x86_ldub_code(env, s);
+    s->has_modrm = true;
+    return s->modrm;
+}
+
+/*
+ * Pick one of four entries according to the prefixes recorded in
+ * s->prefix.  PREFIX_REPNZ is checked first, then PREFIX_REPZ, then
+ * PREFIX_DATA; entry 0 is used when none of them is set.
+ */
+static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEntry entries[4])
+{
+    int idx = 0;
+
+    if (s->prefix & PREFIX_REPNZ) {
+        idx = 3;
+    } else if (s->prefix & PREFIX_REPZ) {
+        idx = 2;
+    } else if (s->prefix & PREFIX_DATA) {
+        idx = 1;
+    }
+
+    return &entries[idx];
+}
+
+/*
+ * Resolve a group 15 opcode.  The table is chosen by the ModRM mod field
+ * (3 selects the register table, anything else the memory table) and the
+ * entry within it by the ModRM reg field.
+ */
+static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* only includes ldmxcsr and stmxcsr, because they have AVX variants. */
+    static const X86OpEntry group15_reg[8] = {
+    };
+
+    static const X86OpEntry group15_mem[8] = {
+        [2] = X86_OP_ENTRYr(LDMXCSR, E,d, vex5),
+        [3] = X86_OP_ENTRYw(STMXCSR, E,d, vex5),
+    };
+
+    uint8_t modrm = get_modrm(s, env);
+    const X86OpEntry *table = (modrm >> 6) == 3 ? group15_reg : group15_mem;
+
+    *entry = table[(modrm >> 3) & 7];
+}
+
+/*
+ * Resolve a group 17 opcode: only the generator function depends on the
+ * ModRM reg field.  Values without a generator leave entry->gen NULL.
+ */
+static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86GenFunc group17_gen[8] = {
+        [1] = gen_BLSR,
+        [2] = gen_BLSMSK,
+        [3] = gen_BLSI,
+    };
+    uint8_t modrm = get_modrm(s, env);
+
+    entry->gen = group17_gen[(modrm >> 3) & 7];
+}
+
+/*
+ * Resolve a group 12 opcode (word shifts by immediate) from the ModRM reg
+ * field.  Slots without an initializer stay all-zero.
+ */
+static void decode_group12(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_group12[8] = {
+        [2] = X86_OP_ENTRY3(PSRLW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [4] = X86_OP_ENTRY3(PSRAW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [6] = X86_OP_ENTRY3(PSLLW_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+    };
+    uint8_t modrm = get_modrm(s, env);
+
+    *entry = opcodes_group12[(modrm >> 3) & 7];
+}
+
+/*
+ * Resolve a group 13 opcode (doubleword shifts by immediate) from the
+ * ModRM reg field.  Slots without an initializer stay all-zero.
+ */
+static void decode_group13(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_group13[8] = {
+        [2] = X86_OP_ENTRY3(PSRLD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [4] = X86_OP_ENTRY3(PSRAD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [6] = X86_OP_ENTRY3(PSLLD_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+    };
+    uint8_t modrm = get_modrm(s, env);
+
+    *entry = opcodes_group13[(modrm >> 3) & 7];
+}
+
+/*
+ * Resolve a group 14 opcode (quadword and double-quadword shifts by
+ * immediate) from the ModRM reg field.  Slots without an initializer stay
+ * all-zero.
+ */
+static void decode_group14(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* grp14 */
+    static const X86OpEntry opcodes_group14[8] = {
+        [2] = X86_OP_ENTRY3(PSRLQ_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [3] = X86_OP_ENTRY3(PSRLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
+        [6] = X86_OP_ENTRY3(PSLLQ_i, H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+        [7] = X86_OP_ENTRY3(PSLLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
+    };
+    uint8_t modrm = get_modrm(s, env);
+
+    *entry = opcodes_group14[(modrm >> 3) & 7];
+}
+
+/*
+ * 0F 6F: the entry is selected by prefix through decode_by_prefix();
+ * the slot for PREFIX_REPNZ is left all-zero.
+ */
+static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F6F[4] = {
+        [0] = X86_OP_ENTRY3(MOVDQ, P,q, None,None, Q,q, vex1 mmx), /* movq */
+        [1] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex1), /* movdqa */
+        [2] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* movdqu */
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, opcodes_0F6F);
+
+    *entry = *chosen;
+}
+
+/*
+ * 0F 70: the shuffle variant is selected by prefix through
+ * decode_by_prefix().
+ */
+static void decode_0F70(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry pshufw[4] = {
+        [0] = X86_OP_ENTRY3(PSHUFW, P,q, Q,q, I,b, vex4 mmx),
+        [1] = X86_OP_ENTRY3(PSHUFD, V,x, W,x, I,b, vex4 avx2_256),
+        [2] = X86_OP_ENTRY3(PSHUFHW, V,x, W,x, I,b, vex4 avx2_256),
+        [3] = X86_OP_ENTRY3(PSHUFLW, V,x, W,x, I,b, vex4 avx2_256),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, pshufw);
+
+    *entry = *chosen;
+}
+
+/*
+ * 0F 77: EMMS without a VEX prefix; with VEX, the generator is
+ * VZEROUPPER when s->vex_l is zero and VZEROALL otherwise, and the entry
+ * gets VEX class 8.
+ */
+static void decode_0F77(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    if (!(s->prefix & PREFIX_VEX)) {
+        entry->gen = gen_EMMS;
+        return;
+    }
+
+    entry->gen = s->vex_l ? gen_VZEROALL : gen_VZEROUPPER;
+    entry->vex_class = 8;
+}
+
+/*
+ * 0F 78: the entry is selected by prefix through decode_by_prefix().
+ * Only the PREFIX_DATA and PREFIX_REPNZ slots are populated; the other
+ * two stay all-zero.
+ */
+static void decode_0F78(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F78[4] = {
+        [1] = X86_OP_ENTRY3(EXTRQ_i, V,x, None,None, I,w, cpuid(SSE4A)),
+        [3] = X86_OP_ENTRY3(INSERTQ_i, V,x, U,x, I,w, cpuid(SSE4A)),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, opcodes_0F78);
+
+    *entry = *chosen;
+}
+
+/*
+ * 0F 79: only the generator depends on the prefix.  PREFIX_REPNZ takes
+ * precedence over PREFIX_DATA; with neither, entry->gen is set to NULL.
+ */
+static void decode_0F79(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        entry->gen = gen_INSERTQ_r;
+        return;
+    }
+    if (s->prefix & PREFIX_DATA) {
+        entry->gen = gen_EXTRQ_r;
+        return;
+    }
+    entry->gen = NULL;
+}
+
+/*
+ * 0F 7E: the entry is selected by prefix through decode_by_prefix();
+ * the slot for PREFIX_REPNZ is left all-zero.
+ */
+static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7E[4] = {
+        [0] = X86_OP_ENTRY3(MOVD_from, E,y, None,None, P,y, vex5 mmx),
+        [1] = X86_OP_ENTRY3(MOVD_from, E,y, None,None, V,y, vex5),
+        [2] = X86_OP_ENTRY3(MOVQ, V,x, None,None, W,q, vex5), /* wrong dest Vy on SDM! */
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, opcodes_0F7E);
+
+    *entry = *chosen;
+}
+
+/*
+ * 0F 7F: store forms of MOVDQ.  The entry is selected by prefix through
+ * decode_by_prefix(); the slot for PREFIX_REPNZ is left all-zero.
+ */
+static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7F[4] = {
+        [0] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 mmx), /* movq */
+        [1] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1), /* movdqa */
+        [2] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* movdqu */
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, opcodes_0F7F);
+
+    *entry = *chosen;
+}
+
+/*
+ * 0F D6: the entry is selected by prefix through decode_by_prefix();
+ * the slot used when no prefix is present is left all-zero.
+ */
+static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry movq[4] = {
+        [1] = X86_OP_ENTRY3(MOVQ, W,x, None, None, V,q, vex5),
+        [2] = X86_OP_ENTRY3(MOVq_dq, V,dq, None, None, N,q),
+        [3] = X86_OP_ENTRY3(MOVq_dq, P,q, None, None, U,q),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, movq);
+
+    *entry = *chosen;
+}
+
+/*
+ * Entries for opcode bytes 0x00..0xEF (240 slots), presumably of the
+ * three-byte 0F 38 map as the name suggests.  The designated initializers
+ * are not in index order: for each row of sixteen the 0x?0-0x?7 half is
+ * listed in the first part of the table and the 0x?8-0x?f half in the
+ * second.  Slots without an initializer stay all-zero.
+ */
+static const X86OpEntry opcodes_0F38_00toEF[240] = {
+ [0x00] = X86_OP_ENTRY3(PSHUFB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x01] = X86_OP_ENTRY3(PHADDW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x02] = X86_OP_ENTRY3(PHADDD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x03] = X86_OP_ENTRY3(PHADDSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x04] = X86_OP_ENTRY3(PMADDUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x05] = X86_OP_ENTRY3(PHSUBW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x06] = X86_OP_ENTRY3(PHSUBD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x07] = X86_OP_ENTRY3(PHSUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+
+ [0x10] = X86_OP_ENTRY2(PBLENDVB, V,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x14] = X86_OP_ENTRY2(BLENDVPS, V,x, W,x, vex4 cpuid(SSE41) p_66),
+ [0x15] = X86_OP_ENTRY2(BLENDVPD, V,x, W,x, vex4 cpuid(SSE41) p_66),
+ /* Listed incorrectly as type 4 */
+ [0x16] = X86_OP_ENTRY3(VPERMD, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66),
+ [0x17] = X86_OP_ENTRY3(VPTEST, None,None, V,x, W,x, vex4 cpuid(SSE41) p_66),
+
+ /*
+ * Source operand listed as Mq/Ux and similar in the manual; incorrectly listed
+ * as 128-bit only in 2-17.
+ */
+ [0x20] = X86_OP_ENTRY3(VPMOVSXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x21] = X86_OP_ENTRY3(VPMOVSXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x22] = X86_OP_ENTRY3(VPMOVSXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x23] = X86_OP_ENTRY3(VPMOVSXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x24] = X86_OP_ENTRY3(VPMOVSXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x25] = X86_OP_ENTRY3(VPMOVSXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+
+ /* Same as PMOVSX. */
+ [0x30] = X86_OP_ENTRY3(VPMOVZXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x31] = X86_OP_ENTRY3(VPMOVZXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x32] = X86_OP_ENTRY3(VPMOVZXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x33] = X86_OP_ENTRY3(VPMOVZXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x34] = X86_OP_ENTRY3(VPMOVZXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x35] = X86_OP_ENTRY3(VPMOVZXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x36] = X86_OP_ENTRY3(VPERMD, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66),
+ [0x37] = X86_OP_ENTRY3(PCMPGTQ, V,x, H,x, W,x, vex4 cpuid(SSE42) avx2_256 p_66),
+
+ [0x40] = X86_OP_ENTRY3(PMULLD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x41] = X86_OP_ENTRY3(VPHMINPOSUW, V,dq, None,None, W,dq, vex4 cpuid(SSE41) p_66),
+ /* Listed incorrectly as type 4 */
+ [0x45] = X86_OP_ENTRY3(VPSRLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+ [0x46] = X86_OP_ENTRY3(VPSRAV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+ [0x47] = X86_OP_ENTRY3(VPSLLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+
+ [0x90] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vpgatherdd/q */
+ [0x91] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vpgatherqd/q */
+ [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */
+ [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */
+
+ [0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0b] = X86_OP_ENTRY3(PMULHRSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0c] = X86_OP_ENTRY3(VPERMILPS, V,x, H,x, W,x, vex4 cpuid(AVX) p_00_66),
+ [0x0d] = X86_OP_ENTRY3(VPERMILPD, V,x, H,x, W,x, vex4 cpuid(AVX) p_66),
+ [0x0e] = X86_OP_ENTRY3(VTESTPS, None,None, V,x, W,x, vex4 cpuid(AVX) p_66),
+ [0x0f] = X86_OP_ENTRY3(VTESTPD, None,None, V,x, W,x, vex4 cpuid(AVX) p_66),
+
+ [0x18] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX) p_66), /* vbroadcastss */
+ [0x19] = X86_OP_ENTRY3(VPBROADCASTQ, V,qq, None,None, W,q, vex6 cpuid(AVX) p_66), /* vbroadcastsd */
+ [0x1a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX) p_66),
+ [0x1c] = X86_OP_ENTRY3(PABSB, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x1d] = X86_OP_ENTRY3(PABSW, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x1e] = X86_OP_ENTRY3(PABSD, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+
+ [0x28] = X86_OP_ENTRY3(PMULDQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x29] = X86_OP_ENTRY3(PCMPEQQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x2a] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, WM,x, vex1 cpuid(SSE41) avx2_256 p_66), /* movntdqa */
+ [0x2b] = X86_OP_ENTRY3(VPACKUSDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x2c] = X86_OP_ENTRY3(VMASKMOVPS, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66),
+ [0x2d] = X86_OP_ENTRY3(VMASKMOVPD, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66),
+ /* Incorrectly listed as Mx,Hx,Vx in the manual */
+ [0x2e] = X86_OP_ENTRY3(VMASKMOVPS_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66),
+ [0x2f] = X86_OP_ENTRY3(VMASKMOVPD_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66),
+
+ [0x38] = X86_OP_ENTRY3(PMINSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x39] = X86_OP_ENTRY3(PMINSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3a] = X86_OP_ENTRY3(PMINUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3b] = X86_OP_ENTRY3(PMINUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3c] = X86_OP_ENTRY3(PMAXSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3d] = X86_OP_ENTRY3(PMAXSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3e] = X86_OP_ENTRY3(PMAXUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3f] = X86_OP_ENTRY3(PMAXUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+
+ [0x58] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX2) p_66),
+ [0x59] = X86_OP_ENTRY3(VPBROADCASTQ, V,x, None,None, W,q, vex6 cpuid(AVX2) p_66),
+ [0x5a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX2) p_66),
+
+ [0x78] = X86_OP_ENTRY3(VPBROADCASTB, V,x, None,None, W,b, vex6 cpuid(AVX2) p_66),
+ [0x79] = X86_OP_ENTRY3(VPBROADCASTW, V,x, None,None, W,w, vex6 cpuid(AVX2) p_66),
+
+ [0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66),
+ [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66),
+
+ [0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66),
+ [0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xde] = X86_OP_ENTRY3(VAESDEC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+};
+
+/*
+ * Opcodes 0F 38 F0-FF.  The first index is the low nibble of the opcode
+ * byte.  The second index is the prefix column computed by decode_0F38():
+ * 0 = no prefix, 1 = 66, 2 = F3, 3 = F2, 4 = 66+F2.  Rows that are not
+ * listed here are zero-initialized.
+ *
+ * BZHI, SHLX, SARX and SHRX are BMI2 instructions, just like PEXT, PDEP
+ * and MULX; only ANDN, BEXTR and group 17 are gated on BMI1.
+ */
+static const X86OpEntry opcodes_0F38_F0toFF[16][5] = {
+    [0] = {
+        X86_OP_ENTRY3(MOVBE, G,y, M,y, None,None, cpuid(MOVBE)),
+        X86_OP_ENTRY3(MOVBE, G,w, M,w, None,None, cpuid(MOVBE)),
+        {},
+        X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)),
+        X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)),
+    },
+    [1] = {
+        X86_OP_ENTRY3(MOVBE, M,y, G,y, None,None, cpuid(MOVBE)),
+        X86_OP_ENTRY3(MOVBE, M,w, G,w, None,None, cpuid(MOVBE)),
+        {},
+        X86_OP_ENTRY2(CRC32, G,d, E,y, cpuid(SSE42)),
+        X86_OP_ENTRY2(CRC32, G,d, E,w, cpuid(SSE42)),
+    },
+    [2] = {
+        X86_OP_ENTRY3(ANDN, G,y, B,y, E,y, vex13 cpuid(BMI1)),
+        {},
+        {},
+        {},
+        {},
+    },
+    [3] = {
+        X86_OP_GROUP3(group17, B,y, E,y, None,None, vex13 cpuid(BMI1)),
+        {},
+        {},
+        {},
+        {},
+    },
+    [5] = {
+        X86_OP_ENTRY3(BZHI, G,y, E,y, B,y, vex13 cpuid(BMI2)),
+        {},
+        X86_OP_ENTRY3(PEXT, G,y, B,y, E,y, vex13 cpuid(BMI2)),
+        X86_OP_ENTRY3(PDEP, G,y, B,y, E,y, vex13 cpuid(BMI2)),
+        {},
+    },
+    [6] = {
+        {},
+        X86_OP_ENTRY2(ADCX, G,y, E,y, cpuid(ADX)),
+        X86_OP_ENTRY2(ADOX, G,y, E,y, cpuid(ADX)),
+        X86_OP_ENTRY3(MULX, /* B,y, */ G,y, E,y, 2,y, vex13 cpuid(BMI2)),
+        {},
+    },
+    [7] = {
+        X86_OP_ENTRY3(BEXTR, G,y, E,y, B,y, vex13 cpuid(BMI1)),
+        X86_OP_ENTRY3(SHLX, G,y, E,y, B,y, vex13 cpuid(BMI2)),
+        X86_OP_ENTRY3(SARX, G,y, E,y, B,y, vex13 cpuid(BMI2)),
+        X86_OP_ENTRY3(SHRX, G,y, E,y, B,y, vex13 cpuid(BMI2)),
+        {},
+    },
+};
+
+/*
+ * Decoder for the three-byte opcode map 0F 38.  Fetches the next opcode
+ * byte into *b and copies the matching table entry into *entry.
+ *
+ * Opcodes below F0 index opcodes_0F38_00toEF directly.  For F0-FF the entry
+ * also depends on the prefixes, which select one of the five columns of
+ * opcodes_0F38_F0toFF: 0 = none, 1 = 66, 2 = F3, 3 = F2, 4 = 66+F2.
+ */
+static void decode_0F38(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t opc = x86_ldub_code(env, s);
+    int column;
+
+    *b = opc;
+    if (opc < 0xf0) {
+        *entry = opcodes_0F38_00toEF[opc];
+        return;
+    }
+
+    if (s->prefix & PREFIX_REPZ) {
+        /* F3 takes priority; 66 and F2 are not looked at in this case. */
+        column = 2;
+    } else {
+        column = (s->prefix & PREFIX_REPNZ) ? 3 : 0;
+        if (s->prefix & PREFIX_DATA) {
+            column++;
+        }
+    }
+    *entry = opcodes_0F38_F0toFF[opc & 0x0f][column];
+}
+
+/*
+ * (V)INSERTPS has different source operands for its register form
+ * (modrm mod == 3, an XMM register) and its memory form (a 32-bit memory
+ * operand), so the entry is selected from the modrm byte.
+ */
+static void decode_VINSERTPS(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry insertps_from_reg =
+        X86_OP_ENTRY4(VINSERTPS_r, V,dq, H,dq, U,dq, vex5 cpuid(SSE41) p_66);
+    static const X86OpEntry insertps_from_mem =
+        X86_OP_ENTRY4(VINSERTPS_m, V,dq, H,dq, M,d, vex5 cpuid(SSE41) p_66);
+
+    int mod = get_modrm(s, env) >> 6;
+
+    if (mod == 3) {
+        *entry = insertps_from_reg;
+    } else {
+        *entry = insertps_from_mem;
+    }
+}
+
+static const X86OpEntry opcodes_0F3A[256] = { /* 0F 3A map; decode_0F3A() indexes it with the opcode byte it fetches */
+ /*
+ * These are VEX-only, but incorrectly listed in the manual as exception type 4.
+ * Also the "qq" instructions are sometimes omitted by Table 2-17, but are VEX256
+ * only.
+ */
+ [0x00] = X86_OP_ENTRY3(VPERMQ, V,qq, W,qq, I,b, vex6 cpuid(AVX2) p_66),
+ [0x01] = X86_OP_ENTRY3(VPERMQ, V,qq, W,qq, I,b, vex6 cpuid(AVX2) p_66), /* VPERMPD */
+ [0x02] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66), /* VPBLENDD */
+ [0x04] = X86_OP_ENTRY3(VPERMILPS_i, V,x, W,x, I,b, vex6 cpuid(AVX) p_66),
+ [0x05] = X86_OP_ENTRY3(VPERMILPD_i, V,x, W,x, I,b, vex6 cpuid(AVX) p_66),
+ [0x06] = X86_OP_ENTRY4(VPERM2x128, V,qq, H,qq, W,qq, vex6 cpuid(AVX) p_66),
+
+ [0x14] = X86_OP_ENTRY3(PEXTRB, E,b, V,dq, I,b, vex5 cpuid(SSE41) zext0 p_66),
+ [0x15] = X86_OP_ENTRY3(PEXTRW, E,w, V,dq, I,b, vex5 cpuid(SSE41) zext0 p_66),
+ [0x16] = X86_OP_ENTRY3(PEXTR, E,y, V,dq, I,b, vex5 cpuid(SSE41) p_66),
+ [0x17] = X86_OP_ENTRY3(VEXTRACTPS, E,d, V,dq, I,b, vex5 cpuid(SSE41) p_66),
+
+ [0x20] = X86_OP_ENTRY4(PINSRB, V,dq, H,dq, E,b, vex5 cpuid(SSE41) zext2 p_66),
+ [0x21] = X86_OP_GROUP0(VINSERTPS), /* register vs. memory form: see decode_VINSERTPS() */
+ [0x22] = X86_OP_ENTRY4(PINSR, V,dq, H,dq, E,y, vex5 cpuid(SSE41) p_66),
+
+ [0x40] = X86_OP_ENTRY4(VDDPS, V,x, H,x, W,x, vex2 cpuid(SSE41) p_66),
+ [0x41] = X86_OP_ENTRY4(VDDPD, V,dq, H,dq, W,dq, vex2 cpuid(SSE41) p_66),
+ [0x42] = X86_OP_ENTRY4(VMPSADBW, V,x, H,x, W,x, vex2 cpuid(SSE41) avx2_256 p_66),
+ [0x44] = X86_OP_ENTRY4(PCLMULQDQ, V,dq, H,dq, W,dq, vex4 cpuid(PCLMULQDQ) p_66),
+ [0x46] = X86_OP_ENTRY4(VPERM2x128, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66),
+
+ [0x60] = X86_OP_ENTRY4(PCMPESTRM, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66),
+ [0x61] = X86_OP_ENTRY4(PCMPESTRI, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66),
+ [0x62] = X86_OP_ENTRY4(PCMPISTRM, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66),
+ [0x63] = X86_OP_ENTRY4(PCMPISTRI, None,None, V,dq, W,dq, vex4_unal cpuid(SSE42) p_66),
+
+ [0x08] = X86_OP_ENTRY3(VROUNDPS, V,x, W,x, I,b, vex2 cpuid(SSE41) p_66), /* from here on the entries restart at the low opcodes; designated initializers make the order irrelevant */
+ [0x09] = X86_OP_ENTRY3(VROUNDPD, V,x, W,x, I,b, vex2 cpuid(SSE41) p_66),
+ /*
+ * Not listed as four operand in the manual. Also writes and reads 128-bits
+ * from the first two operands due to the V operand picking higher entries of
+ * the H operand; the "Vss,Hss,Wss" description from the manual is incorrect.
+ * For other unary operations such as VSQRTSx this is hidden by the "REPScalar"
+ * value of vex_special, because the table lists the operand types of VSQRTPx.
+ */
+ [0x0a] = X86_OP_ENTRY4(VROUNDSS, V,x, H,x, W,ss, vex3 cpuid(SSE41) p_66),
+ [0x0b] = X86_OP_ENTRY4(VROUNDSD, V,x, H,x, W,sd, vex3 cpuid(SSE41) p_66),
+ [0x0c] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
+ [0x0d] = X86_OP_ENTRY4(VBLENDPD, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
+ [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+
+ [0x18] = X86_OP_ENTRY4(VINSERTx128, V,qq, H,qq, W,qq, vex6 cpuid(AVX) p_66),
+ [0x19] = X86_OP_ENTRY3(VEXTRACTx128, W,dq, V,qq, I,b, vex6 cpuid(AVX) p_66),
+
+ [0x38] = X86_OP_ENTRY4(VINSERTx128, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66),
+ [0x39] = X86_OP_ENTRY3(VEXTRACTx128, W,dq, V,qq, I,b, vex6 cpuid(AVX2) p_66),
+
+ /* Listed incorrectly as type 4 */
+ [0x4a] = X86_OP_ENTRY4(VBLENDVPS, V,x, H,x, W,x, vex6 cpuid(AVX) p_66),
+ [0x4b] = X86_OP_ENTRY4(VBLENDVPD, V,x, H,x, W,x, vex6 cpuid(AVX) p_66),
+ [0x4c] = X86_OP_ENTRY4(VPBLENDVB, V,x, H,x, W,x, vex6 cpuid(AVX) p_66 avx2_256),
+
+ [0xdf] = X86_OP_ENTRY3(VAESKEYGEN, V,dq, W,dq, I,b, vex4 cpuid(AES) p_66),
+
+ [0xF0] = X86_OP_ENTRY3(RORX, G,y, E,y, I,b, vex13 cpuid(BMI2) p_f2),
+};
+
+/*
+ * Decoder for the three-byte opcode map 0F 3A: fetch the next opcode byte
+ * into *b and copy the corresponding opcodes_0F3A entry into *entry.
+ */
+static void decode_0F3A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t opc = x86_ldub_code(env, s);
+
+    *b = opc;
+    *entry = opcodes_0F3A[opc];
+}
+
+/*
+ * There are some mistakes in the operands in the manual, and the load/store/register
+ * cases are easiest to keep separate, so the entries for 10-17 follow simplicity and
+ * efficiency of implementation rather than copying what the manual says.
+ *
+ * In particular:
+ *
+ * 1) "VMOVSS m32, xmm1" and "VMOVSD m64, xmm1" do not support VEX.vvvv != 1111b,
+ * but this is not mentioned in the tables.
+ *
+ * 2) MOVHLPS, MOVHPS, MOVHPD, MOVLPD, MOVLPS read the high quadword of one of their
+ * operands, which must therefore be dq; MOVLPD and MOVLPS also write the high
+ * quadword of the V operand.
+ */
+/*
+ * Opcode 0F 10.  Separate tables are kept for the register form
+ * (modrm mod == 3) and the memory form; decode_by_prefix() then picks the
+ * entry for the active prefix.  Entries are listed in the order
+ * MOVUPS, MOVUPD, MOVSS, MOVSD.
+ */
+static void decode_0F10(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry load_from_reg[4] = {
+        X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */
+        X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */
+        X86_OP_ENTRY3(VMOVSS, V,x, H,x, W,x, vex4),
+        X86_OP_ENTRY3(VMOVLPx, V,x, H,x, W,x, vex4), /* MOVSD */
+    };
+
+    static const X86OpEntry load_from_mem[4] = {
+        X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */
+        X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */
+        X86_OP_ENTRY3(VMOVSS_ld, V,x, H,x, M,ss, vex4),
+        X86_OP_ENTRY3(VMOVSD_ld, V,x, H,x, M,sd, vex4),
+    };
+
+    int mod = get_modrm(s, env) >> 6;
+
+    if (mod != 3) {
+        *entry = *decode_by_prefix(s, load_from_mem);
+        return;
+    }
+    *entry = *decode_by_prefix(s, load_from_reg);
+}
+
+/*
+ * Opcode 0F 11: the store direction of opcode 0F 10.  Separate tables are
+ * kept for the register form (modrm mod == 3) and the memory form;
+ * decode_by_prefix() then picks the entry for the active prefix.  Entries
+ * are listed in the order MOVUPS, MOVUPD, MOVSS, MOVSD.
+ *
+ * MOVUPS and MOVUPD are unaligned moves in both directions, so they are
+ * marked vex4_unal here exactly as the load forms are in decode_0F10();
+ * plain vex4 would not carry the "unaligned" marker for the legacy SSE
+ * encoding of the store.
+ */
+static void decode_0F11(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F11_reg[4] = {
+        X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* MOVUPS */
+        X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* MOVUPD */
+        X86_OP_ENTRY3(VMOVSS, W,x, H,x, V,x, vex4),
+        X86_OP_ENTRY3(VMOVLPx, W,x, H,x, V,q, vex4), /* MOVSD */
+    };
+
+    static const X86OpEntry opcodes_0F11_mem[4] = {
+        X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* MOVUPS */
+        X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4_unal), /* MOVUPD */
+        X86_OP_ENTRY3(VMOVSS_st, M,ss, None,None, V,x, vex4),
+        X86_OP_ENTRY3(VMOVLPx_st, M,sd, None,None, V,x, vex4), /* MOVSD */
+    };
+
+    if ((get_modrm(s, env) >> 6) == 3) {
+        *entry = *decode_by_prefix(s, opcodes_0F11_reg);
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0F11_mem);
+    }
+}
+
+/*
+ * Opcode 0F 12.  The memory forms are MOVLPS, MOVLPD, MOVSLDUP and MOVDDUP;
+ * the register forms are MOVHLPS, MOVLPD, MOVSLDUP and MOVDDUP.  The entry
+ * is picked by decode_by_prefix() from the table matching the modrm form.
+ */
+static void decode_0F12(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry src_is_mem[4] = {
+        /*
+         * The register operands are dq both to stay compatible with
+         * gen_MOVSD and to restrict these to VEX128.
+         */
+        X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPS */
+        X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPD */
+        X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)),
+        X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, WM,q, vex4 cpuid(SSE3)), /* qq if VEX.256 */
+    };
+    static const X86OpEntry src_is_reg[4] = {
+        X86_OP_ENTRY3(VMOVHLPS, V,dq, H,dq, U,dq, vex4),
+        X86_OP_ENTRY3(VMOVLPx, W,x, H,x, U,q, vex4), /* MOVLPD */
+        X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+        X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+    };
+
+    int mod = get_modrm(s, env) >> 6;
+
+    if (mod == 3) {
+        *entry = *decode_by_prefix(s, src_is_reg);
+        return;
+    }
+
+    *entry = *decode_by_prefix(s, src_is_mem);
+    /* With the F2 prefix and VEX.L set, widen the memory source to 256 bits. */
+    if (s->vex_l && (s->prefix & PREFIX_REPNZ)) {
+        entry->s2 = X86_SIZE_qq;
+    }
+}
+
+/*
+ * Opcode 0F 16.  The memory forms are MOVHPS, MOVHPD and MOVSHDUP; the
+ * register forms are MOVLHPS, MOVHPD and MOVSHDUP.  The last slot of each
+ * table is an empty entry.
+ */
+static void decode_0F16(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry src_is_mem[4] = {
+        /*
+         * Only the low 64 bits of operand 1 are actually read; it is still
+         * declared dq so that "op0 == op1" can be tested without caring
+         * about host endianness.
+         */
+        X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPS */
+        X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPD */
+        X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)),
+        {},
+    };
+    static const X86OpEntry src_is_reg[4] = {
+        /* As for the memory form: Hq would do for operand 1, if not for big-endian hosts. */
+        X86_OP_ENTRY3(VMOVLHPS, V,dq, H,dq, U,q, vex4),
+        X86_OP_ENTRY3(VMOVHPx, V,x, H,x, U,x, vex4), /* MOVHPD */
+        X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+        {},
+    };
+
+    int mod = get_modrm(s, env) >> 6;
+
+    if (mod != 3) {
+        *entry = *decode_by_prefix(s, src_is_mem);
+        return;
+    }
+    *entry = *decode_by_prefix(s, src_is_reg);
+}
+
+/*
+ * Opcode 0F 2A: the first two slots convert from an MMX operand
+ * (CVTPI2Px), the last two from a general-purpose operand (VCVTSI2Sx).
+ * decode_by_prefix() selects the slot.
+ */
+static void decode_0F2A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry int_to_fp[4] = {
+        X86_OP_ENTRY3(CVTPI2Px, V,x, None,None, Q,q),
+        X86_OP_ENTRY3(CVTPI2Px, V,x, None,None, Q,q),
+        X86_OP_ENTRY3(VCVTSI2Sx, V,x, H,x, E,y, vex3),
+        X86_OP_ENTRY3(VCVTSI2Sx, V,x, H,x, E,y, vex3),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, int_to_fp);
+
+    *entry = *chosen;
+}
+
+/*
+ * Opcode 0F 2B: non-temporal stores.  decode_by_prefix() selects among
+ * MOVNTPS, MOVNTPD and the SSE4A scalar forms MOVNTSS and MOVNTSD.
+ *
+ * MOVNTPS and MOVNTPD use exception class 1, the same class that the other
+ * aligned and non-temporal full-width moves in these tables use (MOVAPS at
+ * 0F 28/29, MOVNTDQ at 0F E7, MOVNTDQA at 0F 38 2A).  Class 4 is what the
+ * unaligned-capable instructions use.
+ */
+static void decode_0F2B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F2B[4] = {
+        X86_OP_ENTRY3(MOVDQ, M,x, None,None, V,x, vex1), /* MOVNTPS */
+        X86_OP_ENTRY3(MOVDQ, M,x, None,None, V,x, vex1), /* MOVNTPD */
+        X86_OP_ENTRY3(VMOVSS_st, M,ss, None,None, V,x, vex4 cpuid(SSE4A)), /* MOVNTSS */
+        X86_OP_ENTRY3(VMOVLPx_st, M,sd, None,None, V,x, vex4 cpuid(SSE4A)), /* MOVNTSD */
+    };
+
+    *entry = *decode_by_prefix(s, opcodes_0F2B);
+}
+
+/*
+ * Opcode 0F 2C: truncating float-to-integer conversions.  The first two
+ * slots produce an MMX result (CVTTPx2PI), the last two a general-purpose
+ * register result (VCVTTSx2SI).  decode_by_prefix() selects the slot.
+ */
+static void decode_0F2C(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry fp_to_int_trunc[4] = {
+        /* The manual says ps/pd, but CVTTPS2PI reads only 64 bits. */
+        X86_OP_ENTRY3(CVTTPx2PI, P,q, None,None, W,q),
+        X86_OP_ENTRY3(CVTTPx2PI, P,q, None,None, W,dq),
+        X86_OP_ENTRY3(VCVTTSx2SI, G,y, None,None, W,ss, vex3),
+        X86_OP_ENTRY3(VCVTTSx2SI, G,y, None,None, W,sd, vex3),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, fp_to_int_trunc);
+
+    *entry = *chosen;
+}
+
+/*
+ * Opcode 0F 2D: float-to-integer conversions.  The first two slots produce
+ * an MMX result (CVTPx2PI), the last two a general-purpose register result
+ * (VCVTSx2SI).  decode_by_prefix() selects the slot.
+ */
+static void decode_0F2D(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry fp_to_int[4] = {
+        /* The manual says ps/pd, but CVTPS2PI reads only 64 bits. */
+        X86_OP_ENTRY3(CVTPx2PI, P,q, None,None, W,q),
+        X86_OP_ENTRY3(CVTPx2PI, P,q, None,None, W,dq),
+        X86_OP_ENTRY3(VCVTSx2SI, G,y, None,None, W,ss, vex3),
+        X86_OP_ENTRY3(VCVTSx2SI, G,y, None,None, W,sd, vex3),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, fp_to_int);
+
+    *entry = *chosen;
+}
+
+/*
+ * Fix-up shared by the table entries that use the "sse_unary" group
+ * (opcodes 0F 51, 52, 53 and 5A).  Those entries are declared with three
+ * operands; when neither F3 nor F2 is present, operand 1 is removed.
+ * The generator function is then chosen from the opcode byte in *b; any
+ * other opcode value leaves entry->gen untouched.
+ */
+static void decode_sse_unary(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    bool has_rep_prefix = (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0;
+    uint8_t opc = *b;
+
+    if (!has_rep_prefix) {
+        entry->op1 = X86_TYPE_None;
+        entry->s1 = X86_SIZE_None;
+    }
+
+    if (opc == 0x51) {
+        entry->gen = gen_VSQRT;
+    } else if (opc == 0x52) {
+        entry->gen = gen_VRSQRT;
+    } else if (opc == 0x53) {
+        entry->gen = gen_VRCP;
+    } else if (opc == 0x5A) {
+        entry->gen = gen_VCVTfp2fp;
+    }
+}
+
+/*
+ * Opcode 0F 5B: conversions between packed doubleword integers and packed
+ * single precision.  decode_by_prefix() selects the slot; the last slot is
+ * an empty entry.
+ */
+static void decode_0F5B(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry dq_ps_conversions[4] = {
+        X86_OP_ENTRY2(VCVTDQ2PS, V,x, W,x, vex2),
+        X86_OP_ENTRY2(VCVTPS2DQ, V,x, W,x, vex2),
+        X86_OP_ENTRY2(VCVTTPS2DQ, V,x, W,x, vex2),
+        {},
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, dq_ps_conversions);
+
+    *entry = *chosen;
+}
+
+/*
+ * Opcode 0F E6: conversions between packed doubleword integers and packed
+ * double precision.  decode_by_prefix() selects the slot; the first slot is
+ * an empty entry.
+ */
+static void decode_0FE6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry dq_pd_conversions[4] = {
+        {},
+        X86_OP_ENTRY2(VCVTTPD2DQ, V,x, W,x, vex2),
+        X86_OP_ENTRY2(VCVTDQ2PD, V,x, W,x, vex2),
+        X86_OP_ENTRY2(VCVTPD2DQ, V,x, W,x, vex2),
+    };
+    const X86OpEntry *chosen = decode_by_prefix(s, dq_pd_conversions);
+
+    *entry = *chosen;
+}
+
+static const X86OpEntry opcodes_0F[256] = { /* two-byte (0F xx) map; do_decode_0F() indexes it with the opcode byte in *b */
+ [0x0E] = X86_OP_ENTRY0(EMMS, cpuid(3DNOW)), /* femms */
+ /*
+ * 3DNow!'s opcode byte comes *after* modrm and displacements, making it
+ * more like an Ib operand. Dispatch to the right helper in a single gen_*
+ * function.
+ */
+ [0x0F] = X86_OP_ENTRY3(3dnow, P,q, Q,q, I,b, cpuid(3DNOW)),
+
+ [0x10] = X86_OP_GROUP0(0F10), /* see decode_0F10() */
+ [0x11] = X86_OP_GROUP0(0F11), /* see decode_0F11() */
+ [0x12] = X86_OP_GROUP0(0F12), /* see decode_0F12() */
+ [0x13] = X86_OP_ENTRY3(VMOVLPx_st, M,q, None,None, V,q, vex4 p_00_66),
+ [0x14] = X86_OP_ENTRY3(VUNPCKLPx, V,x, H,x, W,x, vex4 p_00_66),
+ [0x15] = X86_OP_ENTRY3(VUNPCKHPx, V,x, H,x, W,x, vex4 p_00_66),
+ [0x16] = X86_OP_GROUP0(0F16), /* see decode_0F16() */
+ /* Incorrectly listed as Mq,Vq in the manual */
+ [0x17] = X86_OP_ENTRY3(VMOVHPx_st, M,q, None,None, V,dq, vex4 p_00_66),
+
+ [0x50] = X86_OP_ENTRY3(MOVMSK, G,y, None,None, U,x, vex7 p_00_66),
+ [0x51] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2), /* gen function and operand 1 fixed up in decode_sse_unary() */
+ [0x52] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex5 p_00_f3),
+ [0x53] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex5 p_00_f3),
+ [0x54] = X86_OP_ENTRY3(PAND, V,x, H,x, W,x, vex4 p_00_66), /* vand */
+ [0x55] = X86_OP_ENTRY3(PANDN, V,x, H,x, W,x, vex4 p_00_66), /* vandn */
+ [0x56] = X86_OP_ENTRY3(POR, V,x, H,x, W,x, vex4 p_00_66), /* vor */
+ [0x57] = X86_OP_ENTRY3(PXOR, V,x, H,x, W,x, vex4 p_00_66), /* vxor */
+
+ [0x60] = X86_OP_ENTRY3(PUNPCKLBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x61] = X86_OP_ENTRY3(PUNPCKLWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x62] = X86_OP_ENTRY3(PUNPCKLDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x63] = X86_OP_ENTRY3(PACKSSWB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x64] = X86_OP_ENTRY3(PCMPGTB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x65] = X86_OP_ENTRY3(PCMPGTW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x66] = X86_OP_ENTRY3(PCMPGTD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x67] = X86_OP_ENTRY3(PACKUSWB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+
+ [0x70] = X86_OP_GROUP0(0F70),
+ [0x71] = X86_OP_GROUP0(group12),
+ [0x72] = X86_OP_GROUP0(group13),
+ [0x73] = X86_OP_GROUP0(group14),
+ [0x74] = X86_OP_ENTRY3(PCMPEQB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x75] = X86_OP_ENTRY3(PCMPEQW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x76] = X86_OP_ENTRY3(PCMPEQD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x77] = X86_OP_GROUP0(0F77),
+
+ [0x28] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex1 p_00_66), /* MOVAPS */
+ [0x29] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 p_00_66), /* MOVAPS */
+ [0x2A] = X86_OP_GROUP0(0F2A), /* see decode_0F2A() */
+ [0x2B] = X86_OP_GROUP0(0F2B), /* see decode_0F2B() */
+ [0x2C] = X86_OP_GROUP0(0F2C), /* see decode_0F2C() */
+ [0x2D] = X86_OP_GROUP0(0F2D), /* see decode_0F2D() */
+ [0x2E] = X86_OP_ENTRY3(VUCOMI, None,None, V,x, W,x, vex4 p_00_66),
+ [0x2F] = X86_OP_ENTRY3(VCOMI, None,None, V,x, W,x, vex4 p_00_66),
+
+ [0x38] = X86_OP_GROUP0(0F38), /* three-byte map, see decode_0F38() */
+ [0x3a] = X86_OP_GROUP0(0F3A), /* three-byte map, see decode_0F3A() */
+
+ [0x58] = X86_OP_ENTRY3(VADD, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0x59] = X86_OP_ENTRY3(VMUL, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0x5a] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex3 p_00_66_f3_f2),
+ [0x5b] = X86_OP_GROUP0(0F5B), /* see decode_0F5B() */
+ [0x5c] = X86_OP_ENTRY3(VSUB, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0x5d] = X86_OP_ENTRY3(VMIN, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0x5e] = X86_OP_ENTRY3(VDIV, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0x5f] = X86_OP_ENTRY3(VMAX, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+
+ [0x68] = X86_OP_ENTRY3(PUNPCKHBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x69] = X86_OP_ENTRY3(PUNPCKHWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x6a] = X86_OP_ENTRY3(PUNPCKHDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x6b] = X86_OP_ENTRY3(PACKSSDW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0x6c] = X86_OP_ENTRY3(PUNPCKLQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256),
+ [0x6d] = X86_OP_ENTRY3(PUNPCKHQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256),
+ [0x6e] = X86_OP_ENTRY3(MOVD_to, V,x, None,None, E,y, vex5 mmx p_00_66), /* wrong dest Vy on SDM! */
+ [0x6f] = X86_OP_GROUP0(0F6F),
+
+ [0x78] = X86_OP_GROUP0(0F78),
+ [0x79] = X86_OP_GROUP2(0F79, V,x, U,x, cpuid(SSE4A)),
+ [0x7c] = X86_OP_ENTRY3(VHADD, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2),
+ [0x7d] = X86_OP_ENTRY3(VHSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2),
+ [0x7e] = X86_OP_GROUP0(0F7E),
+ [0x7f] = X86_OP_GROUP0(0F7F),
+
+ [0xae] = X86_OP_GROUP0(group15),
+
+ [0xc2] = X86_OP_ENTRY4(VCMP, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
+ [0xc4] = X86_OP_ENTRY4(PINSRW, V,dq,H,dq,E,w, vex5 mmx p_00_66),
+ [0xc5] = X86_OP_ENTRY3(PEXTRW, G,d, U,dq,I,b, vex5 mmx p_00_66),
+ [0xc6] = X86_OP_ENTRY4(VSHUF, V,x, H,x, W,x, vex4 p_00_66),
+
+ [0xd0] = X86_OP_ENTRY3(VADDSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2),
+ [0xd1] = X86_OP_ENTRY3(PSRLW_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd2] = X86_OP_ENTRY3(PSRLD_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd3] = X86_OP_ENTRY3(PSRLQ_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd4] = X86_OP_ENTRY3(PADDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd5] = X86_OP_ENTRY3(PMULLW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd6] = X86_OP_GROUP0(0FD6),
+ [0xd7] = X86_OP_ENTRY3(PMOVMSKB, G,d, None,None, U,x, vex7 mmx avx2_256 p_00_66),
+
+ [0xe0] = X86_OP_ENTRY3(PAVGB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xe1] = X86_OP_ENTRY3(PSRAW_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66),
+ [0xe2] = X86_OP_ENTRY3(PSRAD_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66),
+ [0xe3] = X86_OP_ENTRY3(PAVGW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xe4] = X86_OP_ENTRY3(PMULHUW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xe5] = X86_OP_ENTRY3(PMULHW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xe6] = X86_OP_GROUP0(0FE6), /* see decode_0FE6() */
+ [0xe7] = X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex1 mmx p_00_66), /* MOVNTQ/MOVNTDQ */
+
+ [0xf0] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, WM,x, vex4_unal cpuid(SSE3) p_f2), /* LDDQU */
+ [0xf1] = X86_OP_ENTRY3(PSLLW_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66),
+ [0xf2] = X86_OP_ENTRY3(PSLLD_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66),
+ [0xf3] = X86_OP_ENTRY3(PSLLQ_r, V,x, H,x, W,x, vex7 mmx avx2_256 p_00_66),
+ [0xf4] = X86_OP_ENTRY3(PMULUDQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xf5] = X86_OP_ENTRY3(PMADDWD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xf6] = X86_OP_ENTRY3(PSADBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xf7] = X86_OP_ENTRY3(MASKMOV, None,None, V,dq, U,dq, vex4_unal avx2_256 mmx p_00_66),
+
+ /* Incorrectly missing from 2-17 */
+ [0xd8] = X86_OP_ENTRY3(PSUBUSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xd9] = X86_OP_ENTRY3(PSUBUSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xda] = X86_OP_ENTRY3(PMINUB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xdb] = X86_OP_ENTRY3(PAND, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xdc] = X86_OP_ENTRY3(PADDUSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xdd] = X86_OP_ENTRY3(PADDUSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xde] = X86_OP_ENTRY3(PMAXUB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xdf] = X86_OP_ENTRY3(PANDN, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+
+ [0xe8] = X86_OP_ENTRY3(PSUBSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xe9] = X86_OP_ENTRY3(PSUBSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xea] = X86_OP_ENTRY3(PMINSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xeb] = X86_OP_ENTRY3(POR, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xec] = X86_OP_ENTRY3(PADDSB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xed] = X86_OP_ENTRY3(PADDSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xee] = X86_OP_ENTRY3(PMAXSW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xef] = X86_OP_ENTRY3(PXOR, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+
+ [0xf8] = X86_OP_ENTRY3(PSUBB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xf9] = X86_OP_ENTRY3(PSUBW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xfa] = X86_OP_ENTRY3(PSUBD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xfb] = X86_OP_ENTRY3(PSUBQ, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xfc] = X86_OP_ENTRY3(PADDB, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xfd] = X86_OP_ENTRY3(PADDW, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ [0xfe] = X86_OP_ENTRY3(PADDD, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+ /* 0xff = UD0 */
+};
+
+/*
+ * Look up an already-fetched second opcode byte (*b) in the two-byte
+ * opcode map and copy the entry into *entry.  Nothing is read from the
+ * instruction stream here.
+ */
+static void do_decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t opc = *b;
+
+    *entry = opcodes_0F[opc];
+}
+
+/*
+ * Decoder for the 0F escape: fetch the second opcode byte into *b, then
+ * let do_decode_0F() perform the table lookup.
+ */
+static void decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t opc = x86_ldub_code(env, s);
+
+    *b = opc;
+    do_decode_0F(s, env, entry, b);
+}
+
+/*
+ * One-byte opcode map, indexed by the first opcode byte in decode_root().
+ * Only the 0F escape is populated; every other slot is zero-initialized.
+ */
+static const X86OpEntry opcodes_root[256] = {
+    [0x0F] = X86_OP_GROUP0(0F),
+};
+
+#undef mmx
+#undef vex1
+#undef vex2
+#undef vex3
+#undef vex4
+#undef vex4_unal
+#undef vex5
+#undef vex6
+#undef vex7
+#undef vex8
+#undef vex11
+#undef vex12
+#undef vex13
+
+/*
+ * Entry point for decoding the fixed part of the opcode: look up the
+ * first opcode byte, passed in through *b, in the one-byte opcode map.
+ * Decoders reached from that entry may replace *b with a later opcode
+ * byte, so that on completion b holds the last one.
+ */
+static void decode_root(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    uint8_t opc = *b;
+
+    *entry = opcodes_root[opc];
+}
+
+
+/*
+ * Decode the r/m part of the modrm byte into *op.
+ *
+ * Memory form (mod != 3): mark the operand as having an effective address,
+ * set op->n to -1 and store the address components in decode->mem.
+ *
+ * Register form (mod == 3): op->n receives the r/m field, extended with
+ * REX.B except for the MMX operand types Q and N.  A LOCK prefix is not
+ * accepted with a register operand: the instruction's generator is
+ * replaced with gen_illegal and 0xff is returned.
+ *
+ * In every other case the modrm byte is returned.
+ */
+static int decode_modrm(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                        X86DecodedOp *op, X86OpType type)
+{
+    int modrm = get_modrm(s, env);
+    bool is_mmx_operand = (type == X86_TYPE_Q || type == X86_TYPE_N);
+
+    if ((modrm >> 6) != 3) {
+        op->has_ea = true;
+        op->n = -1;
+        decode->mem = gen_lea_modrm_0(env, s, get_modrm(s, env));
+        return modrm;
+    }
+
+    if (s->prefix & PREFIX_LOCK) {
+        decode->e.gen = gen_illegal;
+        return 0xff;
+    }
+
+    op->n = (modrm & 7);
+    if (!is_mmx_operand) {
+        op->n |= REX_B(s);
+    }
+    return modrm;
+}
+
+/*
+ * Translate the operand size code "size" of table entry "e" into a MemOp,
+ * stored in *ot.
+ *
+ * Returns false when the size is incompatible with VEX.L: a 256-bit (qq)
+ * operand without VEX.L, or a 128-bit (dq) operand with VEX.L unless
+ * operand 0 or operand 1 of the entry is declared qq.  Returns true
+ * otherwise; size codes not handled here store -1 in *ot.
+ */
+static bool decode_op_size(DisasContext *s, X86OpEntry *e, X86OpSize size, MemOp *ot)
+{
+    /* MMX-capable entry with none of 66/F3/F2: the operand is 64 bits wide. */
+    bool mmx_form = e->special == X86_SPECIAL_MMX &&
+        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ));
+
+    switch (size) {
+    /* Fixed sizes. */
+    case X86_SIZE_b:  /* byte */
+        *ot = MO_8;
+        break;
+
+    case X86_SIZE_w:  /* 16-bit */
+        *ot = MO_16;
+        break;
+
+    case X86_SIZE_d:  /* 32-bit */
+    case X86_SIZE_ss: /* SSE/AVX scalar single precision */
+        *ot = MO_32;
+        break;
+
+    case X86_SIZE_q:  /* 64-bit */
+    case X86_SIZE_pi: /* MMX */
+    case X86_SIZE_sd: /* SSE/AVX scalar double precision */
+        *ot = MO_64;
+        break;
+
+    /* Sizes that follow the operand size of the instruction. */
+    case X86_SIZE_v:  /* 16/32/64-bit, based on operand size */
+    case X86_SIZE_p:  /* Far pointer, return offset size */
+    case X86_SIZE_s:  /* Descriptor, return offset size */
+        *ot = s->dflag;
+        break;
+
+    case X86_SIZE_y:  /* 32/64-bit, based on operand size */
+        if (s->dflag == MO_16) {
+            *ot = MO_32;
+        } else {
+            *ot = s->dflag;
+        }
+        break;
+
+    case X86_SIZE_z:  /* 16-bit for 16-bit operand size, else 32-bit */
+        if (s->dflag == MO_16) {
+            *ot = MO_16;
+        } else {
+            *ot = MO_32;
+        }
+        break;
+
+    case X86_SIZE_d64: /* Default to 64-bit in 64-bit mode */
+        if (CODE64(s) && s->dflag == MO_32) {
+            *ot = MO_64;
+        } else {
+            *ot = s->dflag;
+        }
+        break;
+
+    case X86_SIZE_f64: /* Ignore size override prefix in 64-bit mode */
+        if (CODE64(s)) {
+            *ot = MO_64;
+        } else {
+            *ot = s->dflag;
+        }
+        break;
+
+    /* Vector sizes. */
+    case X86_SIZE_dq: /* SSE/AVX 128-bit */
+        if (mmx_form) {
+            *ot = MO_64;
+            break;
+        }
+        if (s->vex_l && e->s0 != X86_SIZE_qq && e->s1 != X86_SIZE_qq) {
+            return false;
+        }
+        *ot = MO_128;
+        break;
+
+    case X86_SIZE_qq: /* AVX 256-bit */
+        if (!s->vex_l) {
+            return false;
+        }
+        *ot = MO_256;
+        break;
+
+    case X86_SIZE_x:  /* 128/256-bit, based on operand size */
+        if (mmx_form) {
+            *ot = MO_64;
+            break;
+        }
+        /* fall through */
+    case X86_SIZE_ps: /* SSE/AVX packed single precision */
+    case X86_SIZE_pd: /* SSE/AVX packed double precision */
+        if (s->vex_l) {
+            *ot = MO_256;
+        } else {
+            *ot = MO_128;
+        }
+        break;
+
+    default:
+        *ot = -1;
+        break;
+    }
+
+    return true;
+}
+
+static bool decode_op(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ X86DecodedOp *op, X86OpType type, int b)
+{ /*
+ * Decode one operand of kind "type" into *op: depending on the kind this
+ * sets op->unit and op->n, and may fill decode->mem or decode->immediate.
+ * "b" is the opcode byte; it is only used by X86_TYPE_LoBits.
+ *
+ * Returns false if the modrm byte has the wrong form for the operand (a
+ * memory form for a register-only operand or vice versa), true otherwise.
+ */
+ int modrm;
+
+ switch (type) {
+ case X86_TYPE_None: /* Implicit or absent */
+ case X86_TYPE_A: /* Implicit */
+ case X86_TYPE_F: /* EFLAGS/RFLAGS */
+ break;
+
+ case X86_TYPE_B: /* VEX.vvvv selects a GPR */
+ op->unit = X86_OP_INT;
+ op->n = s->vex_v;
+ break;
+
+ case X86_TYPE_C: /* REG in the modrm byte selects a control register */
+ op->unit = X86_OP_CR;
+ goto get_reg;
+
+ case X86_TYPE_D: /* REG in the modrm byte selects a debug register */
+ op->unit = X86_OP_DR;
+ goto get_reg;
+
+ case X86_TYPE_G: /* REG in the modrm byte selects a GPR */
+ op->unit = X86_OP_INT;
+ goto get_reg;
+
+ case X86_TYPE_S: /* reg selects a segment register */
+ op->unit = X86_OP_SEG;
+ goto get_reg;
+
+ case X86_TYPE_P: /* REG in the modrm byte selects an MMX register */
+ op->unit = X86_OP_MMX;
+ goto get_reg;
+
+ case X86_TYPE_V: /* reg in the modrm byte selects an XMM/YMM register */
+ if (decode->e.special == X86_SPECIAL_MMX &&
+ !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
+ op->unit = X86_OP_MMX; /* MMX-capable entry with none of 66/F3/F2 */
+ } else {
+ op->unit = X86_OP_SSE;
+ }
+ get_reg: /* shared tail of all operand kinds that use the modrm reg field */
+ op->n = ((get_modrm(s, env) >> 3) & 7) | REX_R(s);
+ break;
+
+ case X86_TYPE_E: /* ALU modrm operand */
+ op->unit = X86_OP_INT;
+ goto get_modrm;
+
+ case X86_TYPE_Q: /* MMX modrm operand */
+ op->unit = X86_OP_MMX;
+ goto get_modrm;
+
+ case X86_TYPE_W: /* XMM/YMM modrm operand */
+ if (decode->e.special == X86_SPECIAL_MMX &&
+ !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
+ op->unit = X86_OP_MMX; /* MMX-capable entry with none of 66/F3/F2 */
+ } else {
+ op->unit = X86_OP_SSE;
+ }
+ goto get_modrm;
+
+ case X86_TYPE_N: /* R/M in the modrm byte selects an MMX register */
+ op->unit = X86_OP_MMX;
+ goto get_modrm_reg;
+
+ case X86_TYPE_U: /* R/M in the modrm byte selects an XMM/YMM register */
+ if (decode->e.special == X86_SPECIAL_MMX &&
+ !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
+ op->unit = X86_OP_MMX; /* MMX-capable entry with none of 66/F3/F2 */
+ } else {
+ op->unit = X86_OP_SSE;
+ }
+ goto get_modrm_reg;
+
+ case X86_TYPE_R: /* R/M in the modrm byte selects a register */
+ op->unit = X86_OP_INT;
+ get_modrm_reg: /* register-only operand kinds (N, U, R) */
+ modrm = get_modrm(s, env);
+ if ((modrm >> 6) != 3) {
+ return false; /* a memory form is not a valid encoding here */
+ }
+ goto get_modrm;
+
+ case X86_TYPE_WM: /* modrm byte selects an XMM/YMM memory operand */
+ op->unit = X86_OP_SSE;
+ /* fall through */
+ case X86_TYPE_M: /* modrm byte selects a memory operand */
+ modrm = get_modrm(s, env);
+ if ((modrm >> 6) == 3) {
+ return false; /* a register form is not a valid encoding here */
+ }
+ get_modrm: /* shared tail of all operand kinds that use the modrm r/m field */
+ decode_modrm(s, env, decode, op, type); /* return value is not used */
+ break;
+
+ case X86_TYPE_O: /* Absolute address encoded in the instruction */
+ op->unit = X86_OP_INT;
+ op->has_ea = true;
+ op->n = -1;
+ decode->mem = (AddressParts) {
+ .def_seg = R_DS,
+ .base = -1,
+ .index = -1,
+ .disp = insn_get_addr(env, s, s->aflag)
+ };
+ break;
+
+ case X86_TYPE_H: /* For AVX, VEX.vvvv selects an XMM/YMM register */
+ if ((s->prefix & PREFIX_VEX)) {
+ op->unit = X86_OP_SSE;
+ op->n = s->vex_v;
+ break;
+ }
+ /* Without VEX, decode this operand again using the type of another operand of the entry. */
+ if (op == &decode->op[0]) {
+ /* shifts place the destination in VEX.vvvv, use modrm */
+ return decode_op(s, env, decode, op, decode->e.op1, b);
+ } else {
+ return decode_op(s, env, decode, op, decode->e.op0, b);
+ }
+
+ case X86_TYPE_I: /* Immediate */
+ op->unit = X86_OP_IMM;
+ decode->immediate = insn_get_signed(env, s, op->ot); /* width taken from op->ot */
+ break;
+
+ case X86_TYPE_J: /* Relative offset for a jump */
+ op->unit = X86_OP_IMM;
+ decode->immediate = insn_get_signed(env, s, op->ot);
+ decode->immediate += s->pc - s->cs_base; /* offset is applied to s->pc - s->cs_base */
+ if (s->dflag == MO_16) {
+ decode->immediate &= 0xffff; /* wrap to 16 bits */
+ } else if (!CODE64(s)) {
+ decode->immediate &= 0xffffffffu; /* wrap to 32 bits outside 64-bit code */
+ }
+ break;
+
+ case X86_TYPE_L: /* The upper 4 bits of the immediate select a 128-bit register */
+ op->n = insn_get(env, s, op->ot) >> 4;
+ break;
+
+ case X86_TYPE_X: /* string source */
+ op->n = -1;
+ decode->mem = (AddressParts) {
+ .def_seg = R_DS,
+ .base = R_ESI,
+ .index = -1,
+ };
+ break;
+
+ case X86_TYPE_Y: /* string destination */
+ op->n = -1;
+ decode->mem = (AddressParts) {
+ .def_seg = R_ES,
+ .base = R_EDI,
+ .index = -1,
+ };
+ break;
+
+ case X86_TYPE_2op: /* same operand as operand 0: copy its decoded form */
+ *op = decode->op[0];
+ break;
+
+ case X86_TYPE_LoBits: /* low three bits of the opcode byte, plus REX.B, select a GPR */
+ op->n = (b & 7) | REX_B(s);
+ op->unit = X86_OP_INT;
+ break;
+
+ case X86_TYPE_0 ... X86_TYPE_7: /* fixed GPR number given by the operand type itself */
+ op->n = type - X86_TYPE_0;
+ op->unit = X86_OP_INT;
+ break;
+
+ case X86_TYPE_ES ... X86_TYPE_GS: /* fixed segment register given by the operand type itself */
+ op->n = type - X86_TYPE_ES;
+ op->unit = X86_OP_SEG;
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * Check the 66/F3/F2 prefixes of the instruction against the set that the
+ * table entry accepts.  Entries with an empty valid_prefix mask accept
+ * anything.  The value of the relevant prefix bits is used as a bit number
+ * in e->valid_prefix.
+ *
+ * Side effect: for entries with a non-empty mask, PREFIX_DATA is cleared
+ * from s->prefix when F3 or F2 is present, because those cancel 66 in SSE
+ * instructions.
+ */
+static bool validate_sse_prefix(DisasContext *s, X86OpEntry *e)
+{
+    const int rep_mask = PREFIX_REPZ | PREFIX_REPNZ;
+    uint16_t active;
+
+    if (e->valid_prefix == 0) {
+        return true;
+    }
+
+    if ((s->prefix & rep_mask) != 0) {
+        s->prefix &= ~PREFIX_DATA;
+    }
+
+    /* PREFIX_DATA can no longer be set together with a REP prefix. */
+    active = s->prefix & (PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA);
+    return (e->valid_prefix & (1 << active)) != 0;
+}
+
+/*
+ * Decode one instruction into *decode.
+ *
+ * decode_func produces the initial table entry; as long as the entry is
+ * itself a decoder (is_decode), that decoder is run to refine it.  Then
+ * the prefixes are validated, the operand sizes are computed, and finally
+ * the operands are decoded in order.
+ *
+ * Returns false if the prefixes, an operand size or an operand form is
+ * not valid for the entry.
+ */
+static bool decode_insn(DisasContext *s, CPUX86State *env, X86DecodeFunc decode_func,
+                        X86DecodedInsn *decode)
+{
+    X86OpEntry *e = &decode->e;
+    X86OpType types[3];
+    X86OpSize sizes[3];
+    int i;
+
+    decode_func(s, env, e, &decode->b);
+    while (e->is_decode) {
+        e->is_decode = false;
+        e->decode(s, env, e, &decode->b);
+    }
+
+    if (!validate_sse_prefix(s, e)) {
+        return false;
+    }
+
+    types[0] = e->op0;
+    types[1] = e->op1;
+    types[2] = e->op2;
+    sizes[0] = e->s0;
+    sizes[1] = e->s1;
+    sizes[2] = e->s2;
+
+    /*
+     * All sizes are computed before any operand is decoded, so that
+     * s->rip_offset already accounts for every immediate.
+     */
+    for (i = 0; i < 3; i++) {
+        if (types[i] == X86_TYPE_None) {
+            continue;
+        }
+        if (!decode_op_size(s, e, sizes[i], &decode->op[i].ot)) {
+            return false;
+        }
+        if (types[i] == X86_TYPE_I) {
+            s->rip_offset += 1 << decode->op[i].ot;
+        }
+    }
+
+    if (e->op3 != X86_TYPE_None) {
+        /*
+         * The fourth operand is always a byte immediate.  The few
+         * instructions that use that byte to select a register (Lx)
+         * deal with it on their own in the gen_* functions.
+         */
+        assert(e->op3 == X86_TYPE_I && e->s3 == X86_SIZE_b);
+        s->rip_offset += 1;
+    }
+
+    for (i = 0; i < 3; i++) {
+        if (types[i] == X86_TYPE_None) {
+            continue;
+        }
+        if (!decode_op(s, env, decode, &decode->op[i], types[i], decode->b)) {
+            return false;
+        }
+    }
+
+    if (e->op3 != X86_TYPE_None) {
+        decode->immediate = insn_get_signed(env, s, MO_8);
+    }
+
+    return true;
+}
+
+ /*
+  * Return true if the guest CPU advertises the CPUID feature required by a
+  * decode table entry.
+  *
+  * Each case tests the feature bit against the DisasContext word that holds
+  * the matching CPUID register. CPUID_SSE and CPUID_SSE2 are CPUID.1:EDX
+  * bits, so they are looked up in s->cpuid_features; the CPUID_EXT_* bits
+  * (CPUID.1:ECX) live in s->cpuid_ext_features.
+  *
+  * Every X86CPUIDFeature value must be handled; an unknown value asserts.
+  */
+ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
+ {
+     switch (cpuid) {
+     case X86_FEAT_None:
+         return true;
+     case X86_FEAT_MOVBE:
+         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
+     case X86_FEAT_PCLMULQDQ:
+         return (s->cpuid_ext_features & CPUID_EXT_PCLMULQDQ);
+     case X86_FEAT_SSE:
+         /* EDX bit: must be tested against cpuid_features, not the ECX word. */
+         return (s->cpuid_features & CPUID_SSE);
+     case X86_FEAT_SSE2:
+         /* EDX bit: must be tested against cpuid_features, not the ECX word. */
+         return (s->cpuid_features & CPUID_SSE2);
+     case X86_FEAT_SSE3:
+         return (s->cpuid_ext_features & CPUID_EXT_SSE3);
+     case X86_FEAT_SSSE3:
+         return (s->cpuid_ext_features & CPUID_EXT_SSSE3);
+     case X86_FEAT_SSE41:
+         return (s->cpuid_ext_features & CPUID_EXT_SSE41);
+     case X86_FEAT_SSE42:
+         return (s->cpuid_ext_features & CPUID_EXT_SSE42);
+     case X86_FEAT_AES:
+         /*
+          * Legacy encoding needs only AES; the VEX encoding additionally
+          * needs AVX, and VEX.L=1 additionally needs VAES.
+          */
+         if (!(s->cpuid_ext_features & CPUID_EXT_AES)) {
+             return false;
+         } else if (!(s->prefix & PREFIX_VEX)) {
+             return true;
+         } else if (!(s->cpuid_ext_features & CPUID_EXT_AVX)) {
+             return false;
+         } else {
+             return !s->vex_l || (s->cpuid_7_0_ecx_features & CPUID_7_0_ECX_VAES);
+         }
+
+     case X86_FEAT_AVX:
+         return (s->cpuid_ext_features & CPUID_EXT_AVX);
+
+     case X86_FEAT_3DNOW:
+         return (s->cpuid_ext2_features & CPUID_EXT2_3DNOW);
+     case X86_FEAT_SSE4A:
+         return (s->cpuid_ext3_features & CPUID_EXT3_SSE4A);
+
+     case X86_FEAT_ADX:
+         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX);
+     case X86_FEAT_BMI1:
+         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1);
+     case X86_FEAT_BMI2:
+         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2);
+     case X86_FEAT_AVX2:
+         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2);
+     }
+     g_assert_not_reached();
+ }
+
+ /*
+ * Apply the legality rules selected by the entry's vex_special and
+ * vex_class fields to a decoded instruction.
+ *
+ * Returns true if translation may continue. On failure the exception has
+ * already been generated here (gen_illegal_opcode or gen_NM_exception) and
+ * false is returned; the caller only has to stop.
+ *
+ * Side effects: the X86_VEX_REPScalar case may rewrite e->vex_class and
+ * decode->op[2].ot.
+ */
+ static bool validate_vex(DisasContext *s, X86DecodedInsn *decode)
+ {
+ X86OpEntry *e = &decode->e;
+
+ switch (e->vex_special) {
+ case X86_VEX_REPScalar:
+ /*
+ * Instructions which differ between 00/66 and F2/F3 in the
+ * exception classification and the size of the memory operand.
+ */
+ assert(e->vex_class == 1 || e->vex_class == 2);
+ if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+ e->vex_class = 3;
+ if (s->vex_l) {
+ goto illegal;
+ }
+ assert(decode->e.s2 == X86_SIZE_x);
+ /* Scalar memory operand: 32 bits with F3, 64 bits with F2. */
+ if (decode->op[2].has_ea) {
+ decode->op[2].ot = s->prefix & PREFIX_REPZ ? MO_32 : MO_64;
+ }
+ }
+ break;
+
+ case X86_VEX_SSEUnaligned:
+ /* handled in sse_needs_alignment. */
+ break;
+
+ case X86_VEX_AVX2_256:
+ if ((s->prefix & PREFIX_VEX) && s->vex_l && !has_cpuid_feature(s, X86_FEAT_AVX2)) {
+ goto illegal;
+ }
+ }
+
+ /* TODO: instructions that require VEX.W=0 (Table 2-16) */
+
+ switch (e->vex_class) {
+ case 0:
+ /* Class 0: the VEX encoding is not allowed at all. */
+ if (s->prefix & PREFIX_VEX) {
+ goto illegal;
+ }
+ return true;
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 7:
+ /* Either encoding is accepted; each needs its own enable flag. */
+ if (s->prefix & PREFIX_VEX) {
+ if (!(s->flags & HF_AVX_EN_MASK)) {
+ goto illegal;
+ }
+ } else {
+ if (!(s->flags & HF_OSFXSR_MASK)) {
+ goto illegal;
+ }
+ }
+ break;
+ case 12:
+ /* Must have a VSIB byte and no address prefix. */
+ assert(s->has_modrm);
+ if ((s->modrm & 7) != 4 || s->aflag == MO_16) {
+ goto illegal;
+ }
+
+ /* Check no overlap between registers. */
+ if (!decode->op[0].has_ea &&
+ (decode->op[0].n == decode->mem.index || decode->op[0].n == decode->op[1].n)) {
+ goto illegal;
+ }
+ assert(!decode->op[1].has_ea);
+ if (decode->op[1].n == decode->mem.index) {
+ goto illegal;
+ }
+ if (!decode->op[2].has_ea &&
+ (decode->op[2].n == decode->mem.index || decode->op[2].n == decode->op[1].n)) {
+ goto illegal;
+ }
+ /* fall through */
+ case 6:
+ case 11:
+ /* VEX-only classes: the legacy encoding is rejected. */
+ if (!(s->prefix & PREFIX_VEX)) {
+ goto illegal;
+ }
+ if (!(s->flags & HF_AVX_EN_MASK)) {
+ goto illegal;
+ }
+ break;
+ case 8:
+ /* Non-VEX case handled in decode_0F77. */
+ assert(s->prefix & PREFIX_VEX);
+ if (!(s->flags & HF_AVX_EN_MASK)) {
+ goto illegal;
+ }
+ break;
+ case 13:
+ /* VEX-only with VEX.L=0; skips the vvvv and TS/EM checks below. */
+ if (!(s->prefix & PREFIX_VEX)) {
+ goto illegal;
+ }
+ if (s->vex_l) {
+ goto illegal;
+ }
+ /* All integer instructions use VEX.vvvv, so exit. */
+ return true;
+ }
+
+ /*
+ * s->vex_v must be zero unless one of the operands has type H or B,
+ * i.e. is selected by VEX.vvvv.
+ */
+ if (s->vex_v != 0 &&
+ e->op0 != X86_TYPE_H && e->op0 != X86_TYPE_B &&
+ e->op1 != X86_TYPE_H && e->op1 != X86_TYPE_B &&
+ e->op2 != X86_TYPE_H && e->op2 != X86_TYPE_B) {
+ goto illegal;
+ }
+
+ /* HF_TS_MASK is checked first, so it wins over HF_EM_MASK. */
+ if (s->flags & HF_TS_MASK) {
+ goto nm_exception;
+ }
+ if (s->flags & HF_EM_MASK) {
+ goto illegal;
+ }
+ return true;
+
+ nm_exception:
+ gen_NM_exception(s);
+ return false;
+ illegal:
+ gen_illegal_opcode(s);
+ return false;
+ }
+
+ /* Release the pointer temporary cached in one decoded operand, if any. */
+ static void decode_temp_free(X86DecodedOp *op)
+ {
+     TCGv_ptr cached = op->v_ptr;
+
+     if (!cached) {
+         return;
+     }
+     tcg_temp_free_ptr(cached);
+ }
+
+ /* Release the cached pointer temporaries of all operands, in operand order. */
+ static void decode_temps_free(X86DecodedInsn *decode)
+ {
+     size_t i;
+
+     for (i = 0; i < ARRAY_SIZE(decode->op); i++) {
+         decode_temp_free(&decode->op[i]);
+     }
+ }
+
+/*
+ * Convert one instruction. s->base.is_jmp is set if the translation must
+ * be stopped.
+ */
+static void disas_insn_new(DisasContext *s, CPUState *cpu, int b)
+{
+ CPUX86State *env = cpu->env_ptr;
+ bool first = true;
+ X86DecodedInsn decode;
+ X86DecodeFunc decode_func = decode_root;
+
+ s->has_modrm = false;
+
+ next_byte:
+ if (first) {
+ first = false;
+ } else {
+ b = x86_ldub_code(env, s);
+ }
+ /* Collect prefixes. */
+ switch (b) {
+ case 0xf3:
+ s->prefix |= PREFIX_REPZ;
+ s->prefix &= ~PREFIX_REPNZ;
+ goto next_byte;
+ case 0xf2:
+ s->prefix |= PREFIX_REPNZ;
+ s->prefix &= ~PREFIX_REPZ;
+ goto next_byte;
+ case 0xf0:
+ s->prefix |= PREFIX_LOCK;
+ goto next_byte;
+ case 0x2e:
+ s->override = R_CS;
+ goto next_byte;
+ case 0x36:
+ s->override = R_SS;
+ goto next_byte;
+ case 0x3e:
+ s->override = R_DS;
+ goto next_byte;
+ case 0x26:
+ s->override = R_ES;
+ goto next_byte;
+ case 0x64:
+ s->override = R_FS;
+ goto next_byte;
+ case 0x65:
+ s->override = R_GS;
+ goto next_byte;
+ case 0x66:
+ s->prefix |= PREFIX_DATA;
+ goto next_byte;
+ case 0x67:
+ s->prefix |= PREFIX_ADR;
+ goto next_byte;
+#ifdef TARGET_X86_64
+ case 0x40 ... 0x4f:
+ if (CODE64(s)) {
+ /* REX prefix */
+ s->prefix |= PREFIX_REX;
+ s->vex_w = (b >> 3) & 1;
+ s->rex_r = (b & 0x4) << 1;
+ s->rex_x = (b & 0x2) << 2;
+ s->rex_b = (b & 0x1) << 3;
+ goto next_byte;
+ }
+ break;
+#endif
+ case 0xc5: /* 2-byte VEX */
+ case 0xc4: /* 3-byte VEX */
+ /*
+ * VEX prefixes cannot be used except in 32-bit mode.
+ * Otherwise the instruction is LES or LDS.
+ */
+ if (CODE32(s) && !VM86(s)) {
+ static const int pp_prefix[4] = {
+ 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
+ };
+ int vex3, vex2 = x86_ldub_code(env, s);
+
+ if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
+ /*
+ * 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
+ * otherwise the instruction is LES or LDS.
+ */
+ s->pc--; /* rewind the advance_pc() x86_ldub_code() did */
+ break;
+ }
+
+ /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
+ if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ
+ | PREFIX_LOCK | PREFIX_DATA | PREFIX_REX)) {
+ goto illegal_op;
+ }
+#ifdef TARGET_X86_64
+ s->rex_r = (~vex2 >> 4) & 8;
+#endif
+ if (b == 0xc5) {
+ /* 2-byte VEX prefix: RVVVVlpp, implied 0f leading opcode byte */
+ vex3 = vex2;
+ decode_func = decode_0F;
+ } else {
+ /* 3-byte VEX prefix: RXBmmmmm wVVVVlpp */
+ vex3 = x86_ldub_code(env, s);
+#ifdef TARGET_X86_64
+ s->rex_x = (~vex2 >> 3) & 8;
+ s->rex_b = (~vex2 >> 2) & 8;
+#endif
+ s->vex_w = (vex3 >> 7) & 1;
+ switch (vex2 & 0x1f) {
+ case 0x01: /* Implied 0f leading opcode bytes. */
+ decode_func = decode_0F;
+ break;
+ case 0x02: /* Implied 0f 38 leading opcode bytes. */
+ decode_func = decode_0F38;
+ break;
+ case 0x03: /* Implied 0f 3a leading opcode bytes. */
+ decode_func = decode_0F3A;
+ break;
+ default: /* Reserved for future use. */
+ goto unknown_op;
+ }
+ }
+ s->vex_v = (~vex3 >> 3) & 0xf;
+ s->vex_l = (vex3 >> 2) & 1;
+ s->prefix |= pp_prefix[vex3 & 3] | PREFIX_VEX;
+ }
+ break;
+ default:
+ if (b >= 0x100) {
+ b -= 0x100;
+ decode_func = do_decode_0F;
+ }
+ break;
+ }
+
+ /* Post-process prefixes. */
+ if (CODE64(s)) {
+ /*
+ * In 64-bit mode, the default data size is 32-bit. Select 64-bit
+ * data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
+ * over 0x66 if both are present.
+ */
+ s->dflag = (REX_W(s) ? MO_64 : s->prefix & PREFIX_DATA ? MO_16 : MO_32);
+ /* In 64-bit mode, 0x67 selects 32-bit addressing. */
+ s->aflag = (s->prefix & PREFIX_ADR ? MO_32 : MO_64);
+ } else {
+ /* In 16/32-bit mode, 0x66 selects the opposite data size. */
+ if (CODE32(s) ^ ((s->prefix & PREFIX_DATA) != 0)) {
+ s->dflag = MO_32;
+ } else {
+ s->dflag = MO_16;
+ }
+ /* In 16/32-bit mode, 0x67 selects the opposite addressing. */
+ if (CODE32(s) ^ ((s->prefix & PREFIX_ADR) != 0)) {
+ s->aflag = MO_32;
+ } else {
+ s->aflag = MO_16;
+ }
+ }
+
+ memset(&decode, 0, sizeof(decode));
+ decode.b = b;
+ if (!decode_insn(s, env, decode_func, &decode)) {
+ goto illegal_op;
+ }
+ if (!decode.e.gen) {
+ goto unknown_op;
+ }
+
+ if (!has_cpuid_feature(s, decode.e.cpuid)) {
+ goto illegal_op;
+ }
+
+ switch (decode.e.special) {
+ case X86_SPECIAL_None:
+ break;
+
+ case X86_SPECIAL_Locked:
+ if (decode.op[0].has_ea) {
+ s->prefix |= PREFIX_LOCK;
+ }
+ break;
+
+ case X86_SPECIAL_ProtMode:
+ if (!PE(s) || VM86(s)) {
+ goto illegal_op;
+ }
+ break;
+
+ case X86_SPECIAL_i64:
+ if (CODE64(s)) {
+ goto illegal_op;
+ }
+ break;
+ case X86_SPECIAL_o64:
+ if (!CODE64(s)) {
+ goto illegal_op;
+ }
+ break;
+
+ case X86_SPECIAL_ZExtOp0:
+ assert(decode.op[0].unit == X86_OP_INT);
+ if (!decode.op[0].has_ea) {
+ decode.op[0].ot = MO_32;
+ }
+ break;
+
+ case X86_SPECIAL_ZExtOp2:
+ assert(decode.op[2].unit == X86_OP_INT);
+ if (!decode.op[2].has_ea) {
+ decode.op[2].ot = MO_32;
+ }
+ break;
+
+ case X86_SPECIAL_AVXExtMov:
+ if (!decode.op[2].has_ea) {
+ decode.op[2].ot = s->vex_l ? MO_256 : MO_128;
+ } else if (s->vex_l) {
+ decode.op[2].ot++;
+ }
+ break;
+
+ case X86_SPECIAL_MMX:
+ if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA))) {
+ gen_helper_enter_mmx(cpu_env);
+ }
+ break;
+ }
+
+ if (!validate_vex(s, &decode)) {
+ return;
+ }
+ if (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea) {
+ gen_load_ea(s, &decode.mem, decode.e.vex_class == 12);
+ }
+ if (s->prefix & PREFIX_LOCK) {
+ if (decode.op[0].unit != X86_OP_INT || !decode.op[0].has_ea) {
+ goto illegal_op;
+ }
+ gen_load(s, &decode, 2, s->T1);
+ decode.e.gen(s, env, &decode);
+ } else {
+ if (decode.op[0].unit == X86_OP_MMX) {
+ compute_mmx_offset(&decode.op[0]);
+ } else if (decode.op[0].unit == X86_OP_SSE) {
+ compute_xmm_offset(&decode.op[0]);
+ }
+ gen_load(s, &decode, 1, s->T0);
+ gen_load(s, &decode, 2, s->T1);
+ decode.e.gen(s, env, &decode);
+ gen_writeback(s, &decode, 0, s->T0);
+ }
+ decode_temps_free(&decode);
+ return;
+ illegal_op:
+ gen_illegal_opcode(s);
+ return;
+ unknown_op:
+ gen_unknown_opcode(env, s);
+}
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
new file mode 100644
index 0000000..f159c26
--- /dev/null
+++ b/target/i386/tcg/decode-new.h
@@ -0,0 +1,249 @@
+/*
+ * Decode table flags, mostly based on Intel SDM.
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+ /*
+ * Operand type (addressing method) of a decode table entry.
+ *
+ * The order of the enumerators matters: the operand decoder uses the case
+ * ranges X86_TYPE_0 ... X86_TYPE_7 and X86_TYPE_ES ... X86_TYPE_GS and
+ * subtracts the first member of the range to obtain the register number.
+ */
+ typedef enum X86OpType {
+ X86_TYPE_None,
+
+ X86_TYPE_A, /* Implicit */
+ X86_TYPE_B, /* VEX.vvvv selects a GPR */
+ X86_TYPE_C, /* REG in the modrm byte selects a control register */
+ X86_TYPE_D, /* REG in the modrm byte selects a debug register */
+ X86_TYPE_E, /* ALU modrm operand */
+ X86_TYPE_F, /* EFLAGS/RFLAGS */
+ X86_TYPE_G, /* REG in the modrm byte selects a GPR */
+ X86_TYPE_H, /* For AVX, VEX.vvvv selects an XMM/YMM register */
+ X86_TYPE_I, /* Immediate */
+ X86_TYPE_J, /* Relative offset for a jump */
+ X86_TYPE_L, /* The upper 4 bits of the immediate select a 128-bit register */
+ X86_TYPE_M, /* modrm byte selects a memory operand */
+ X86_TYPE_N, /* R/M in the modrm byte selects an MMX register */
+ X86_TYPE_O, /* Absolute address encoded in the instruction */
+ X86_TYPE_P, /* reg in the modrm byte selects an MMX register */
+ X86_TYPE_Q, /* MMX modrm operand */
+ X86_TYPE_R, /* R/M in the modrm byte selects a register */
+ X86_TYPE_S, /* reg selects a segment register */
+ X86_TYPE_U, /* R/M in the modrm byte selects an XMM/YMM register */
+ X86_TYPE_V, /* reg in the modrm byte selects an XMM/YMM register */
+ X86_TYPE_W, /* XMM/YMM modrm operand */
+ X86_TYPE_X, /* string source */
+ X86_TYPE_Y, /* string destination */
+
+ /* Custom */
+ X86_TYPE_WM, /* modrm byte selects an XMM/YMM memory operand */
+ X86_TYPE_2op, /* 2-operand RMW instruction */
+ X86_TYPE_LoBits, /* encoded in bits 0-2 of the operand + REX.B */
+ X86_TYPE_0, /* Hard-coded GPRs (RAX..RDI) */
+ X86_TYPE_1,
+ X86_TYPE_2,
+ X86_TYPE_3,
+ X86_TYPE_4,
+ X86_TYPE_5,
+ X86_TYPE_6,
+ X86_TYPE_7,
+ X86_TYPE_ES, /* Hard-coded segment registers */
+ X86_TYPE_CS,
+ X86_TYPE_SS,
+ X86_TYPE_DS,
+ X86_TYPE_FS,
+ X86_TYPE_GS,
+ } X86OpType;
+
+ /*
+ * Operand size code of a decode table entry; it is passed to
+ * decode_op_size(), which fills in the operand's MemOp.
+ *
+ * NOTE(review): X86_SIZE_d64 and X86_SIZE_f64 are not described here;
+ * presumably they mirror the "d64"/"f64" opcode-map attributes of the
+ * Intel SDM -- confirm in decode_op_size().
+ */
+ typedef enum X86OpSize {
+ X86_SIZE_None,
+
+ X86_SIZE_a, /* BOUND operand */
+ X86_SIZE_b, /* byte */
+ X86_SIZE_d, /* 32-bit */
+ X86_SIZE_dq, /* SSE/AVX 128-bit */
+ X86_SIZE_p, /* Far pointer */
+ X86_SIZE_pd, /* SSE/AVX packed double precision */
+ X86_SIZE_pi, /* MMX */
+ X86_SIZE_ps, /* SSE/AVX packed single precision */
+ X86_SIZE_q, /* 64-bit */
+ X86_SIZE_qq, /* AVX 256-bit */
+ X86_SIZE_s, /* Descriptor */
+ X86_SIZE_sd, /* SSE/AVX scalar double precision */
+ X86_SIZE_ss, /* SSE/AVX scalar single precision */
+ X86_SIZE_si, /* 32-bit GPR */
+ X86_SIZE_v, /* 16/32/64-bit, based on operand size */
+ X86_SIZE_w, /* 16-bit */
+ X86_SIZE_x, /* 128/256-bit, based on operand size */
+ X86_SIZE_y, /* 32/64-bit, based on operand size */
+ X86_SIZE_z, /* 16-bit for 16-bit operand size, else 32-bit */
+
+ /* Custom */
+ X86_SIZE_d64,
+ X86_SIZE_f64,
+ } X86OpSize;
+
+ /*
+ * CPUID feature that a decode table entry requires. The translator
+ * rejects the instruction as illegal when has_cpuid_feature() reports the
+ * feature as missing; X86_FEAT_None means no requirement. Every value
+ * added here needs a matching case in has_cpuid_feature().
+ */
+ typedef enum X86CPUIDFeature {
+ X86_FEAT_None,
+ X86_FEAT_3DNOW,
+ X86_FEAT_ADX,
+ X86_FEAT_AES,
+ X86_FEAT_AVX,
+ X86_FEAT_AVX2,
+ X86_FEAT_BMI1,
+ X86_FEAT_BMI2,
+ X86_FEAT_MOVBE,
+ X86_FEAT_PCLMULQDQ,
+ X86_FEAT_SSE,
+ X86_FEAT_SSE2,
+ X86_FEAT_SSE3,
+ X86_FEAT_SSSE3,
+ X86_FEAT_SSE41,
+ X86_FEAT_SSE42,
+ X86_FEAT_SSE4A,
+ } X86CPUIDFeature;
+
+ /* Execution flags */
+
+ /*
+ * Execution unit of a decoded operand. It selects how gen_load() fetches
+ * a source operand and how gen_writeback() stores operand 0.
+ */
+ typedef enum X86OpUnit {
+ X86_OP_SKIP, /* not valid or managed by emission function */
+ X86_OP_SEG, /* segment selector */
+ X86_OP_CR, /* control register */
+ X86_OP_DR, /* debug register */
+ X86_OP_INT, /* loaded into/stored from s->T0/T1 */
+ X86_OP_IMM, /* immediate */
+ X86_OP_SSE, /* address in either s->ptrX or s->A0 depending on has_ea */
+ X86_OP_MMX, /* address in either s->ptrX or s->A0 depending on has_ea */
+ } X86OpUnit;
+
+ /*
+ * Per-entry special handling that the translator applies after decoding
+ * and before VEX validation. An entry carries at most one of these.
+ */
+ typedef enum X86InsnSpecial {
+ X86_SPECIAL_None,
+
+ /* Always locked if it has a memory operand (XCHG) */
+ X86_SPECIAL_Locked,
+
+ /* Fault outside protected mode */
+ X86_SPECIAL_ProtMode,
+
+ /*
+ * Register operand 0/2 is zero extended to 32 bits. Rd/Mb or Rd/Mw
+ * in the manual.
+ */
+ X86_SPECIAL_ZExtOp0,
+ X86_SPECIAL_ZExtOp2,
+
+ /*
+ * Register operand 2 is extended to full width, while a memory operand
+ * is doubled in size if VEX.L=1.
+ */
+ X86_SPECIAL_AVXExtMov,
+
+ /*
+ * MMX instruction exists with no prefix; if there is no prefix, V/H/W/U operands
+ * become P/P/Q/N, and size "x" becomes "q".
+ */
+ X86_SPECIAL_MMX,
+
+ /* Illegal or exclusive to 64-bit mode */
+ X86_SPECIAL_i64,
+ X86_SPECIAL_o64,
+ } X86InsnSpecial;
+
+/*
+ * Special cases for instructions that operate on XMM/YMM registers. Intel
+ * retconned all of them to have VEX exception classes other than 0 and 13, so
+ * all these only matter for instructions that have a VEX exception class.
+ * Based on tables in the "AVX and SSE Instruction Exception Specification"
+ * section of the manual.
+ */
+typedef enum X86VEXSpecial {
+ /* Legacy SSE instructions that allow unaligned operands */
+ X86_VEX_SSEUnaligned,
+
+ /*
+ * Used for instructions that distinguish the XMM operand type with an
+ * instruction prefix; legacy SSE encodings will allow unaligned operands
+ * for scalar operands only (identified by a REP prefix). In this case,
+ * the decoding table uses "x" for the vector operands instead of specifying
+ * pd/ps/sd/ss individually.
+ */
+ X86_VEX_REPScalar,
+
+ /*
+ * VEX instructions that only support 256-bit operands with AVX2 (Table 2-17
+ * column 3). Columns 2 and 4 (instructions limited to 256- and 127-bit
+ * operands respectively) are implicit in the presence of dq and qq
+ * operands, and thus handled by decode_op_size.
+ */
+ X86_VEX_AVX2_256,
+} X86VEXSpecial;
+
+
+ /* Forward declarations, needed by the function pointer types below. */
+ typedef struct X86OpEntry X86OpEntry;
+ typedef struct X86DecodedInsn X86DecodedInsn;
+
+ /*
+ * Decode function for multibyte opcodes. It fills in *entry and receives
+ * a pointer to the current opcode byte in b.
+ */
+ typedef void (*X86DecodeFunc)(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b);
+
+ /* Code generation function. */
+ typedef void (*X86GenFunc)(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode);
+
+ /*
+ * One decode table entry: the callback to run plus the operand types and
+ * sizes and the validation attributes of the instruction.
+ */
+ struct X86OpEntry {
+ /*
+ * Based on the is_decode flags: while is_decode is set the entry is an
+ * intermediate one and "decode" is called to refine it; once it is
+ * clear, "gen" is the code generation function (NULL if none).
+ */
+ union {
+ X86GenFunc gen;
+ X86DecodeFunc decode;
+ };
+ /* op0 is always written, op1 and op2 are always read. */
+ X86OpType op0:8;
+ X86OpSize s0:8;
+ X86OpType op1:8;
+ X86OpSize s1:8;
+ X86OpType op2:8;
+ X86OpSize s2:8;
+ /* Must be I and b respectively if present. */
+ X86OpType op3:8;
+ X86OpSize s3:8;
+
+ X86InsnSpecial special:8;
+ X86CPUIDFeature cpuid:8;
+ /* Exception class checked by validate_vex(); 0 rejects the VEX encoding. */
+ unsigned vex_class:8;
+ X86VEXSpecial vex_special:8;
+ /* Bit mask indexed by the 66/F3/F2 prefix bits; 0 accepts any prefix. */
+ uint16_t valid_prefix:16;
+ bool is_decode:1;
+ };
+
+ /* Decoded form of a single operand. */
+ typedef struct X86DecodedOp {
+ int8_t n; /* register number; -1 for the string operands */
+ MemOp ot; /* For b/c/d/p/s/q/v/w/y/z */
+ X86OpUnit unit;
+ bool has_ea; /* operand is in memory, at the address in s->A0 */
+ int offset; /* For MMX and SSE */
+
+ /*
+ * This field is used internally by macros OP_PTR0/OP_PTR1/OP_PTR2,
+ * do not access directly!
+ */
+ TCGv_ptr v_ptr;
+ } X86DecodedOp;
+
+ /*
+ * Complete decoded instruction: the table entry, the operands 0-2, the
+ * immediate, the memory operand address parts and the opcode byte.
+ */
+ struct X86DecodedInsn {
+ X86OpEntry e;
+ X86DecodedOp op[3];
+ target_ulong immediate;
+ AddressParts mem;
+
+ uint8_t b;
+ };
+
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
new file mode 100644
index 0000000..27eca59
--- /dev/null
+++ b/target/i386/tcg/emit.c.inc
@@ -0,0 +1,2234 @@
+/*
+ * New-style TCG opcode generator for i386 instructions
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+ /* Byte offset of vector register number "reg" within CPUX86State. */
+ #define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg])
+
+ /*
+ * Signatures of the helper generators used below. The letters after the
+ * second underscore name the arguments in order: e = env pointer,
+ * p = register pointer, i = TCGv_i32 value, t = TCGv value. The letter
+ * between the underscores names the leading value argument (i = TCGv_i32,
+ * l = TCGv_i64, 0 = none), presumably the helper's result.
+ */
+ typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
+ typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
+ typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
+ typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_ptr reg_c);
+ typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_ptr reg_c, TCGv_ptr reg_d);
+ typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_i32 val);
+ typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_ptr reg_c, TCGv_i32 val);
+ typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
+ typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
+ TCGv_i32 val);
+ typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv val);
+ typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale);
+
+ /*
+ * Build a 32-bit TCG constant from a byte. The uint8_t parameter truncates
+ * the caller's value to 8 bits, so the constant is always in 0..255.
+ */
+ static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
+ {
+ return tcg_constant_i32(val);
+ }
+
+ /* Generate exception EXCP07_PREX (used for the HF_TS_MASK checks). */
+ static void gen_NM_exception(DisasContext *s)
+ {
+ gen_exception(s, EXCP07_PREX);
+ }
+
+ /*
+ * Code generation function with the X86GenFunc signature that always
+ * generates an illegal opcode exception; env and decode are unused.
+ */
+ static void gen_illegal(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+ gen_illegal_opcode(s);
+ }
+
+ /*
+ * Compute the effective address described by *mem and apply the segment
+ * (mem->def_seg, or s->override if set) using address size s->aflag.
+ * is_vsib is forwarded to gen_lea_modrm_1(). The result is presumably
+ * left in s->A0, which the operand loads and stores read -- confirm in
+ * gen_lea_v_seg().
+ */
+ static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib)
+ {
+ TCGv ea = gen_lea_modrm_1(s, *mem, is_vsib);
+ gen_lea_v_seg(s, s->aflag, ea, mem->def_seg, s->override);
+ }
+
+ /*
+  * Offset, within an MMXReg, of element 0 when the register is viewed as
+  * elements of size ot. Only MO_8 to MO_64 are valid.
+  */
+ static inline int mmx_offset(MemOp ot)
+ {
+     if (ot == MO_8) {
+         return offsetof(MMXReg, MMX_B(0));
+     }
+     if (ot == MO_16) {
+         return offsetof(MMXReg, MMX_W(0));
+     }
+     if (ot == MO_32) {
+         return offsetof(MMXReg, MMX_L(0));
+     }
+     if (ot == MO_64) {
+         return offsetof(MMXReg, MMX_Q(0));
+     }
+     g_assert_not_reached();
+ }
+
+ /*
+  * Offset, within a ZMMReg, of element 0 when the register is viewed as
+  * elements of size ot. Only MO_8 to MO_256 are valid.
+  */
+ static inline int xmm_offset(MemOp ot)
+ {
+     if (ot == MO_8) {
+         return offsetof(ZMMReg, ZMM_B(0));
+     }
+     if (ot == MO_16) {
+         return offsetof(ZMMReg, ZMM_W(0));
+     }
+     if (ot == MO_32) {
+         return offsetof(ZMMReg, ZMM_L(0));
+     }
+     if (ot == MO_64) {
+         return offsetof(ZMMReg, ZMM_Q(0));
+     }
+     if (ot == MO_128) {
+         return offsetof(ZMMReg, ZMM_X(0));
+     }
+     if (ot == MO_256) {
+         return offsetof(ZMMReg, ZMM_Y(0));
+     }
+     g_assert_not_reached();
+ }
+
+ /*
+  * Offset of the start of the MMXReg/ZMMReg that holds a vector operand,
+  * obtained by undoing the element-0 adjustment contained in op->offset.
+  */
+ static int vector_reg_offset(X86DecodedOp *op)
+ {
+     int elem0_ofs;
+
+     assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
+
+     elem0_ofs = (op->unit == X86_OP_MMX) ? mmx_offset(op->ot)
+                                          : xmm_offset(op->ot);
+     return op->offset - elem0_ofs;
+ }
+
+ /*
+  * Offset of element n, of size ot, inside the vector register of operand
+  * op. For an MMX operand viewed as MO_64 there is a single element, so n
+  * is ignored. MO_128 and MO_256 are only valid for SSE operands.
+  */
+ static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
+ {
+     const int reg_ofs = vector_reg_offset(op);
+     const bool is_mmx = (op->unit == X86_OP_MMX);
+     int elem_ofs;
+
+     switch (ot) {
+     case MO_8:
+         elem_ofs = is_mmx ? offsetof(MMXReg, MMX_B(n))
+                           : offsetof(ZMMReg, ZMM_B(n));
+         break;
+     case MO_16:
+         elem_ofs = is_mmx ? offsetof(MMXReg, MMX_W(n))
+                           : offsetof(ZMMReg, ZMM_W(n));
+         break;
+     case MO_32:
+         elem_ofs = is_mmx ? offsetof(MMXReg, MMX_L(n))
+                           : offsetof(ZMMReg, ZMM_L(n));
+         break;
+     case MO_64:
+         elem_ofs = is_mmx ? 0 : offsetof(ZMMReg, ZMM_Q(n));
+         break;
+     case MO_128:
+         assert(op->unit == X86_OP_SSE);
+         elem_ofs = offsetof(ZMMReg, ZMM_X(n));
+         break;
+     case MO_256:
+         assert(op->unit == X86_OP_SSE);
+         elem_ofs = offsetof(ZMMReg, ZMM_Y(n));
+         break;
+     default:
+         g_assert_not_reached();
+     }
+     return reg_ofs + elem_ofs;
+ }
+
+ /*
+  * Set op->offset for an MMX operand: register operands use their slot in
+  * fpregs[], memory operands use the mmx_t0 scratch slot of CPUX86State.
+  */
+ static void compute_mmx_offset(X86DecodedOp *op)
+ {
+     int reg_base;
+
+     if (op->has_ea) {
+         reg_base = offsetof(CPUX86State, mmx_t0);
+     } else {
+         reg_base = offsetof(CPUX86State, fpregs[op->n].mmx);
+     }
+     op->offset = reg_base + mmx_offset(op->ot);
+ }
+
+ /*
+  * Set op->offset for an SSE operand: register operands use their slot in
+  * xmm_regs[], memory operands use the xmm_t0 scratch slot of CPUX86State.
+  */
+ static void compute_xmm_offset(X86DecodedOp *op)
+ {
+     int reg_base;
+
+     if (op->has_ea) {
+         reg_base = offsetof(CPUX86State, xmm_t0);
+     } else {
+         reg_base = ZMM_OFFSET(op->n);
+     }
+     op->offset = reg_base + xmm_offset(op->ot);
+ }
+
+ /*
+  * Load a memory operand of size ot from the address in s->A0 into the
+  * CPUX86State field at dest_ofs. Sizes up to 32 bits go through temp;
+  * larger sizes are copied by the gen_ld*_env_A0 helpers, and "aligned" is
+  * only forwarded for the 128- and 256-bit cases.
+  */
+ static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs, bool aligned)
+ {
+     switch (ot) {
+     case MO_8:
+     case MO_16:
+     case MO_32:
+         gen_op_ld_v(s, ot, temp, s->A0);
+         if (ot == MO_8) {
+             tcg_gen_st8_tl(temp, cpu_env, dest_ofs);
+         } else if (ot == MO_16) {
+             tcg_gen_st16_tl(temp, cpu_env, dest_ofs);
+         } else {
+             tcg_gen_st32_tl(temp, cpu_env, dest_ofs);
+         }
+         return;
+     case MO_64:
+         gen_ldq_env_A0(s, dest_ofs);
+         return;
+     case MO_128:
+         gen_ldo_env_A0(s, dest_ofs, aligned);
+         return;
+     case MO_256:
+         gen_ldy_env_A0(s, dest_ofs, aligned);
+         return;
+     default:
+         g_assert_not_reached();
+     }
+ }
+
+ /*
+  * Tell whether a memory operand of size ot must be aligned.
+  *
+  * Only operands of 128 bits or more can require alignment. Class 1 always
+  * requires it; classes 2 and 4 require it only for the legacy (non-VEX)
+  * encoding of entries not marked X86_VEX_SSEUnaligned. All other classes
+  * never require it.
+  */
+ static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
+ {
+     const unsigned vex_class = decode->e.vex_class;
+
+     if (vex_class == 2 || vex_class == 4) {
+         /* MOST legacy SSE instructions require aligned memory operands, but not all. */
+         if ((s->prefix & PREFIX_VEX) ||
+             decode->e.vex_special == X86_VEX_SSEUnaligned) {
+             return false;
+         }
+         return ot >= MO_128;
+     }
+     if (vex_class == 1) {
+         return ot >= MO_128;
+     }
+     return false;
+ }
+
+ /*
+  * Fetch source operand number opn according to its execution unit.
+  *
+  * Segment, control, debug, integer and immediate operands are loaded into
+  * v. For MMX/SSE operands the operand's offset is computed and, for a
+  * memory operand, the data is copied from s->A0 to that offset (v is only
+  * used as scratch). X86_OP_SKIP operands are left untouched.
+  */
+ static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
+ {
+     X86DecodedOp *src = &decode->op[opn];
+
+     switch (src->unit) {
+     case X86_OP_SKIP:
+         return;
+
+     case X86_OP_SEG:
+         tcg_gen_ld32u_tl(v, cpu_env,
+                          offsetof(CPUX86State, segs[src->n].selector));
+         return;
+
+     case X86_OP_CR:
+         tcg_gen_ld_tl(v, cpu_env, offsetof(CPUX86State, cr[src->n]));
+         return;
+
+     case X86_OP_DR:
+         tcg_gen_ld_tl(v, cpu_env, offsetof(CPUX86State, dr[src->n]));
+         return;
+
+     case X86_OP_INT:
+         if (!src->has_ea) {
+             gen_op_mov_v_reg(s, src->ot, v, src->n);
+         } else {
+             gen_op_ld_v(s, src->ot, v, s->A0);
+         }
+         return;
+
+     case X86_OP_IMM:
+         tcg_gen_movi_tl(v, decode->immediate);
+         return;
+
+     case X86_OP_MMX:
+     case X86_OP_SSE:
+         if (src->unit == X86_OP_MMX) {
+             compute_mmx_offset(src);
+         } else {
+             compute_xmm_offset(src);
+         }
+         /* Register operands are used in place; only memory needs a copy. */
+         if (src->has_ea) {
+             bool aligned = sse_needs_alignment(s, decode, src->ot);
+             gen_load_sse(s, v, src->ot, src->offset, aligned);
+         }
+         return;
+
+     default:
+         g_assert_not_reached();
+     }
+ }
+
+ /*
+  * Return a pointer temporary addressing the vector register of operand
+  * opn. The temporary is created on first use and cached in the operand;
+  * decode_temps_free() releases it.
+  */
+ static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
+ {
+     X86DecodedOp *operand = &decode->op[opn];
+
+     if (!operand->v_ptr) {
+         operand->v_ptr = tcg_temp_new_ptr();
+
+         /* The temporary points to the MMXReg or ZMMReg. */
+         tcg_gen_addi_ptr(operand->v_ptr, cpu_env, vector_reg_offset(operand));
+     }
+     return operand->v_ptr;
+ }
+
+ /*
+ * Shorthands for the register pointer of operand 0, 1 and 2. They expand
+ * to op_ptr() calls and rely on a variable named "decode" being in scope.
+ */
+ #define OP_PTR0 op_ptr(decode, 0)
+ #define OP_PTR1 op_ptr(decode, 1)
+ #define OP_PTR2 op_ptr(decode, 2)
+
+ /*
+  * Store the result v into operand number opn according to its execution
+  * unit. MMX and skipped operands need no work here. For SSE operands the
+  * only action is clearing the second 128-bit lane of the destination
+  * register when a VEX-encoded instruction wrote a 128-bit result.
+  * Control/debug registers and immediates are not valid destinations.
+  */
+ static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
+ {
+     X86DecodedOp *dst = &decode->op[opn];
+
+     switch (dst->unit) {
+     case X86_OP_SKIP:
+     case X86_OP_MMX:
+         return;
+
+     case X86_OP_SEG:
+         /* Note that gen_movl_seg_T0 takes care of interrupt shadow and TF. */
+         gen_movl_seg_T0(s, dst->n);
+         return;
+
+     case X86_OP_INT:
+         if (!dst->has_ea) {
+             gen_op_mov_reg_v(s, dst->ot, dst->n, v);
+         } else {
+             gen_op_st_v(s, dst->ot, v, s->A0);
+         }
+         return;
+
+     case X86_OP_SSE:
+         if (dst->ot == MO_128 && (s->prefix & PREFIX_VEX)) {
+             tcg_gen_gvec_dup_imm(MO_64,
+                                  offsetof(CPUX86State, xmm_regs[dst->n].ZMM_X(1)),
+                                  16, 16, 0);
+         }
+         return;
+
+     default:
+         g_assert_not_reached();
+     }
+ }
+
+ /*
+  * Vector length in bytes of the current instruction: 8 for the unprefixed
+  * (MMX) form of an X86_SPECIAL_MMX entry, otherwise 32 if VEX.L is set
+  * and 16 if not.
+  */
+ static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
+ {
+     const int sse_prefixes = PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ;
+     bool unprefixed = (s->prefix & sse_prefixes) == 0;
+
+     if (unprefixed && decode->e.special == X86_SPECIAL_MMX) {
+         return 8;
+     }
+     if (s->vex_l) {
+         return 32;
+     }
+     return 16;
+ }
+
+ /*
+  * Store the vector found at src_ofs in CPUX86State into operand 0.
+  *
+  * A register destination receives a full-length vector move. A memory
+  * destination is written at s->A0 with the operand's size, which must be
+  * 64, 128 or 256 bits; alignment is enforced as reported by
+  * sse_needs_alignment().
+  */
+ static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
+ {
+     X86DecodedOp *dst = &decode->op[0];
+     bool aligned;
+
+     if (!dst->has_ea) {
+         int len = vector_len(s, decode);
+
+         tcg_gen_gvec_mov(MO_64, dst->offset, src_ofs, len, len);
+         return;
+     }
+
+     aligned = sse_needs_alignment(s, decode, dst->ot);
+     if (dst->ot == MO_64) {
+         gen_stq_env_A0(s, src_ofs);
+     } else if (dst->ot == MO_128) {
+         gen_sto_env_A0(s, src_ofs, aligned);
+     } else if (dst->ot == MO_256) {
+         gen_sty_env_A0(s, src_ofs, aligned);
+     } else {
+         g_assert_not_reached();
+     }
+ }
+
+ /*
+ * Adapter that gives PAVGUSB the two-operand SSEFunc_0_epp signature used
+ * by the 3DNow! table: it calls the three-operand pavgb MMX helper with
+ * reg_a as both destination and first source.
+ */
+ static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b)
+ {
+ gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b);
+ }
+
+ /*
+ * 3DNow! helpers, indexed by the opcode byte that gen_3dnow() takes from
+ * decode->immediate. Unlisted slots are NULL and decode as illegal.
+ * FN_3DNOW_MOVE is a sentinel (never called) for operations implemented
+ * as a plain 64-bit move from operand 1 to operand 0.
+ */
+ #define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1)
+ static const SSEFunc_0_epp fns_3dnow[] = {
+ [0x0c] = gen_helper_pi2fw,
+ [0x0d] = gen_helper_pi2fd,
+ [0x1c] = gen_helper_pf2iw,
+ [0x1d] = gen_helper_pf2id,
+ [0x8a] = gen_helper_pfnacc,
+ [0x8e] = gen_helper_pfpnacc,
+ [0x90] = gen_helper_pfcmpge,
+ [0x94] = gen_helper_pfmin,
+ [0x96] = gen_helper_pfrcp,
+ [0x97] = gen_helper_pfrsqrt,
+ [0x9a] = gen_helper_pfsub,
+ [0x9e] = gen_helper_pfadd,
+ [0xa0] = gen_helper_pfcmpgt,
+ [0xa4] = gen_helper_pfmax,
+ [0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */
+ [0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */
+ [0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */
+ [0xaa] = gen_helper_pfsubr,
+ [0xae] = gen_helper_pfacc,
+ [0xb0] = gen_helper_pfcmpeq,
+ [0xb4] = gen_helper_pfmul,
+ [0xb7] = gen_helper_pmulhrw_mmx,
+ [0xbb] = gen_helper_pswapd,
+ [0xbf] = gen_helper_pavgusb,
+ };
+
+ /*
+  * Generate code for a 3DNow! instruction. The operation is selected by
+  * the low byte of decode->immediate through the fns_3dnow table.
+  *
+  * An unknown operation raises an illegal opcode exception; after that
+  * HF_TS_MASK raises #NM and HF_EM_MASK an illegal opcode exception, in
+  * this order. Otherwise MMX mode is entered and either the helper is
+  * called on operands 0 and 1, or a 64-bit move from operand 1 to operand 0
+  * is emitted for the FN_3DNOW_MOVE entries.
+  */
+ static void gen_3dnow(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+     uint8_t opc = decode->immediate;
+     SSEFunc_0_epp helper = NULL;
+
+     if (opc < ARRAY_SIZE(fns_3dnow)) {
+         helper = fns_3dnow[opc];
+     }
+
+     if (helper == NULL) {
+         gen_illegal_opcode(s);
+         return;
+     }
+     if (s->flags & HF_TS_MASK) {
+         gen_NM_exception(s);
+         return;
+     }
+     if (s->flags & HF_EM_MASK) {
+         gen_illegal_opcode(s);
+         return;
+     }
+
+     gen_helper_enter_mmx(cpu_env);
+     if (helper != FN_3DNOW_MOVE) {
+         helper(cpu_env, OP_PTR0, OP_PTR1);
+         return;
+     }
+     tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset);
+     tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset);
+ }
+
+/*
+ * 00 = v*ps Vps, Hps, Wpd
+ * 66 = v*pd Vpd, Hpd, Wps
+ * f3 = v*ss Vss, Hss, Wps
+ * f2 = v*sd Vsd, Hsd, Wps
+ */
+static inline void gen_unary_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
+ SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
+ SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
+{
+ if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
+ SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd;
+ if (!fn) {
+ gen_illegal_opcode(s);
+ return;
+ }
+ fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+ } else {
+ SSEFunc_0_epp ps, pd, fn;
+ ps = s->vex_l ? ps_ymm : ps_xmm;
+ pd = s->vex_l ? pd_ymm : pd_xmm;
+ fn = s->prefix & PREFIX_DATA ? pd : ps;
+ if (!fn) {
+ gen_illegal_opcode(s);
+ return;
+ }
+ fn(cpu_env, OP_PTR0, OP_PTR2);
+ }
+}
+ /*
+ * Define gen_<uname> as a gen_unary_fp_sse() wrapper that uses the
+ * gen_helper_<lname>{pd,ps}_{xmm,ymm} and gen_helper_<lname>{sd,ss} helpers.
+ */
+ #define UNARY_FP_SSE(uname, lname) \
+ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+ { \
+ gen_unary_fp_sse(s, env, decode, \
+ gen_helper_##lname##pd_xmm, \
+ gen_helper_##lname##ps_xmm, \
+ gen_helper_##lname##pd_ymm, \
+ gen_helper_##lname##ps_ymm, \
+ gen_helper_##lname##sd, \
+ gen_helper_##lname##ss); \
+ }
+ UNARY_FP_SSE(VSQRT, sqrt)
+
+/*
+ * 00 = v*ps Vps, Hps, Wpd
+ * 66 = v*pd Vpd, Hpd, Wps
+ * f3 = v*ss Vss, Hss, Wps
+ * f2 = v*sd Vsd, Hsd, Wps
+ */
+static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
+ SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
+ SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
+{
+ SSEFunc_0_eppp ps, pd, fn;
+ if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
+ fn = s->prefix & PREFIX_REPZ ? ss : sd;
+ } else {
+ ps = s->vex_l ? ps_ymm : ps_xmm;
+ pd = s->vex_l ? pd_ymm : pd_xmm;
+ fn = s->prefix & PREFIX_DATA ? pd : ps;
+ }
+ if (fn) {
+ fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+ } else {
+ gen_illegal_opcode(s);
+ }
+}
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_fp_sse() that passes the
+ * six helpers gen_helper_<lname>{pd,ps}_{xmm,ymm} and
+ * gen_helper_<lname>{sd,ss}.
+ */
+#define FP_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_fp_sse(s, env, decode, \
+ gen_helper_##lname##pd_xmm, \
+ gen_helper_##lname##ps_xmm, \
+ gen_helper_##lname##pd_ymm, \
+ gen_helper_##lname##ps_ymm, \
+ gen_helper_##lname##sd, \
+ gen_helper_##lname##ss); \
+}
+FP_SSE(VADD, add)
+FP_SSE(VMUL, mul)
+FP_SSE(VSUB, sub)
+FP_SSE(VMIN, min)
+FP_SSE(VDIV, div)
+FP_SSE(VMAX, max)
+
+/*
+ * Define gen_<uname>() on top of gen_fp_sse() using the integer unpack
+ * helpers.  The scalar helpers are NULL, so gen_fp_sse() reports an
+ * illegal opcode when an F3/F2 prefix is present.
+ */
+#define FP_UNPACK_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ /* PS maps to the DQ integer instruction, PD maps to QDQ. */ \
+ gen_fp_sse(s, env, decode, \
+ gen_helper_##lname##qdq_xmm, \
+ gen_helper_##lname##dq_xmm, \
+ gen_helper_##lname##qdq_ymm, \
+ gen_helper_##lname##dq_ymm, \
+ NULL, NULL); \
+}
+FP_UNPACK_SSE(VUNPCKLPx, punpckl)
+FP_UNPACK_SSE(VUNPCKHPx, punpckh)
+
+/*
+ * Single-precision-only unary operation:
+ * 00 = v*ps Vps, Wps
+ * f3 = v*ss Vss, Hss, Wss
+ *
+ * The 66 and F2 prefixes, as well as a NULL helper for the selected form,
+ * make the instruction an illegal opcode.
+ */
+static inline void gen_unary_fp32_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                      SSEFunc_0_epp ps_xmm,
+                                      SSEFunc_0_epp ps_ymm,
+                                      SSEFunc_0_eppp ss)
+{
+    SSEFunc_0_epp packed;
+
+    if (s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) {
+        gen_illegal_opcode(s);
+        return;
+    }
+
+    if (s->prefix & PREFIX_REPZ) {
+        /* Scalar form takes the extra op1 source. */
+        if (ss) {
+            ss(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+        } else {
+            gen_illegal_opcode(s);
+        }
+        return;
+    }
+
+    packed = s->vex_l ? ps_ymm : ps_xmm;
+    if (packed) {
+        packed(cpu_env, OP_PTR0, OP_PTR2);
+    } else {
+        gen_illegal_opcode(s);
+    }
+}
+/*
+ * Define gen_<uname>() as a wrapper around gen_unary_fp32_sse() that passes
+ * gen_helper_<lname>ps_{xmm,ymm} and gen_helper_<lname>ss.
+ */
+#define UNARY_FP32_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_unary_fp32_sse(s, env, decode, \
+ gen_helper_##lname##ps_xmm, \
+ gen_helper_##lname##ps_ymm, \
+ gen_helper_##lname##ss); \
+}
+UNARY_FP32_SSE(VRSQRT, rsqrt)
+UNARY_FP32_SSE(VRCP, rcp)
+
+/*
+ * Packed operation that only exists with a 66 or F2 prefix:
+ * 66 = v*pd Vpd, Hpd, Wpd
+ * f2 = v*ps Vps, Hps, Wps
+ *
+ * Only PREFIX_DATA is tested here; anything else selects the ps helper.
+ */
+static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
+                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
+{
+    SSEFunc_0_eppp helper;
+
+    if (s->prefix & PREFIX_DATA) {
+        helper = s->vex_l ? pd_ymm : pd_xmm;
+    } else {
+        helper = s->vex_l ? ps_ymm : ps_xmm;
+    }
+    helper(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+/*
+ * Define gen_<uname>() as a wrapper around gen_horizontal_fp_sse() that
+ * passes gen_helper_<lname>{pd,ps}_{xmm,ymm}.
+ */
+#define HORIZONTAL_FP_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_horizontal_fp_sse(s, env, decode, \
+ gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm, \
+ gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
+}
+HORIZONTAL_FP_SSE(VHADD, hadd)
+HORIZONTAL_FP_SSE(VHSUB, hsub)
+HORIZONTAL_FP_SSE(VADDSUB, addsub)
+
+/*
+ * Emit a helper call with three vector sources.  The third source is the
+ * vector register numbered op3, passed to the helper as a pointer into env.
+ * VEX.L selects between the xmm and ymm helper.
+ */
+static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                   int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
+{
+    TCGv_ptr op3_ptr = tcg_temp_new_ptr();
+
+    /* The format of the fourth input is Lx */
+    tcg_gen_addi_ptr(op3_ptr, cpu_env, ZMM_OFFSET(op3));
+    if (s->vex_l) {
+        ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, op3_ptr);
+    } else {
+        xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, op3_ptr);
+    }
+    tcg_temp_free_ptr(op3_ptr);
+}
+/*
+ * Define two generators on top of gen_ternary_sse():
+ *  - gen_<uvname>() takes the extra source register number from bits 7:4
+ *    of the immediate byte;
+ *  - gen_<uname>() always uses vector register 0 as the extra source.
+ */
+#define TERNARY_SSE(uname, uvname, lname) \
+static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4, \
+ gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
+} \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_ternary_sse(s, env, decode, 0, \
+ gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
+}
+TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
+TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
+TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
+
+/*
+ * Emit a two-source vector helper call that also takes the instruction's
+ * 8-bit immediate.  VEX.L selects between the xmm and ymm helper.
+ */
+static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                      SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
+{
+    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
+    SSEFunc_0_epppi helper = s->vex_l ? ymm : xmm;
+
+    helper(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
+}
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_binary_imm_sse() that passes
+ * gen_helper_<lname>_{xmm,ymm}.
+ */
+#define BINARY_IMM_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_binary_imm_sse(s, env, decode, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+
+BINARY_IMM_SSE(VBLENDPD, blendpd)
+BINARY_IMM_SSE(VBLENDPS, blendps)
+BINARY_IMM_SSE(VPBLENDW, pblendw)
+BINARY_IMM_SSE(VDDPS, dpps)
+/*
+ * dppd gets a NULL ymm helper, and gen_binary_imm_sse() calls it without a
+ * NULL check when VEX.L is set.
+ * NOTE(review): presumably the decoder rejects VEX.L=1 before gen_VDDPD is
+ * reached - confirm against the decode tables.
+ */
+#define gen_helper_dppd_ymm NULL
+BINARY_IMM_SSE(VDDPD, dppd)
+BINARY_IMM_SSE(VMPSADBW, mpsadbw)
+BINARY_IMM_SSE(PCLMULQDQ, pclmulqdq)
+
+
+/*
+ * Define gen_<uname>() as a call to the gvec expander func.  The variadic
+ * arguments come first (the element size in all uses below), followed by
+ * the env offsets of operand 0 (destination) and operand 2 (source), and
+ * the vector length twice.
+ */
+#define UNARY_INT_GVEC(uname, func, ...) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ int vec_len = vector_len(s, decode); \
+ \
+ func(__VA_ARGS__, decode->op[0].offset, \
+ decode->op[2].offset, vec_len, vec_len); \
+}
+UNARY_INT_GVEC(PABSB, tcg_gen_gvec_abs, MO_8)
+UNARY_INT_GVEC(PABSW, tcg_gen_gvec_abs, MO_16)
+UNARY_INT_GVEC(PABSD, tcg_gen_gvec_abs, MO_32)
+UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
+UNARY_INT_GVEC(VPBROADCASTB, tcg_gen_gvec_dup_mem, MO_8)
+UNARY_INT_GVEC(VPBROADCASTW, tcg_gen_gvec_dup_mem, MO_16)
+UNARY_INT_GVEC(VPBROADCASTD, tcg_gen_gvec_dup_mem, MO_32)
+UNARY_INT_GVEC(VPBROADCASTQ, tcg_gen_gvec_dup_mem, MO_64)
+
+
+/*
+ * Define gen_<uname>() as a call to the gvec expander func.  The variadic
+ * arguments come first (an optional comparison condition and the element
+ * size), followed by the env offsets of operands 0, 1 and 2 and the vector
+ * length twice.
+ */
+#define BINARY_INT_GVEC(uname, func, ...) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ int vec_len = vector_len(s, decode); \
+ \
+ func(__VA_ARGS__, \
+ decode->op[0].offset, decode->op[1].offset, \
+ decode->op[2].offset, vec_len, vec_len); \
+}
+
+BINARY_INT_GVEC(PADDB, tcg_gen_gvec_add, MO_8)
+BINARY_INT_GVEC(PADDW, tcg_gen_gvec_add, MO_16)
+BINARY_INT_GVEC(PADDD, tcg_gen_gvec_add, MO_32)
+BINARY_INT_GVEC(PADDQ, tcg_gen_gvec_add, MO_64)
+BINARY_INT_GVEC(PADDSB, tcg_gen_gvec_ssadd, MO_8)
+BINARY_INT_GVEC(PADDSW, tcg_gen_gvec_ssadd, MO_16)
+BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8)
+BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16)
+BINARY_INT_GVEC(PAND, tcg_gen_gvec_and, MO_64)
+BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
+BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
+BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
+BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
+BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
+BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
+BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
+BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
+BINARY_INT_GVEC(PMAXSB, tcg_gen_gvec_smax, MO_8)
+BINARY_INT_GVEC(PMAXSW, tcg_gen_gvec_smax, MO_16)
+BINARY_INT_GVEC(PMAXSD, tcg_gen_gvec_smax, MO_32)
+BINARY_INT_GVEC(PMAXUB, tcg_gen_gvec_umax, MO_8)
+BINARY_INT_GVEC(PMAXUW, tcg_gen_gvec_umax, MO_16)
+BINARY_INT_GVEC(PMAXUD, tcg_gen_gvec_umax, MO_32)
+BINARY_INT_GVEC(PMINSB, tcg_gen_gvec_smin, MO_8)
+BINARY_INT_GVEC(PMINSW, tcg_gen_gvec_smin, MO_16)
+BINARY_INT_GVEC(PMINSD, tcg_gen_gvec_smin, MO_32)
+BINARY_INT_GVEC(PMINUB, tcg_gen_gvec_umin, MO_8)
+BINARY_INT_GVEC(PMINUW, tcg_gen_gvec_umin, MO_16)
+BINARY_INT_GVEC(PMINUD, tcg_gen_gvec_umin, MO_32)
+BINARY_INT_GVEC(PMULLW, tcg_gen_gvec_mul, MO_16)
+BINARY_INT_GVEC(PMULLD, tcg_gen_gvec_mul, MO_32)
+BINARY_INT_GVEC(POR, tcg_gen_gvec_or, MO_64)
+BINARY_INT_GVEC(PSUBB, tcg_gen_gvec_sub, MO_8)
+BINARY_INT_GVEC(PSUBW, tcg_gen_gvec_sub, MO_16)
+BINARY_INT_GVEC(PSUBD, tcg_gen_gvec_sub, MO_32)
+BINARY_INT_GVEC(PSUBQ, tcg_gen_gvec_sub, MO_64)
+BINARY_INT_GVEC(PSUBSB, tcg_gen_gvec_sssub, MO_8)
+BINARY_INT_GVEC(PSUBSW, tcg_gen_gvec_sssub, MO_16)
+BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8)
+BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16)
+BINARY_INT_GVEC(PXOR, tcg_gen_gvec_xor, MO_64)
+
+
+/*
+ * 00 = p* Pq, Qq (if mmx not NULL; no VEX)
+ * 66 = vp* Vx, Hx, Wx
+ *
+ * Both forms share one encoding: V equals P when VEX.V is absent, and P/Q
+ * only differ from H/W in naming MM rather than XMM registers.
+ *
+ * The mmx helper must be non-NULL exactly when the instruction is flagged
+ * X86_SPECIAL_MMX.  Without a 66 prefix the mmx helper is called
+ * unconditionally, so a NULL mmx relies on the caller never reaching this
+ * function without the prefix.
+ */
+static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
+{
+    SSEFunc_0_eppp helper;
+
+    assert((mmx != NULL) == (decode->e.special == X86_SPECIAL_MMX));
+
+    if (s->prefix & PREFIX_DATA) {
+        helper = s->vex_l ? ymm : xmm;
+    } else {
+        if (mmx && (s->prefix & PREFIX_VEX)) {
+            /* VEX encoding is not applicable to MMX instructions. */
+            gen_illegal_opcode(s);
+            return;
+        }
+        helper = mmx;
+    }
+    helper(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_binary_int_sse() that passes
+ * gen_helper_<lname>_{mmx,xmm,ymm}, i.e. for instructions that also have an
+ * MMX form.
+ */
+#define BINARY_INT_MMX(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_binary_int_sse(s, env, decode, \
+ gen_helper_##lname##_mmx, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+BINARY_INT_MMX(PUNPCKLBW, punpcklbw)
+BINARY_INT_MMX(PUNPCKLWD, punpcklwd)
+BINARY_INT_MMX(PUNPCKLDQ, punpckldq)
+BINARY_INT_MMX(PACKSSWB, packsswb)
+BINARY_INT_MMX(PACKUSWB, packuswb)
+BINARY_INT_MMX(PUNPCKHBW, punpckhbw)
+BINARY_INT_MMX(PUNPCKHWD, punpckhwd)
+BINARY_INT_MMX(PUNPCKHDQ, punpckhdq)
+BINARY_INT_MMX(PACKSSDW, packssdw)
+
+BINARY_INT_MMX(PAVGB, pavgb)
+BINARY_INT_MMX(PAVGW, pavgw)
+BINARY_INT_MMX(PMADDWD, pmaddwd)
+BINARY_INT_MMX(PMULHUW, pmulhuw)
+BINARY_INT_MMX(PMULHW, pmulhw)
+BINARY_INT_MMX(PMULUDQ, pmuludq)
+BINARY_INT_MMX(PSADBW, psadbw)
+
+BINARY_INT_MMX(PSLLW_r, psllw)
+BINARY_INT_MMX(PSLLD_r, pslld)
+BINARY_INT_MMX(PSLLQ_r, psllq)
+BINARY_INT_MMX(PSRLW_r, psrlw)
+BINARY_INT_MMX(PSRLD_r, psrld)
+BINARY_INT_MMX(PSRLQ_r, psrlq)
+BINARY_INT_MMX(PSRAW_r, psraw)
+BINARY_INT_MMX(PSRAD_r, psrad)
+
+BINARY_INT_MMX(PHADDW, phaddw)
+BINARY_INT_MMX(PHADDSW, phaddsw)
+BINARY_INT_MMX(PHADDD, phaddd)
+BINARY_INT_MMX(PHSUBW, phsubw)
+BINARY_INT_MMX(PHSUBSW, phsubsw)
+BINARY_INT_MMX(PHSUBD, phsubd)
+BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
+BINARY_INT_MMX(PSHUFB, pshufb)
+BINARY_INT_MMX(PSIGNB, psignb)
+BINARY_INT_MMX(PSIGNW, psignw)
+BINARY_INT_MMX(PSIGND, psignd)
+BINARY_INT_MMX(PMULHRSW, pmulhrsw)
+
+/*
+ * Instructions with no MMX equivalent: define gen_<uname>() as a wrapper
+ * around gen_binary_int_sse() with a NULL mmx helper and
+ * gen_helper_<lname>_{xmm,ymm}.
+ */
+#define BINARY_INT_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_binary_int_sse(s, env, decode, \
+ NULL, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+
+/* Generators for the instructions with no MMX equivalent. */
+BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
+BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
+BINARY_INT_SSE(VPACKUSDW, packusdw)
+BINARY_INT_SSE(VPERMILPS, vpermilps)
+BINARY_INT_SSE(VPERMILPD, vpermilpd)
+BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
+BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)
+
+BINARY_INT_SSE(PMULDQ, pmuldq)
+
+BINARY_INT_SSE(VAESDEC, aesdec)
+BINARY_INT_SSE(VAESDECLAST, aesdeclast)
+BINARY_INT_SSE(VAESENC, aesenc)
+BINARY_INT_SSE(VAESENCLAST, aesenclast)
+
+/*
+ * Define gen_<uname>() for a vector test instruction: call
+ * gen_helper_<lname>_{xmm,ymm} (chosen by VEX.L) on operands 1 and 2, then
+ * switch the lazy flags state to CC_OP_EFLAGS.  No operand 0 is passed to
+ * the helper.
+ */
+#define UNARY_CMP_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ if (!s->vex_l) { \
+ gen_helper_##lname##_xmm(cpu_env, OP_PTR1, OP_PTR2); \
+ } else { \
+ gen_helper_##lname##_ymm(cpu_env, OP_PTR1, OP_PTR2); \
+ } \
+ set_cc_op(s, CC_OP_EFLAGS); \
+}
+UNARY_CMP_SSE(VPTEST, ptest)
+UNARY_CMP_SSE(VTESTPS, vtestps)
+UNARY_CMP_SSE(VTESTPD, vtestpd)
+
+/*
+ * Emit a one-source vector helper call on (op0, op2).  VEX.L selects
+ * between the xmm and ymm helper.
+ */
+static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                     SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
+{
+    SSEFunc_0_epp helper = s->vex_l ? ymm : xmm;
+
+    helper(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_unary_int_sse() that passes
+ * gen_helper_<lname>_{xmm,ymm}.
+ */
+#define UNARY_INT_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_unary_int_sse(s, env, decode, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+
+UNARY_INT_SSE(VPMOVSXBW, pmovsxbw)
+UNARY_INT_SSE(VPMOVSXBD, pmovsxbd)
+UNARY_INT_SSE(VPMOVSXBQ, pmovsxbq)
+UNARY_INT_SSE(VPMOVSXWD, pmovsxwd)
+UNARY_INT_SSE(VPMOVSXWQ, pmovsxwq)
+UNARY_INT_SSE(VPMOVSXDQ, pmovsxdq)
+
+UNARY_INT_SSE(VPMOVZXBW, pmovzxbw)
+UNARY_INT_SSE(VPMOVZXBD, pmovzxbd)
+UNARY_INT_SSE(VPMOVZXBQ, pmovzxbq)
+UNARY_INT_SSE(VPMOVZXWD, pmovzxwd)
+UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq)
+UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq)
+
+UNARY_INT_SSE(VMOVSLDUP, pmovsldup)
+UNARY_INT_SSE(VMOVSHDUP, pmovshdup)
+UNARY_INT_SSE(VMOVDDUP, pmovdldup)
+
+UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
+UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
+UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
+UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps)
+UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq)
+UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq)
+
+
+/*
+ * Emit a helper call on (op0, op1, imm8).  These helpers do not take the
+ * env pointer.  VEX.L selects between the xmm and ymm helper.
+ */
+static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
+{
+    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
+    SSEFunc_0_ppi helper = s->vex_l ? ymm : xmm;
+
+    helper(OP_PTR0, OP_PTR1, imm8);
+}
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_unary_imm_sse() that passes
+ * gen_helper_<lname>_{xmm,ymm}.
+ */
+#define UNARY_IMM_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_unary_imm_sse(s, env, decode, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+
+UNARY_IMM_SSE(PSHUFD, pshufd)
+UNARY_IMM_SSE(PSHUFHW, pshufhw)
+UNARY_IMM_SSE(PSHUFLW, pshuflw)
+/*
+ * vpermq gets a NULL xmm helper, and gen_unary_imm_sse() calls it without a
+ * NULL check when VEX.L is clear.
+ * NOTE(review): presumably the decoder rejects VEX.L=0 before gen_VPERMQ is
+ * reached - confirm against the decode tables.
+ */
+#define gen_helper_vpermq_xmm NULL
+UNARY_IMM_SSE(VPERMQ, vpermq)
+UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
+UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)
+
+/*
+ * Emit a helper call on (env, op0, op1, imm8).  VEX.L selects between the
+ * xmm and ymm helper.
+ */
+static inline void gen_unary_imm_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                        SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
+{
+    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
+    SSEFunc_0_eppi helper = s->vex_l ? ymm : xmm;
+
+    helper(cpu_env, OP_PTR0, OP_PTR1, imm8);
+}
+
+/*
+ * Define gen_<uname>() as a wrapper around gen_unary_imm_fp_sse() that
+ * passes gen_helper_<lname>_{xmm,ymm}.
+ */
+#define UNARY_IMM_FP_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_unary_imm_fp_sse(s, env, decode, \
+ gen_helper_##lname##_xmm, \
+ gen_helper_##lname##_ymm); \
+}
+
+UNARY_IMM_FP_SSE(VROUNDPS, roundps)
+UNARY_IMM_FP_SSE(VROUNDPD, roundpd)
+
+/*
+ * Emit a two-source vector helper call where VEX.W selects between the
+ * "d" and "q" helper and VEX.L between the xmm and ymm variant.
+ */
+static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
+                                SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
+{
+    SSEFunc_0_eppp helper;
+
+    if (s->vex_w) {
+        helper = s->vex_l ? q_ymm : q_xmm;
+    } else {
+        helper = s->vex_l ? d_ymm : d_xmm;
+    }
+    helper(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
+/*
+ * VEX.W affects whether to operate on 32- or 64-bit elements.
+ *
+ * Define gen_<uname>() as a wrapper around gen_vexw_avx() that passes
+ * gen_helper_<lname>{d,q}_{xmm,ymm}.
+ */
+#define VEXW_AVX(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_vexw_avx(s, env, decode, \
+ gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
+ gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
+}
+VEXW_AVX(VPSLLV, vpsllv)
+VEXW_AVX(VPSRLV, vpsrlv)
+VEXW_AVX(VPSRAV, vpsrav)
+VEXW_AVX(VPMASKMOV, vpmaskmov)
+
+/*
+ * VEX.W/VEX.L-selected helper call for instructions with a vector-indexed
+ * memory operand.  Helper selection works as in gen_vexw_avx(), but the
+ * helper receives the memory operand split into its parts.
+ */
+static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
+                                SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
+{
+    SSEFunc_0_epppti helper;
+    TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
+    TCGv_ptr index_ptr = tcg_temp_new_ptr();
+
+    if (s->vex_w) {
+        helper = s->vex_l ? q_ymm : q_xmm;
+    } else {
+        helper = s->vex_l ? d_ymm : d_xmm;
+    }
+
+    /* Pass third input as (index, base, scale) */
+    tcg_gen_addi_ptr(index_ptr, cpu_env, ZMM_OFFSET(decode->mem.index));
+    helper(cpu_env, OP_PTR0, OP_PTR1, index_ptr, s->A0, scale);
+
+    /*
+     * There are two output operands, so zero OP1's high 128 bits
+     * in the VEX.128 case.
+     */
+    if (!s->vex_l) {
+        tcg_gen_gvec_dup_imm(MO_64,
+                             vector_elem_offset(&decode->op[1], MO_128, 1),
+                             16, 16, 0);
+    }
+    tcg_temp_free_ptr(index_ptr);
+}
+/*
+ * Define gen_<uname>() as a wrapper around gen_vsib_avx() that passes
+ * gen_helper_<lname>{d,q}_{xmm,ymm}.
+ */
+#define VSIB_AVX(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_vsib_avx(s, env, decode, \
+ gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
+ gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
+}
+VSIB_AVX(VPGATHERD, vpgatherd)
+VSIB_AVX(VPGATHERQ, vpgatherq)
+
+/*
+ * Common code for ADCX and ADOX: T0 = T0 + T1 + carry-in.
+ *
+ * @ot: operand size.
+ * @cc_op: CC_OP_ADCX or CC_OP_ADOX, i.e. which flag acts as the carry.
+ *
+ * The carry-out is kept in cpu_cc_dst for ADCX and in cpu_cc_src2 for ADOX.
+ * If the current lazy-flags state already holds that carry it is reused
+ * directly; otherwise the carry-in is extracted from the EFLAGS image in
+ * cpu_cc_src.
+ */
+static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
+{
+    TCGv carry_in = NULL;
+    TCGv carry_out = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
+    TCGv zero;
+
+    if (cc_op == s->cc_op || s->cc_op == CC_OP_ADCOX) {
+        /* Re-use the carry-out from a previous round. */
+        carry_in = carry_out;
+        cc_op = s->cc_op;
+    } else if (s->cc_op == CC_OP_ADCX || s->cc_op == CC_OP_ADOX) {
+        /* Merge with the carry-out from the opposite instruction. */
+        cc_op = CC_OP_ADCOX;
+    }
+
+    /* If we don't have a carry-in, get it out of EFLAGS. */
+    if (!carry_in) {
+        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+            gen_compute_eflags(s);
+        }
+        carry_in = s->tmp0;
+        tcg_gen_extract_tl(carry_in, cpu_cc_src,
+                           ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
+    }
+
+    switch (ot) {
+#ifdef TARGET_X86_64
+    case MO_32:
+        /*
+         * If TL is 64-bit just do everything in 64-bit arithmetic.
+         * Both inputs must be zero-extended first: the carry-out is taken
+         * from bit 32 and up of the sum, so any stale upper bits in the
+         * operands would end up in it.
+         */
+        tcg_gen_ext32u_tl(s->T0, s->T0);
+        tcg_gen_ext32u_tl(s->T1, s->T1);
+        tcg_gen_add_i64(s->T0, s->T0, s->T1);
+        tcg_gen_add_i64(s->T0, s->T0, carry_in);
+        tcg_gen_shri_i64(carry_out, s->T0, 32);
+        break;
+#endif
+    default:
+        /* Two double-word additions accumulate the carry in carry_out. */
+        zero = tcg_constant_tl(0);
+        tcg_gen_add2_tl(s->T0, carry_out, s->T0, zero, carry_in, zero);
+        tcg_gen_add2_tl(s->T0, carry_out, s->T0, carry_out, s->T1, zero);
+        break;
+    }
+    set_cc_op(s, cc_op);
+}
+
+/* ADCX: add with the carry flag as carry-in/out, see gen_ADCOX(). */
+static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    gen_ADCOX(s, env, ot, CC_OP_ADCX);
+}
+
+/* ADOX: add with the overflow flag as carry-in/out, see gen_ADCOX(). */
+static void gen_ADOX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    gen_ADCOX(s, env, ot, CC_OP_ADOX);
+}
+
+/* ANDN: T0 = T1 & ~T0, with flags set as for a logical operation. */
+static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    tcg_gen_andc_tl(s->T0, s->T1, s->T0);
+    gen_op_update1_cc(s);
+    set_cc_op(s, CC_OP_LOGICB + decode->op[0].ot);
+}
+
+/*
+ * BEXTR: extract a bit field from T0.  Bits 7:0 of T1 give the start
+ * position and bits 15:8 the length.  The result is left in T0 and the
+ * flags are set as for a logical operation.
+ */
+static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
+    TCGv zero = tcg_constant_tl(0);
+    TCGv mone = tcg_constant_tl(-1);
+
+    /*
+     * Extract START, and shift the operand.
+     * Shifts larger than operand size get zeros.
+     *
+     * For a 32-bit operand on a 64-bit target the source is zero-extended
+     * first, so that bits above the operand are not shifted into the result.
+     * NOTE(review): assumes T0 may carry such bits when it was loaded from
+     * a register - confirm against the operand load code.
+     */
+    tcg_gen_ext8u_tl(s->A0, s->T1);
+    if (TARGET_LONG_BITS == 64 && ot == MO_32) {
+        tcg_gen_ext32u_tl(s->T0, s->T0);
+    }
+    tcg_gen_shr_tl(s->T0, s->T0, s->A0);
+    tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero);
+
+    /*
+     * Extract the LEN into an inverse mask (-1 << LEN).  Lengths larger
+     * than bound give an all-zero inverse mask, i.e. nothing is cleared, so
+     * a full-width extract keeps the top bit; length 0 clears everything.
+     */
+    tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
+    tcg_gen_shl_tl(s->T1, mone, s->A0);
+    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, bound, s->T1, zero);
+    tcg_gen_andc_tl(s->T0, s->T0, s->T1);
+
+    gen_op_update1_cc(s);
+    set_cc_op(s, CC_OP_LOGICB + ot);
+}
+
+/*
+ * BLSI: T0 = T0 & -T0 (isolate the lowest set bit).
+ *
+ * Flags use CC_OP_BMILG*, with the result in cpu_cc_dst and the carry input
+ * in cpu_cc_src.  BLSI sets C when the source is non-zero, so cpu_cc_src
+ * receives (source == 0).
+ * NOTE(review): relies on the BMILG flag code computing C as
+ * (cpu_cc_src == 0), the same convention gen_BZHI's "store the inverse
+ * into C" depends on - confirm against the cc helpers.
+ */
+static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Compare only the operand-sized part of the source against zero. */
+    if (TARGET_LONG_BITS == 64 && ot == MO_32) {
+        tcg_gen_ext32u_tl(s->T0, s->T0);
+    }
+    tcg_gen_setcond_tl(TCG_COND_EQ, cpu_cc_src, s->T0, tcg_constant_tl(0));
+
+    tcg_gen_neg_tl(s->T1, s->T0);
+    tcg_gen_and_tl(s->T0, s->T0, s->T1);
+    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
+    set_cc_op(s, CC_OP_BMILGB + ot);
+}
+
+/*
+ * BLSMSK: T0 = T0 ^ (T0 - 1) (mask up to and including the lowest set bit).
+ *
+ * Flags use CC_OP_BMILG*, with the result in cpu_cc_dst and the original
+ * source in cpu_cc_src as the carry input.
+ * NOTE(review): relies on the BMILG flag code computing C as
+ * (operand-sized cpu_cc_src == 0), which gives C = 1 for a zero source -
+ * confirm against the cc helpers.
+ */
+static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Save the source before T0 is overwritten; C is derived from it. */
+    tcg_gen_mov_tl(cpu_cc_src, s->T0);
+    tcg_gen_subi_tl(s->T1, s->T0, 1);
+    tcg_gen_xor_tl(s->T0, s->T0, s->T1);
+    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
+    set_cc_op(s, CC_OP_BMILGB + ot);
+}
+
+/*
+ * BLSR: T0 = T0 & (T0 - 1) (clear the lowest set bit).
+ *
+ * Flags use CC_OP_BMILG*, with the result in cpu_cc_dst and the original
+ * source in cpu_cc_src as the carry input.
+ * NOTE(review): relies on the BMILG flag code computing C as
+ * (operand-sized cpu_cc_src == 0), which gives C = 1 for a zero source -
+ * confirm against the cc helpers.
+ */
+static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Save the source before T0 is overwritten; C is derived from it. */
+    tcg_gen_mov_tl(cpu_cc_src, s->T0);
+    tcg_gen_subi_tl(s->T1, s->T0, 1);
+    tcg_gen_and_tl(s->T0, s->T0, s->T1);
+    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
+    set_cc_op(s, CC_OP_BMILGB + ot);
+}
+
+/*
+ * BZHI: clear the bits of T0 from position N upwards, where N is the low
+ * byte of the VEX.vvvv register.  For N larger than bound (31 or 63) the
+ * source is passed through unchanged and C is set.
+ */
+static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
+    TCGv zero = tcg_constant_tl(0);
+    TCGv mone = tcg_constant_tl(-1);
+
+    tcg_gen_ext8u_tl(s->T1, cpu_regs[s->vex_v]);
+
+    /*
+     * Note that since we're using BMILG (in order to get O
+     * cleared) we need to store the inverse into C.
+     * C must be set only for N > bound, hence the unsigned <= here.
+     */
+    tcg_gen_setcond_tl(TCG_COND_LEU, cpu_cc_src, s->T1, bound);
+
+    /*
+     * Build the mask of bits to clear, -1 << N.  For N > bound nothing is
+     * cleared; the shift result is discarded in that case.
+     */
+    tcg_gen_shl_tl(s->A0, mone, s->T1);
+    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, bound, s->A0, zero);
+    tcg_gen_andc_tl(s->T0, s->T0, s->A0);
+
+    gen_op_update1_cc(s);
+    set_cc_op(s, CC_OP_BMILGB + ot);
+}
+
+/*
+ * CRC32: T0 = crc32 helper(low 32 bits of T0, T1, width), where width is
+ * 8 << (size of operand 2), i.e. the source width in bits.
+ */
+static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 src_bits = tcg_constant_i32(8 << decode->op[2].ot);
+
+    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
+    gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, src_bits);
+}
+
+/*
+ * CVTPI2PS / CVTPI2PD: enter MMX mode, then call the ps helper, or the pd
+ * helper when a 66 prefix is present.
+ */
+static void gen_CVTPI2Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_enter_mmx(cpu_env);
+    if (!(s->prefix & PREFIX_DATA)) {
+        gen_helper_cvtpi2ps(cpu_env, OP_PTR0, OP_PTR2);
+        return;
+    }
+    gen_helper_cvtpi2pd(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * CVTPS2PI / CVTPD2PI: enter MMX mode, then call the ps helper, or the pd
+ * helper when a 66 prefix is present.
+ */
+static void gen_CVTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_enter_mmx(cpu_env);
+    if (!(s->prefix & PREFIX_DATA)) {
+        gen_helper_cvtps2pi(cpu_env, OP_PTR0, OP_PTR2);
+        return;
+    }
+    gen_helper_cvtpd2pi(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * CVTTPS2PI / CVTTPD2PI: enter MMX mode, then call the ps helper, or the
+ * pd helper when a 66 prefix is present.
+ */
+static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_enter_mmx(cpu_env);
+    if (!(s->prefix & PREFIX_DATA)) {
+        gen_helper_cvttps2pi(cpu_env, OP_PTR0, OP_PTR2);
+        return;
+    }
+    gen_helper_cvttpd2pi(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/* EMMS: all the work is done by the emms helper. */
+static void gen_EMMS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_helper_emms(cpu_env);
+}
+
+/*
+ * EXTRQ with immediates: the field length is bits 5:0 of the immediate and
+ * the bit index is bits 13:8; both are passed to the helper with op0.
+ */
+static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int len_bits = decode->immediate & 63;
+    int idx_bits = (decode->immediate >> 8) & 63;
+    TCGv_i32 length = tcg_constant_i32(len_bits);
+    TCGv_i32 index = tcg_constant_i32(idx_bits);
+
+    gen_helper_extrq_i(cpu_env, OP_PTR0, index, length);
+}
+
+/* EXTRQ, register form: the helper receives pointers to op0 and op2. */
+static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_helper_extrq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * INSERTQ with immediates: the field length is bits 5:0 of the immediate
+ * and the bit index is bits 13:8; both are passed to the helper with op0
+ * and op1.
+ */
+static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int len_bits = decode->immediate & 63;
+    int idx_bits = (decode->immediate >> 8) & 63;
+    TCGv_i32 length = tcg_constant_i32(len_bits);
+    TCGv_i32 index = tcg_constant_i32(idx_bits);
+
+    gen_helper_insertq_i(cpu_env, OP_PTR0, OP_PTR1, index, length);
+}
+
+/* INSERTQ, register form: the helper receives pointers to op0 and op2. */
+static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_helper_insertq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * LDMXCSR: pass the low 32 bits of T1 to the ldmxcsr helper.  The
+ * instruction is an illegal opcode when VEX.L is set.
+ */
+static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (!s->vex_l) {
+        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T1);
+        gen_helper_ldmxcsr(cpu_env, s->tmp2_i32);
+    } else {
+        gen_illegal_opcode(s);
+    }
+}
+
+/*
+ * MASKMOVQ / MASKMOVDQU: the destination address is EDI, extended to the
+ * address size and offset by the DS segment (via gen_add_A0_ds_seg).  The
+ * helper receives op1, op2 and that address; a 66 prefix selects the xmm
+ * helper.
+ */
+static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]);
+    gen_extu(s->aflag, s->A0);
+    gen_add_A0_ds_seg(s);
+
+    if (!(s->prefix & PREFIX_DATA)) {
+        gen_helper_maskmov_mmx(cpu_env, OP_PTR1, OP_PTR2, s->A0);
+        return;
+    }
+    gen_helper_maskmov_xmm(cpu_env, OP_PTR1, OP_PTR2, s->A0);
+}
+
+/*
+ * MOVBE: big-endian load or store of T0 at address A0.  The memory access
+ * is done here because the M operand type does not load/store by itself;
+ * the direction depends on whether operand 0 is the memory operand.
+ */
+static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp be_op = decode->op[0].ot | MO_BE;
+
+    if (decode->e.op0 != X86_TYPE_M) {
+        tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, be_op);
+    } else {
+        tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, be_op);
+    }
+}
+
+/*
+ * MOVD/MOVQ from a vector register: load the low 32 or 64 bits found at
+ * operand 2's env offset into T0.  On 64-bit targets the 32-bit load is
+ * zero-extended; on 32-bit targets a target-long load is already 32 bits
+ * wide and MO_64 is not a valid size.
+ */
+static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+
+    switch (ot) {
+    case MO_32:
+#ifdef TARGET_X86_64
+        tcg_gen_ld32u_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    case MO_64:
+#endif
+        tcg_gen_ld_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    default:
+        /* Same unreachable-case handling as gen_MOVD_to(). */
+        g_assert_not_reached();
+    }
+}
+
+/*
+ * MOVD/MOVQ to a vector register: zero the whole destination vector, then
+ * store the low 32 or 64 bits of T1 into its lowest element.
+ */
+static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    int vec_len = vector_len(s, decode);
+    int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0);
+
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+
+    switch (ot) {
+#ifdef TARGET_X86_64
+    case MO_64:
+        tcg_gen_st_tl(s->T1, cpu_env, lo_ofs);
+        break;
+    case MO_32:
+        tcg_gen_st32_tl(s->T1, cpu_env, lo_ofs);
+        break;
+#else
+    case MO_32:
+        /* A target-long store is already 32 bits wide here. */
+        tcg_gen_st_tl(s->T1, cpu_env, lo_ofs);
+        break;
+#endif
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/* Vector move: hand operand 2's env offset to gen_store_sse(). */
+static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_store_sse(s, decode, decode->op[2].offset);
+}
+
+/*
+ * MOVMSKPS / MOVMSKPD: call the movmsk helper on operand 2 and
+ * zero-extend its 32-bit result into T0.  A 66 prefix selects the pd
+ * helper and VEX.L the ymm variant.
+ */
+static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    typeof(gen_helper_movmskps_ymm) *helper;
+
+    if (s->prefix & PREFIX_DATA) {
+        helper = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm;
+    } else {
+        helper = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
+    }
+    helper(s->tmp2_i32, cpu_env, OP_PTR2);
+    tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
+}
+
+/*
+ * MOVQ: copy the 64 bits at operand 2's env offset either to memory at A0
+ * (when operand 0 has an effective address) or to the low quadword of the
+ * destination vector, whose remaining bits are zeroed.
+ */
+static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = vector_len(s, decode);
+    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
+
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
+
+    if (decode->op[0].has_ea) {
+        tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+        return;
+    }
+
+    /*
+     * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_i64) would
+     * seem to work, but it does not on big-endian platforms; the cleared parts
+     * are always at higher addresses, but cross-endian emulation inverts the
+     * byte order so that the cleared parts need to be at *lower* addresses.
+     * Because oprsz is 8, we see this here even for SSE; but more in general,
+     * it disqualifies using oprsz < maxsz to emulate VEX128.
+     */
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs);
+}
+
+/*
+ * Variant of MOVQ that first enters MMX mode; otherwise it is the same as
+ * any other movq.
+ */
+static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_enter_mmx(cpu_env);
+    /* Plain call: ISO C does not allow "return expr;" in a void function. */
+    gen_MOVQ(s, env, decode);
+}
+
+/*
+ * MULX: unsigned widening multiply of T0 by T1 without touching the flags.
+ * The low part of the result goes to the VEX.vvvv register, the high part
+ * is left in T0 for the MODRM destination.
+ */
+static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    switch (decode->op[0].ot) {
+#ifdef TARGET_X86_64
+    case MO_64:
+        tcg_gen_mulu2_i64(cpu_regs[s->vex_v], s->T0, s->T0, s->T1);
+        break;
+#endif
+    default:
+        /* 32-bit case: multiply in i32 and zero-extend both halves. */
+        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
+        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
+        tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32,
+                          s->tmp2_i32, s->tmp3_i32);
+        tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32);
+        tcg_gen_extu_i32_tl(s->T0, s->tmp3_i32);
+        break;
+    }
+}
+
+/*
+ * PALIGNR: call the palignr helper with the 8-bit immediate.  Without a 66
+ * prefix the mmx helper is used; otherwise VEX.L selects xmm or ymm.
+ */
+static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
+
+    if (s->prefix & PREFIX_DATA) {
+        if (s->vex_l) {
+            gen_helper_palignr_ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
+        } else {
+            gen_helper_palignr_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
+        }
+    } else {
+        gen_helper_palignr_mmx(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
+    }
+}
+
+/* PANDN: op0 = op2 & ~op1, expanded inline with gvec. */
+static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = vector_len(s, decode);
+    int dest_ofs = decode->op[0].offset;
+    int inverted_ofs = decode->op[1].offset;
+    int plain_ofs = decode->op[2].offset;
+
+    /* Careful, operand order is reversed: andc complements its last input. */
+    tcg_gen_gvec_andc(MO_64, dest_ofs, plain_ofs, inverted_ofs,
+                      vec_len, vec_len);
+}
+
+/*
+ * PCMPESTRI: call the helper on op1 and op2 with the 8-bit immediate, then
+ * switch the lazy flags state to CC_OP_EFLAGS.
+ */
+static void gen_PCMPESTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 ctrl = tcg_constant8u_i32(decode->immediate);
+
+    gen_helper_pcmpestri_xmm(cpu_env, OP_PTR1, OP_PTR2, ctrl);
+    set_cc_op(s, CC_OP_EFLAGS);
+}
+
+/*
+ * PCMPESTRM: call the helper on op1 and op2 with the 8-bit immediate, then
+ * switch the lazy flags state to CC_OP_EFLAGS.  For the VEX.128 encoding
+ * the upper 128 bits of vector register 0 are zeroed afterwards.
+ */
+static void gen_PCMPESTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 ctrl = tcg_constant8u_i32(decode->immediate);
+
+    gen_helper_pcmpestrm_xmm(cpu_env, OP_PTR1, OP_PTR2, ctrl);
+    set_cc_op(s, CC_OP_EFLAGS);
+
+    if (!(s->prefix & PREFIX_VEX) || s->vex_l) {
+        return;
+    }
+    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
+                         16, 16, 0);
+}
+
+/*
+ * PCMPISTRI: call the helper on op1 and op2 with the 8-bit immediate, then
+ * switch the lazy flags state to CC_OP_EFLAGS.
+ */
+static void gen_PCMPISTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 ctrl = tcg_constant8u_i32(decode->immediate);
+
+    gen_helper_pcmpistri_xmm(cpu_env, OP_PTR1, OP_PTR2, ctrl);
+    set_cc_op(s, CC_OP_EFLAGS);
+}
+
+/*
+ * PCMPISTRM: call the helper on op1 and op2 with the 8-bit immediate, then
+ * switch the lazy flags state to CC_OP_EFLAGS.  For the VEX.128 encoding
+ * the upper 128 bits of vector register 0 are zeroed afterwards.
+ */
+static void gen_PCMPISTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 ctrl = tcg_constant8u_i32(decode->immediate);
+
+    gen_helper_pcmpistrm_xmm(cpu_env, OP_PTR1, OP_PTR2, ctrl);
+    set_cc_op(s, CC_OP_EFLAGS);
+
+    if (!(s->prefix & PREFIX_VEX) || s->vex_l) {
+        return;
+    }
+    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
+                         16, 16, 0);
+}
+
+/*
+ * PDEP: T0 = pdep helper(T0, T1).  For operand sizes below 64 bits T0 is
+ * zero-extended from 32 bits before the call.
+ */
+static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (decode->op[1].ot < MO_64) {
+        tcg_gen_ext32u_tl(s->T0, s->T0);
+    }
+    gen_helper_pdep(s->T0, s->T0, s->T1);
+}
+
+/*
+ * PEXT: T0 = pext helper(T0, T1).  For operand sizes below 64 bits T0 is
+ * zero-extended from 32 bits before the call.
+ */
+static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (decode->op[1].ot < MO_64) {
+        tcg_gen_ext32u_tl(s->T0, s->T0);
+    }
+    gen_helper_pext(s->T0, s->T0, s->T1);
+}
+
+/*
+ * Common code for PEXTRB/W/D/Q: load one element of size ot from vector
+ * operand 1 into T0, zero-extended.  The element index is the immediate
+ * masked to the number of elements in the vector.
+ */
+static inline void gen_pextr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
+{
+    int vec_len = vector_len(s, decode);
+    int last_elem = (vec_len >> ot) - 1;
+    int elem = decode->immediate & last_elem;
+
+    switch (ot) {
+#ifdef TARGET_X86_64
+    case MO_64:
+        tcg_gen_ld_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, elem));
+        break;
+    case MO_32:
+        tcg_gen_ld32u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, elem));
+        break;
+#else
+    case MO_32:
+        /* A target-long load is already 32 bits wide here. */
+        tcg_gen_ld_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, elem));
+        break;
+#endif
+    case MO_16:
+        tcg_gen_ld16u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, elem));
+        break;
+    case MO_8:
+        tcg_gen_ld8u_tl(s->T0, cpu_env, vector_elem_offset(&decode->op[1], ot, elem));
+        break;
+    default:
+        abort();
+    }
+}
+
+/* PEXTRB: extract one byte element, see gen_pextr(). */
+static void gen_PEXTRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_pextr(s, env, decode, MO_8);
+}
+
+/* PEXTRW: extract one 16-bit element, see gen_pextr(). */
+static void gen_PEXTRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_pextr(s, env, decode, MO_16);
+}
+
+/* PEXTRD/PEXTRQ: the element size is the size of operand 0. */
+static void gen_PEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_pextr(s, env, decode, decode->op[0].ot);
+}
+
+/*
+ * Common code for PINSRB/W/D/Q: store the low bits of T1 into one element
+ * of size ot of vector operand 0.  The element index is the immediate
+ * masked to the number of elements in the vector.  When operand 1 lives at
+ * a different offset than the destination it is first copied there with
+ * gen_store_sse(); that case is only expected for 16-byte vectors.
+ */
+static inline void gen_pinsr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
+{
+    int vec_len = vector_len(s, decode);
+    int last_elem = (vec_len >> ot) - 1;
+    int elem = decode->immediate & last_elem;
+
+    if (decode->op[1].offset != decode->op[0].offset) {
+        assert(vec_len == 16);
+        gen_store_sse(s, decode, decode->op[1].offset);
+    }
+
+    switch (ot) {
+#ifdef TARGET_X86_64
+    case MO_64:
+        tcg_gen_st_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, elem));
+        break;
+    case MO_32:
+        tcg_gen_st32_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, elem));
+        break;
+#else
+    case MO_32:
+        /* A target-long store is already 32 bits wide here. */
+        tcg_gen_st_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, elem));
+        break;
+#endif
+    case MO_16:
+        tcg_gen_st16_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, elem));
+        break;
+    case MO_8:
+        tcg_gen_st8_tl(s->T1, cpu_env, vector_elem_offset(&decode->op[0], ot, elem));
+        break;
+    default:
+        abort();
+    }
+}
+
+/* PINSRB: insert one byte element, see gen_pinsr(). */
+static void gen_PINSRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_pinsr(s, env, decode, MO_8);
+}
+
+/* PINSRW: insert one 16-bit element, see gen_pinsr(). */
+static void gen_PINSRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_pinsr(s, env, decode, MO_16);
+}
+
+/* PINSRD/PINSRQ: the element size is the size of operand 2. */
+static void gen_PINSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+
+    gen_pinsr(s, env, decode, ot);
+}
+
+/*
+ * 64-bit integer expansion for PMOVMSKB: gather the most significant bit
+ * of each of the eight bytes of s into the top byte of d.  The remaining
+ * bytes of d hold partial results.
+ */
+static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);
+
+    /*
+     * After each shift+or pair:
+     * 0: a.......b.......c.......d.......e.......f.......g.......h.......
+     * 7: ab......bc......cd......de......ef......fg......gh......h.......
+     * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
+     * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
+     * The result is left in the high bits of the word.
+     */
+    tcg_gen_shli_i64(t, d, 7);
+    tcg_gen_or_i64(d, d, t);
+    tcg_gen_shli_i64(t, d, 14);
+    tcg_gen_or_i64(d, d, t);
+    tcg_gen_shli_i64(t, d, 28);
+    tcg_gen_or_i64(d, d, t);
+
+    /* Release the scratch temporary, like the other temps in this file. */
+    tcg_temp_free_i64(t);
+}
+
+/*
+ * Host-vector expansion for PMOVMSKB: the same mask, shift and or sequence
+ * as gen_pmovmskb_i64(), applied to elements of size vece.
+ */
+static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
+{
+    TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);
+
+    /* Keep only the top bit of every byte, then fold as in the i64 case. */
+    tcg_gen_and_vec(vece, d, s, m);
+    tcg_gen_shli_vec(vece, t, d, 7);
+    tcg_gen_or_vec(vece, d, d, t);
+    tcg_gen_shli_vec(vece, t, d, 14);
+    tcg_gen_or_vec(vece, d, d, t);
+    tcg_gen_shli_vec(vece, t, d, 28);
+    tcg_gen_or_vec(vece, d, d, t);
+
+    /* Release the scratch temporary; m is a constant and is not freed. */
+    tcg_temp_free_vec(t);
+}
+
+#ifdef TARGET_X86_64
+#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i64
+#else
+#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i32
+#endif
+
+/*
+ * PMOVMSKB emitter.
+ *
+ * Step 1: a gvec expansion (gen_pmovmskb_i64 / gen_pmovmskb_vec) processes
+ * operand 2 in 64-bit elements and writes the per-element result into the
+ * xmm_t0 scratch register.
+ * Step 2: s->T0 is assembled from that scratch data, starting with the byte
+ * at index vec_len - 1 and then stepping down 8 bytes at a time; each
+ * iteration moves the previous contents of T0 up by 8 bits and places the
+ * next byte in the low 8 bits.
+ */
+static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
+ static const GVecGen2 g = {
+ .fni8 = gen_pmovmskb_i64,
+ .fniv = gen_pmovmskb_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64
+ };
+ MemOp ot = decode->op[2].ot;
+ int vec_len = vector_len(s, decode);
+ TCGv t = tcg_temp_new();
+
+ tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset,
+ vec_len, vec_len, &g);
+ tcg_gen_ld8u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
+ while (vec_len > 8) {
+ vec_len -= 8;
+ if (TCG_TARGET_HAS_extract2_tl) {
+ /*
+ * Load the next byte of the result into the high byte of T.
+ * TCG does a similar expansion of deposit to shl+extract2; by
+ * loading the whole word, the shift left is avoided.
+ */
+#ifdef TARGET_X86_64
+ tcg_gen_ld_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8)));
+#else
+ tcg_gen_ld_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4)));
+#endif
+
+ tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8);
+ } else {
+ /*
+ * The _previous_ value is deposited into bits 8 and higher of t. Because
+ * those bits are known to be zero after ld8u, this becomes a shift+or
+ * if deposit is not available.
+ */
+ tcg_gen_ld8u_tl(t, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
+ tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8);
+ }
+ }
+ tcg_temp_free(t);
+}
+
+/*
+ * PSHUFW emitter: calls the pshufw_mmx helper on OP_PTR0/OP_PTR1 with
+ * decode->immediate wrapped by tcg_constant8u_i32().
+ */
+static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
+ gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
+}
+
+/*
+ * PSRLW with an immediate count: logical right shift of 16-bit elements
+ * from operand 1 into operand 0. A count of 16 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 16) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shri(MO_16,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * PSLLW with an immediate count: left shift of 16-bit elements from
+ * operand 1 into operand 0. A count of 16 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 16) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shli(MO_16,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * PSRAW with an immediate count: arithmetic right shift of 16-bit elements
+ * from operand 1 into operand 0. Counts of 16 or more are clamped to 15;
+ * note that the clamped value is written back into decode->immediate.
+ */
+static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 16) {
+ decode->immediate = 15;
+ }
+ tcg_gen_gvec_sari(MO_16,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+}
+
+/*
+ * PSRLD with an immediate count: logical right shift of 32-bit elements
+ * from operand 1 into operand 0. A count of 32 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 32) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shri(MO_32,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * PSLLD with an immediate count: left shift of 32-bit elements from
+ * operand 1 into operand 0. A count of 32 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 32) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shli(MO_32,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * PSRAD with an immediate count: arithmetic right shift of 32-bit elements
+ * from operand 1 into operand 0. Counts of 32 or more are clamped to 31;
+ * note that the clamped value is written back into decode->immediate.
+ */
+static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 32) {
+ decode->immediate = 31;
+ }
+ tcg_gen_gvec_sari(MO_32,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+}
+
+/*
+ * PSRLQ with an immediate count: logical right shift of 64-bit elements
+ * from operand 1 into operand 0. A count of 64 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 64) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shri(MO_64,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * PSLLQ with an immediate count: left shift of 64-bit elements from
+ * operand 1 into operand 0. A count of 64 or more zeroes the whole
+ * destination instead of emitting a shift.
+ */
+static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ if (decode->immediate >= 64) {
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else {
+ tcg_gen_gvec_shli(MO_64,
+ decode->op[0].offset, decode->op[1].offset,
+ decode->immediate, vec_len, vec_len);
+ }
+}
+
+/*
+ * Materialise an 8-bit immediate as a vector operand in the xmm_t0 scratch
+ * register: vec_len bytes of xmm_t0 are zeroed, then @imm is stored as a
+ * 32-bit value in xmm_t0.ZMM_L(0).
+ *
+ * Returns a newly allocated TCGv_ptr holding the address of xmm_t0; the
+ * callers in this file release it with tcg_temp_free_ptr().
+ */
+static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len)
+{
+ MemOp ot = vec_len == 16 ? MO_128 : MO_256;
+ TCGv_i32 imm_v = tcg_constant8u_i32(imm);
+ TCGv_ptr ptr = tcg_temp_new_ptr();
+
+ tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
+ vec_len, vec_len, 0);
+
+ tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_t0));
+ tcg_gen_st_i32(imm_v, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
+ return ptr;
+}
+
+/*
+ * PSRLDQ with an immediate count: the immediate is placed in xmm_t0 by
+ * make_imm8u_xmm_vec() and passed as the count operand to the psrldq
+ * helper. The ymm helper is used when s->vex_l is set, the xmm one otherwise.
+ */
+static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+ TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
+
+ if (s->vex_l) {
+ gen_helper_psrldq_ymm(cpu_env, OP_PTR0, OP_PTR1, imm_vec);
+ } else {
+ gen_helper_psrldq_xmm(cpu_env, OP_PTR0, OP_PTR1, imm_vec);
+ }
+ tcg_temp_free_ptr(imm_vec);
+}
+
+/*
+ * PSLLDQ with an immediate count: the immediate is placed in xmm_t0 by
+ * make_imm8u_xmm_vec() and passed as the count operand to the pslldq
+ * helper. The ymm helper is used when s->vex_l is set, the xmm one otherwise.
+ */
+static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+ TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
+
+ if (s->vex_l) {
+ gen_helper_pslldq_ymm(cpu_env, OP_PTR0, OP_PTR1, imm_vec);
+ } else {
+ gen_helper_pslldq_xmm(cpu_env, OP_PTR0, OP_PTR1, imm_vec);
+ }
+ tcg_temp_free_ptr(imm_vec);
+}
+
+/*
+ * RORX emitter: rotate s->T0 right by the immediate, in place.
+ * For a 64-bit operand the count is masked to 6 bits. Otherwise the low
+ * 32 bits of T0 are rotated (count masked to 5 bits) and the result is
+ * zero-extended back into T0.
+ */
+static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ MemOp ot = decode->op[0].ot;
+ int b = decode->immediate;
+
+ if (ot == MO_64) {
+ tcg_gen_rotri_tl(s->T0, s->T0, b & 63);
+ } else {
+ tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
+ tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b & 31);
+ tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
+ }
+}
+
+/*
+ * SARX emitter: arithmetic right shift of s->T0 by the count in s->T1.
+ * The count is masked to 63 (64-bit operand) or 31 (otherwise); note that
+ * the masked count is written back to T1. For non-64-bit operands T0 is
+ * sign-extended from 32 bits before the shift.
+ */
+static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ MemOp ot = decode->op[0].ot;
+ int mask;
+
+ mask = ot == MO_64 ? 63 : 31;
+ tcg_gen_andi_tl(s->T1, s->T1, mask);
+ if (ot != MO_64) {
+ tcg_gen_ext32s_tl(s->T0, s->T0);
+ }
+ tcg_gen_sar_tl(s->T0, s->T0, s->T1);
+}
+
+/*
+ * SHLX emitter: left shift of s->T0 by the count in s->T1.
+ * The count is masked to 63 (64-bit operand) or 31 (otherwise); the masked
+ * count is written back to T1. No extension of T0 is done here.
+ */
+static void gen_SHLX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ MemOp ot = decode->op[0].ot;
+ int mask;
+
+ mask = ot == MO_64 ? 63 : 31;
+ tcg_gen_andi_tl(s->T1, s->T1, mask);
+ tcg_gen_shl_tl(s->T0, s->T0, s->T1);
+}
+
+/*
+ * SHRX emitter: logical right shift of s->T0 by the count in s->T1.
+ * The count is masked to 63 (64-bit operand) or 31 (otherwise); the masked
+ * count is written back to T1. For non-64-bit operands T0 is zero-extended
+ * from 32 bits before the shift.
+ */
+static void gen_SHRX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ MemOp ot = decode->op[0].ot;
+ int mask;
+
+ mask = ot == MO_64 ? 63 : 31;
+ tcg_gen_andi_tl(s->T1, s->T1, mask);
+ if (ot != MO_64) {
+ tcg_gen_ext32u_tl(s->T0, s->T0);
+ }
+ tcg_gen_shr_tl(s->T0, s->T0, s->T1);
+}
+
+/*
+ * AESKEYGENASSIST emitter: calls the xmm helper with the immediate.
+ * Asserts that s->vex_l is clear, i.e. the caller must never reach this
+ * emitter for a 256-bit encoding.
+ */
+static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
+ assert(!s->vex_l);
+ gen_helper_aeskeygenassist_xmm(cpu_env, OP_PTR0, OP_PTR1, imm);
+}
+
+/*
+ * STMXCSR emitter: generates an illegal-opcode exception when s->vex_l is
+ * set. Otherwise calls the update_mxcsr helper and then loads the 32-bit
+ * env->mxcsr field, zero-extended, into s->T0.
+ */
+static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ if (s->vex_l) {
+ gen_illegal_opcode(s);
+ return;
+ }
+ gen_helper_update_mxcsr(cpu_env);
+ tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, mxcsr));
+}
+
+/*
+ * AESIMC emitter: calls the xmm helper on OP_PTR0/OP_PTR2.
+ * Asserts that s->vex_l is clear.
+ */
+static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(!s->vex_l);
+ gen_helper_aesimc_xmm(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * 00 = v*ps Vps, Hps, Wpd
+ * 66 = v*pd Vpd, Hpd, Wps
+ * f3 = v*ss Vss, Hss, Wps
+ * f2 = v*sd Vsd, Hsd, Wps
+ */
+/*
+ * Table of compare helpers, used by gen_VCMP().
+ * Row: compare predicate (the masked immediate, 0..31).
+ * Column, as laid out by SSE_CMP():
+ * 0 = ps xmm, 1 = pd xmm, 2 = ss, 3 = sd, 4 = ps ymm, 5 = pd ymm.
+ */
+#define SSE_CMP(x) { \
+ gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
+ gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \
+ gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm}
+static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = {
+ SSE_CMP(cmpeq),
+ SSE_CMP(cmplt),
+ SSE_CMP(cmple),
+ SSE_CMP(cmpunord),
+ SSE_CMP(cmpneq),
+ SSE_CMP(cmpnlt),
+ SSE_CMP(cmpnle),
+ SSE_CMP(cmpord),
+
+ SSE_CMP(cmpequ),
+ SSE_CMP(cmpnge),
+ SSE_CMP(cmpngt),
+ SSE_CMP(cmpfalse),
+ SSE_CMP(cmpnequ),
+ SSE_CMP(cmpge),
+ SSE_CMP(cmpgt),
+ SSE_CMP(cmptrue),
+
+ SSE_CMP(cmpeqs),
+ SSE_CMP(cmpltq),
+ SSE_CMP(cmpleq),
+ SSE_CMP(cmpunords),
+ SSE_CMP(cmpneqq),
+ SSE_CMP(cmpnltq),
+ SSE_CMP(cmpnleq),
+ SSE_CMP(cmpords),
+
+ SSE_CMP(cmpequs),
+ SSE_CMP(cmpngeq),
+ SSE_CMP(cmpngtq),
+ SSE_CMP(cmpfalses),
+ SSE_CMP(cmpnequs),
+ SSE_CMP(cmpgeq),
+ SSE_CMP(cmpgtq),
+ SSE_CMP(cmptrues),
+};
+#undef SSE_CMP
+
+/*
+ * CMPxx emitter: picks a helper from gen_helper_cmp_funcs[][].
+ * The predicate index is the immediate masked to 5 bits when a VEX prefix
+ * is present and to 3 bits otherwise. The column is 2 with the REPZ prefix
+ * (ss), 3 with REPNZ (sd), and otherwise 0/1 for ps/pd (PREFIX_DATA)
+ * plus 4 when s->vex_l is set.
+ */
+static void gen_VCMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
+ int b =
+ s->prefix & PREFIX_REPZ ? 2 /* ss */ :
+ s->prefix & PREFIX_REPNZ ? 3 /* sd */ :
+ !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2);
+
+ gen_helper_cmp_funcs[index][b](cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
+/*
+ * COMISS/COMISD emitter: calls the comisd helper when PREFIX_DATA is set
+ * and the comiss helper otherwise, on OP_PTR1/OP_PTR2, then switches the
+ * translator's condition-code state to CC_OP_EFLAGS.
+ */
+static void gen_VCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ SSEFunc_0_epp fn;
+ fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
+ fn(cpu_env, OP_PTR1, OP_PTR2);
+ set_cc_op(s, CC_OP_EFLAGS);
+}
+
+/*
+ * Float-to-float conversion emitter: hands the six cvt* helpers (packed
+ * xmm, packed ymm and scalar variants) to gen_unary_fp_sse(), which makes
+ * the selection. NOTE(review): the meaning of each argument position is
+ * defined by gen_unary_fp_sse(), which is not visible here.
+ */
+static void gen_VCVTfp2fp(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_unary_fp_sse(s, env, decode,
+ gen_helper_cvtpd2ps_xmm, gen_helper_cvtps2pd_xmm,
+ gen_helper_cvtpd2ps_ymm, gen_helper_cvtps2pd_ymm,
+ gen_helper_cvtsd2ss, gen_helper_cvtss2sd);
+}
+
+/*
+ * CVTSI2SS/CVTSI2SD emitter.
+ * Operand 1 is first copied to operand 0, then a helper converts the
+ * integer in s->T1 into OP_PTR0. REPNZ selects the sd helper, otherwise
+ * the ss helper is used. On TARGET_X86_64 a 64-bit source (operand 2's ot)
+ * uses the cvtsq2* helpers directly on T1; other sizes are truncated to
+ * 32 bits first.
+ */
+static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+ TCGv_i32 in;
+
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+
+#ifdef TARGET_X86_64
+ MemOp ot = decode->op[2].ot;
+ if (ot == MO_64) {
+ if (s->prefix & PREFIX_REPNZ) {
+ gen_helper_cvtsq2sd(cpu_env, OP_PTR0, s->T1);
+ } else {
+ gen_helper_cvtsq2ss(cpu_env, OP_PTR0, s->T1);
+ }
+ return;
+ }
+ in = s->tmp2_i32;
+ tcg_gen_trunc_tl_i32(in, s->T1);
+#else
+ in = s->T1;
+#endif
+
+ if (s->prefix & PREFIX_REPNZ) {
+ gen_helper_cvtsi2sd(cpu_env, OP_PTR0, in);
+ } else {
+ gen_helper_cvtsi2ss(cpu_env, OP_PTR0, in);
+ }
+}
+
+/*
+ * Common body for the scalar float-to-integer conversions; the caller
+ * supplies the four helpers (ss/sd source, 32/64-bit result).
+ * REPNZ selects the sd helpers, otherwise the ss helpers are used.
+ * The result is left in s->T0. On TARGET_X86_64 a 64-bit destination uses
+ * the *2sq helpers, and a 32-bit result goes through s->tmp2_i32 and is
+ * zero-extended into T0.
+ */
+static inline void gen_VCVTtSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
+ SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
+{
+ TCGv_i32 out;
+
+#ifdef TARGET_X86_64
+ MemOp ot = decode->op[0].ot;
+ if (ot == MO_64) {
+ if (s->prefix & PREFIX_REPNZ) {
+ sd2sq(s->T0, cpu_env, OP_PTR2);
+ } else {
+ ss2sq(s->T0, cpu_env, OP_PTR2);
+ }
+ return;
+ }
+
+ out = s->tmp2_i32;
+#else
+ out = s->T0;
+#endif
+ if (s->prefix & PREFIX_REPNZ) {
+ sd2si(out, cpu_env, OP_PTR2);
+ } else {
+ ss2si(out, cpu_env, OP_PTR2);
+ }
+#ifdef TARGET_X86_64
+ tcg_gen_extu_i32_tl(s->T0, out);
+#endif
+}
+
+#ifndef TARGET_X86_64
+#define gen_helper_cvtss2sq NULL
+#define gen_helper_cvtsd2sq NULL
+#define gen_helper_cvttss2sq NULL
+#define gen_helper_cvttsd2sq NULL
+#endif
+
+/* CVTSS2SI/CVTSD2SI emitter: gen_VCVTtSx2SI() with the cvt* helpers. */
+static void gen_VCVTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_VCVTtSx2SI(s, env, decode,
+ gen_helper_cvtss2si, gen_helper_cvtss2sq,
+ gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
+}
+
+/* CVTTSS2SI/CVTTSD2SI emitter: gen_VCVTtSx2SI() with the cvtt* helpers. */
+static void gen_VCVTTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_VCVTtSx2SI(s, env, decode,
+ gen_helper_cvttss2si, gen_helper_cvttss2sq,
+ gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
+}
+
+/*
+ * VEXTRACTx128 emitter: bit 0 of the immediate selects which 128-bit lane
+ * of operand 1 is extracted. The lane is stored through gen_sto_env_A0()
+ * when operand 0 is a memory operand (has_ea), or copied with a 16-byte
+ * gvec move when it is a register.
+ */
+static void gen_VEXTRACTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int mask = decode->immediate & 1;
+ int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);
+ if (decode->op[0].has_ea) {
+ /* VEX-only instruction, no alignment requirements. */
+ gen_sto_env_A0(s, src_ofs, false);
+ } else {
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16);
+ }
+}
+
+/* EXTRACTPS emitter: delegates to gen_pextr() with a 32-bit (MO_32) element size. */
+static void gen_VEXTRACTPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_pextr(s, env, decode, MO_32);
+}
+
+/*
+ * Common tail of INSERTPS; the caller has already loaded the 32-bit source
+ * element into s->tmp2_i32.
+ *
+ * Immediate layout as used here: bits 5:4 select the destination element,
+ * bits 3:0 are a per-element zero mask. new_mask is the zero mask plus the
+ * destination element, i.e. the set of elements that do not keep the value
+ * of operand 1.
+ */
+static void gen_vinsertps(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int val = decode->immediate;
+ int dest_word = (val >> 4) & 3;
+ int new_mask = (val & 15) | (1 << dest_word);
+ int vec_len = 16;
+
+ assert(!s->vex_l);
+
+ if (new_mask == 15) {
+ /* All zeroes except possibly for the inserted element */
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ } else if (decode->op[1].offset != decode->op[0].offset) {
+ gen_store_sse(s, decode, decode->op[1].offset);
+ }
+
+ /* Insert only if the destination element is not itself zero-masked. */
+ if (new_mask != (val & 15)) {
+ tcg_gen_st_i32(s->tmp2_i32, cpu_env,
+ vector_elem_offset(&decode->op[0], MO_32, dest_word));
+ }
+
+ /* Apply the zero mask, unless the whole register was cleared above. */
+ if (new_mask != 15) {
+ TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */
+ int i;
+ for (i = 0; i < 4; i++) {
+ if ((val >> i) & 1) {
+ tcg_gen_st_i32(zero, cpu_env,
+ vector_elem_offset(&decode->op[0], MO_32, i));
+ }
+ }
+ }
+}
+
+/*
+ * INSERTPS, register source: bits 7:6 of the immediate select the 32-bit
+ * element of operand 2 that is loaded into s->tmp2_i32 before the common
+ * gen_vinsertps() tail runs.
+ */
+static void gen_VINSERTPS_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int val = decode->immediate;
+ tcg_gen_ld_i32(s->tmp2_i32, cpu_env,
+ vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
+ gen_vinsertps(s, env, decode);
+}
+
+/*
+ * INSERTPS, memory source: a 32-bit little-endian guest load from s->A0
+ * fills s->tmp2_i32 before the common gen_vinsertps() tail runs.
+ */
+static void gen_VINSERTPS_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
+ gen_vinsertps(s, env, decode);
+}
+
+/*
+ * VINSERTx128 emitter: bit 0 of the immediate selects the destination
+ * 128-bit lane, which receives lane 0 of operand 2. The other destination
+ * lane receives the same-numbered lane of operand 1.
+ */
+static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int mask = decode->immediate & 1;
+ tcg_gen_gvec_mov(MO_64,
+ decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)),
+ decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16);
+ tcg_gen_gvec_mov(MO_64,
+ decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)),
+ decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
+}
+
+/*
+ * Common body of the masked-store emitters: calls the @xmm helper when
+ * s->vex_l is clear and the @ymm helper otherwise, passing OP_PTR2,
+ * OP_PTR1 and the address in s->A0.
+ */
+static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
+{
+ if (!s->vex_l) {
+ xmm(cpu_env, OP_PTR2, OP_PTR1, s->A0);
+ } else {
+ ymm(cpu_env, OP_PTR2, OP_PTR1, s->A0);
+ }
+}
+
+/* VMASKMOVPD store form: gen_maskmov() with the vpmaskmovq store helpers. */
+static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_maskmov(s, env, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
+}
+
+/* VMASKMOVPS store form: gen_maskmov() with the vpmaskmovd store helpers. */
+static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
+}
+
+/*
+ * MOVHPS/MOVHPD load form: gen_ldq_env_A0() targets the high quadword of
+ * operand 0 (presumably a 64-bit guest load from s->A0; the helper is not
+ * visible here). The low quadword is copied from operand 1 unless both
+ * operands are the same register.
+ */
+static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+/*
+ * MOVHPS/MOVHPD store form: passes the high quadword of operand 2 to
+ * gen_stq_env_A0() (presumably a 64-bit guest store to s->A0; the helper
+ * is not visible here).
+ */
+static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+}
+
+/*
+ * Register form: destination high quadword = high quadword of operand 2,
+ * destination low quadword = low quadword of operand 1. Each copy is
+ * skipped when source and destination are the same register.
+ */
+static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ if (decode->op[0].offset != decode->op[2].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ }
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+/*
+ * MOVHLPS emitter: destination low quadword = high quadword of operand 2;
+ * destination high quadword = high quadword of operand 1 (copy skipped
+ * when operands 0 and 1 are the same register).
+ */
+static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ }
+}
+
+/*
+ * MOVLHPS emitter: destination high quadword = the 64 bits found at
+ * operand 2's offset; destination low quadword = low quadword of operand 1
+ * (copy skipped when operands 0 and 1 are the same register).
+ */
+static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+/*
+ * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOVHPx.
+ * Use a gvec move to move everything above the bottom 64 bits.
+ */
+
+/*
+ * Register form: operand 0 = operand 1 with its low quadword replaced by
+ * the low quadword of operand 2. The source quadword is read into
+ * s->tmp1_i64 before the gvec move writes operand 0, so the value is still
+ * intact if operands 0 and 2 are the same register.
+ */
+static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+}
+
+/*
+ * Load form: a 64-bit little-endian guest load from s->A0 is done first,
+ * then operand 1 is copied to operand 0 and the loaded value replaces the
+ * low quadword of operand 0.
+ */
+static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
+}
+
+/*
+ * Store form: the low quadword of operand 2 is written to guest memory at
+ * s->A0 as a 64-bit little-endian value.
+ */
+static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+}
+
+/*
+ * MOVSD load form: a 64-bit little-endian guest load from s->A0 becomes
+ * the low quadword of operand 0; quadword 1 of operand 0 is set to zero.
+ * Only quadwords 0 and 1 are written here.
+ */
+static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i64 zero = tcg_constant_i64(0);
+
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+ tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
+}
+
+/*
+ * MOVSS register form: operand 0 = operand 1 with its lowest 32-bit
+ * element replaced by the lowest element of operand 2. The source element
+ * is read before the gvec move writes operand 0.
+ */
+static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
+}
+
+/*
+ * MOVSS load form: a 32-bit little-endian guest load from s->A0 becomes
+ * the lowest element of operand 0; the rest of the vec_len-byte
+ * destination is zeroed.
+ */
+static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
+}
+
+/*
+ * MOVSS store form: the lowest 32-bit element of operand 2 is written to
+ * guest memory at s->A0 as a little-endian value.
+ */
+static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
+ tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
+}
+
+/*
+ * VPMASKMOV store form: s->vex_w selects between the VMASKMOVPD store
+ * emitter (set) and the VMASKMOVPS store emitter (clear).
+ */
+static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ if (s->vex_w) {
+ gen_VMASKMOVPD_st(s, env, decode);
+ } else {
+ gen_VMASKMOVPS_st(s, env, decode);
+ }
+}
+
+/*
+ * VPERMD emitter: calls the ymm helper on OP_PTR0/OP_PTR1/OP_PTR2.
+ * Asserts that s->vex_l is set, so only the 256-bit form may reach here.
+ */
+static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(s->vex_l);
+ gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
+/*
+ * VPERM2x128 emitter: calls the vpermdq ymm helper with the immediate.
+ * Asserts that s->vex_l is set.
+ */
+static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
+ assert(s->vex_l);
+ gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
+}
+
+/*
+ * PHMINPOSUW emitter: calls the xmm helper on OP_PTR0/OP_PTR2.
+ * Asserts that s->vex_l is clear.
+ */
+static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(!s->vex_l);
+ gen_helper_phminposuw_xmm(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+/*
+ * ROUNDSD emitter: calls the roundsd xmm helper with the immediate.
+ * Asserts that s->vex_l is clear.
+ */
+static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
+ assert(!s->vex_l);
+ gen_helper_roundsd_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
+}
+
+/*
+ * ROUNDSS emitter: calls the roundss xmm helper with the immediate.
+ * Asserts that s->vex_l is clear.
+ */
+static void gen_VROUNDSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
+ assert(!s->vex_l);
+ gen_helper_roundss_xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
+}
+
+/*
+ * SHUFPS/SHUFPD emitter: PREFIX_DATA selects the pd helper, otherwise the
+ * ps helper is used; s->vex_l selects the ymm or xmm variant of each.
+ * The immediate is passed through tcg_constant_i32().
+ */
+static void gen_VSHUF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i32 imm = tcg_constant_i32(decode->immediate);
+ SSEFunc_0_pppi ps, pd, fn;
+ ps = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm;
+ pd = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm;
+ fn = s->prefix & PREFIX_DATA ? pd : ps;
+ fn(OP_PTR0, OP_PTR1, OP_PTR2, imm);
+}
+
+/*
+ * UCOMISS/UCOMISD emitter: calls the ucomisd helper when PREFIX_DATA is
+ * set and the ucomiss helper otherwise, on OP_PTR1/OP_PTR2, then switches
+ * the translator's condition-code state to CC_OP_EFLAGS.
+ */
+static void gen_VUCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ SSEFunc_0_epp fn;
+ fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss;
+ fn(cpu_env, OP_PTR1, OP_PTR2);
+ set_cc_op(s, CC_OP_EFLAGS);
+}
+
+/*
+ * VZEROALL emitter: clears the whole vector register file with a single
+ * memset helper call covering CPU_NB_REGS * sizeof(ZMMReg) bytes, starting
+ * at env->xmm_regs.
+ */
+static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_ptr ptr = tcg_temp_new_ptr();
+
+ /*
+ * The base must be the register array itself. The length spans
+ * CPU_NB_REGS registers, so starting at the single xmm_t0 scratch slot
+ * would leave the guest registers untouched and run past that slot.
+ */
+ tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_regs));
+ gen_helper_memset(ptr, ptr, tcg_constant_i32(0),
+ tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
+ tcg_temp_free_ptr(ptr);
+}
+
+/*
+ * VZEROUPPER emitter: for each of the CPU_NB_REGS vector registers, zeroes
+ * the 16 bytes at ZMM_X(1) (the second 128-bit lane), leaving the first
+ * lane untouched.
+ */
+static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int i;
+
+ for (i = 0; i < CPU_NB_REGS; i++) {
+ int offset = offsetof(CPUX86State, xmm_regs[i].ZMM_X(1));
+ tcg_gen_gvec_dup_imm(MO_64, offset, 16, 16, 0);
+ }
+}
diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c
index ad58931..a6a90a1 100644
--- a/target/i386/tcg/fpu_helper.c
+++ b/target/i386/tcg/fpu_helper.c
@@ -2559,6 +2559,22 @@ static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
}
}
+/*
+ * XSAVE of the upper YMM halves: writes quadwords 2 and 3 of each vector
+ * register to guest memory starting at @ptr, 16 bytes per register.
+ * 16 registers are saved when HF_CS64_MASK is set in env->hflags, 8
+ * otherwise. @ra is passed through to the guest store functions.
+ */
+static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
+{
+ int i, nb_xmm_regs;
+
+ if (env->hflags & HF_CS64_MASK) {
+ nb_xmm_regs = 16;
+ } else {
+ nb_xmm_regs = 8;
+ }
+
+ for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
+ cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra);
+ cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra);
+ }
+}
+
static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
{
target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
@@ -2651,6 +2667,9 @@ static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
if (opt & XSTATE_SSE_MASK) {
do_xsave_sse(env, ptr, ra);
}
+ if (opt & XSTATE_YMM_MASK) {
+ do_xsave_ymmh(env, ptr + XO(avx_state), ra);
+ }
if (opt & XSTATE_BNDREGS_MASK) {
do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
}
@@ -2725,6 +2744,54 @@ static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
}
}
+/*
+ * Zeroes quadwords 0 and 1 (the low 128 bits) of each vector register,
+ * leaving the upper quadwords alone. 16 registers are cleared when
+ * HF_CS64_MASK is set in env->hflags, 8 otherwise.
+ */
+static void do_clear_sse(CPUX86State *env)
+{
+ int i, nb_xmm_regs;
+
+ if (env->hflags & HF_CS64_MASK) {
+ nb_xmm_regs = 16;
+ } else {
+ nb_xmm_regs = 8;
+ }
+
+ for (i = 0; i < nb_xmm_regs; i++) {
+ env->xmm_regs[i].ZMM_Q(0) = 0;
+ env->xmm_regs[i].ZMM_Q(1) = 0;
+ }
+}
+
+/*
+ * XRSTOR of the upper YMM halves: reads quadwords 2 and 3 of each vector
+ * register from guest memory starting at @ptr, 16 bytes per register.
+ * 16 registers are restored when HF_CS64_MASK is set in env->hflags, 8
+ * otherwise. @ra is passed through to the guest load functions.
+ */
+static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
+{
+ int i, nb_xmm_regs;
+
+ if (env->hflags & HF_CS64_MASK) {
+ nb_xmm_regs = 16;
+ } else {
+ nb_xmm_regs = 8;
+ }
+
+ for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
+ env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra);
+ env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra);
+ }
+}
+
+/*
+ * Zeroes quadwords 2 and 3 (the upper YMM half) of each vector register,
+ * leaving the low 128 bits alone. 16 registers are cleared when
+ * HF_CS64_MASK is set in env->hflags, 8 otherwise.
+ */
+static void do_clear_ymmh(CPUX86State *env)
+{
+ int i, nb_xmm_regs;
+
+ if (env->hflags & HF_CS64_MASK) {
+ nb_xmm_regs = 16;
+ } else {
+ nb_xmm_regs = 8;
+ }
+
+ for (i = 0; i < nb_xmm_regs; i++) {
+ env->xmm_regs[i].ZMM_Q(2) = 0;
+ env->xmm_regs[i].ZMM_Q(3) = 0;
+ }
+}
+
static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
{
target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
@@ -2831,9 +2898,14 @@ static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr
if (xstate_bv & XSTATE_SSE_MASK) {
do_xrstor_sse(env, ptr, ra);
} else {
- /* ??? When AVX is implemented, we may have to be more
- selective in the clearing. */
- memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
+ do_clear_sse(env);
+ }
+ }
+ if (rfbm & XSTATE_YMM_MASK) {
+ if (xstate_bv & XSTATE_YMM_MASK) {
+ do_xrstor_ymmh(env, ptr + XO(avx_state), ra);
+ } else {
+ do_clear_ymmh(env);
}
}
if (rfbm & XSTATE_BNDREGS_MASK) {
@@ -2955,6 +3027,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
env->xcr0 = mask;
cpu_sync_bndcs_hflags(env);
+ cpu_sync_avx_hflag(env);
return;
do_gpf:
@@ -3053,14 +3126,11 @@ void helper_emms(CPUX86State *env)
*(uint32_t *)(env->fptags + 4) = 0x01010101;
}
-/* XXX: suppress */
-void helper_movq(CPUX86State *env, void *d, void *s)
-{
- *(uint64_t *)d = *(uint64_t *)s;
-}
-
#define SHIFT 0
#include "ops_sse.h"
#define SHIFT 1
#include "ops_sse.h"
+
+#define SHIFT 2
+#include "ops_sse.h"
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 796dc2a..d51b5d7 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -22,150 +22,274 @@
#include "exec/exec-all.h"
#include "tcg/helper-tcg.h"
-#define PG_ERROR_OK (-1)
+/* Inputs of mmu_translate(). */
+typedef struct TranslateParams {
+ target_ulong addr; /* address to translate */
+ target_ulong cr3; /* the page-table walk starts from this value */
+ int pg_mode; /* PG_MODE_* flags */
+ int mmu_idx; /* compared with MMU_USER_IDX to detect user accesses */
+ int ptw_idx; /* MMU index used for the page-table entry accesses */
+ MMUAccessType access_type;
+} TranslateParams;
+
+/*
+ * Output of mmu_translate(), passed as its 'out' parameter.
+ * NOTE(review): presumably filled only when the translation succeeds;
+ * confirm against the end of mmu_translate().
+ */
+typedef struct TranslateResult {
+ hwaddr paddr;
+ int prot;
+ int page_size;
+} TranslateResult;
+
+/*
+ * Classification stored in TranslateFault.stage2.
+ * ptw_translate() reports S2_GPT when probing a page-table entry fails.
+ * NOTE(review): the uses of S2_NONE and S2_GPA are outside this section.
+ */
+typedef enum TranslateFaultStage2 {
+ S2_NONE,
+ S2_GPA,
+ S2_GPT,
+} TranslateFaultStage2;
+
+/* Description of a failed translation, filled through an 'err' pointer. */
+typedef struct TranslateFault {
+ int exception_index;
+ int error_code;
+ target_ulong cr2; /* ptw_translate() stores the faulting address here */
+ TranslateFaultStage2 stage2;
+} TranslateFault;
+
+/* State for accessing one page-table entry; see ptw_translate(). */
+typedef struct PTETranslate {
+ CPUX86State *env;
+ TranslateFault *err; /* where ptw_translate() records a failure */
+ int ptw_idx; /* MMU index used for the entry accesses */
+ void *haddr; /* host pointer from probe_access_full(); NULL selects the slow path */
+ hwaddr gaddr; /* address of the entry, as passed to ptw_translate() */
+} PTETranslate;
+
+/*
+ * Prepare access to the page-table entry at @addr.
+ * Records @addr in inout->gaddr and probes it for a data store with
+ * probe_access_full(), which fills inout->haddr.
+ *
+ * Returns true on success. If the probe reports TLB_INVALID_MASK, fills
+ * *inout->err (stage2 = S2_GPT, cr2 = @addr, error_code taken from
+ * env->error_code) and returns false; this is asserted to happen only
+ * when ptw_idx is MMU_NESTED_IDX.
+ */
+static bool ptw_translate(PTETranslate *inout, hwaddr addr)
+{
+ CPUTLBEntryFull *full;
+ int flags;
+
+ inout->gaddr = addr;
+ flags = probe_access_full(inout->env, addr, MMU_DATA_STORE,
+ inout->ptw_idx, true, &inout->haddr, &full, 0);
+
+ if (unlikely(flags & TLB_INVALID_MASK)) {
+ TranslateFault *err = inout->err;
+
+ assert(inout->ptw_idx == MMU_NESTED_IDX);
+ err->exception_index = 0; /* unused */
+ err->error_code = inout->env->error_code;
+ err->cr2 = addr;
+ err->stage2 = S2_GPT;
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Read a 32-bit page-table entry: directly through the host pointer when
+ * one is available, otherwise with a guest load at in->gaddr using
+ * in->ptw_idx.
+ */
+static inline uint32_t ptw_ldl(const PTETranslate *in)
+{
+ if (likely(in->haddr)) {
+ return ldl_p(in->haddr);
+ }
+ return cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0);
+}
+
+/*
+ * Read a 64-bit page-table entry: directly through the host pointer when
+ * one is available, otherwise with a guest load at in->gaddr using
+ * in->ptw_idx.
+ */
+static inline uint64_t ptw_ldq(const PTETranslate *in)
+{
+ if (likely(in->haddr)) {
+ return ldq_p(in->haddr);
+ }
+ return cpu_ldq_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0);
+}
-typedef hwaddr (*MMUTranslateFunc)(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
- int *prot);
+/*
+ * Note that we can use a 32-bit cmpxchg for all page table entries,
+ * even 64-bit ones, because PG_PRESENT_MASK, PG_ACCESSED_MASK and
+ * PG_DIRTY_MASK are all in the low 32 bits.
+ *
+ * ptw_setl_slow() is the compare-and-store used by ptw_setl() when no host
+ * pointer is available. The load, compare and store run between
+ * start_exclusive() and end_exclusive(). Returns true if the entry still
+ * held @old (and @new was stored), false otherwise.
+ */
+static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new)
+{
+ uint32_t cmp;
-#define GET_HPHYS(cs, gpa, access_type, prot) \
- (get_hphys_func ? get_hphys_func(cs, gpa, access_type, prot) : gpa)
+ /* Does x86 really perform a rmw cycle on mmio for ptw? */
+ start_exclusive();
+ cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0);
+ if (cmp == old) {
+ cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0);
+ }
+ end_exclusive();
+ return cmp == old;
+}
-static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_func,
- uint64_t cr3, int is_write1, int mmu_idx, int pg_mode,
- hwaddr *xlat, int *page_size, int *prot)
+/*
+ * Set the bits of @set in the low 32 bits of a page-table entry whose
+ * previously read value is @old. Nothing is written if all of those bits
+ * are already present in @old. With a host pointer the update is a 32-bit
+ * compare-and-swap on the little-endian entry; otherwise ptw_setl_slow()
+ * is used. Returns false when the entry no longer matches @old, in which
+ * case the callers in mmu_translate() re-read the entry and retry.
+ */
+static inline bool ptw_setl(const PTETranslate *in, uint32_t old, uint32_t set)
 {
- X86CPU *cpu = X86_CPU(cs);
- CPUX86State *env = &cpu->env;
- uint64_t ptep, pte;
- int32_t a20_mask;
- target_ulong pde_addr, pte_addr;
- int error_code = 0;
- int is_dirty, is_write, is_user;
- uint64_t rsvd_mask = PG_ADDRESS_MASK & ~MAKE_64BIT_MASK(0, cpu->phys_bits);
- uint32_t page_offset;
- uint32_t pkr;
+ if (set & ~old) {
+ uint32_t new = old | set;
+ if (likely(in->haddr)) {
+ old = cpu_to_le32(old);
+ new = cpu_to_le32(new);
+ return qatomic_cmpxchg((uint32_t *)in->haddr, old, new) == old;
+ }
+ return ptw_setl_slow(in, old, new);
+ }
+ return true;
+}
- is_user = (mmu_idx == MMU_USER_IDX);
- is_write = is_write1 & 1;
- a20_mask = x86_get_a20_mask(env);
+static bool mmu_translate(CPUX86State *env, const TranslateParams *in,
+ TranslateResult *out, TranslateFault *err)
+{
+ const int32_t a20_mask = x86_get_a20_mask(env);
+ const target_ulong addr = in->addr;
+ const int pg_mode = in->pg_mode;
+ const bool is_user = (in->mmu_idx == MMU_USER_IDX);
+ const MMUAccessType access_type = in->access_type;
+ uint64_t ptep, pte, rsvd_mask;
+ PTETranslate pte_trans = {
+ .env = env,
+ .err = err,
+ .ptw_idx = in->ptw_idx,
+ };
+ hwaddr pte_addr, paddr;
+ uint32_t pkr;
+ int page_size;
+ restart_all:
+ rsvd_mask = ~MAKE_64BIT_MASK(0, env_archcpu(env)->phys_bits);
+ rsvd_mask &= PG_ADDRESS_MASK;
if (!(pg_mode & PG_MODE_NXE)) {
rsvd_mask |= PG_NX_MASK;
}
if (pg_mode & PG_MODE_PAE) {
- uint64_t pde, pdpe;
- target_ulong pdpe_addr;
-
#ifdef TARGET_X86_64
if (pg_mode & PG_MODE_LMA) {
- bool la57 = pg_mode & PG_MODE_LA57;
- uint64_t pml5e_addr, pml5e;
- uint64_t pml4e_addr, pml4e;
-
- if (la57) {
- pml5e_addr = ((cr3 & ~0xfff) +
- (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
- pml5e_addr = GET_HPHYS(cs, pml5e_addr, MMU_DATA_STORE, NULL);
- pml5e = x86_ldq_phys(cs, pml5e_addr);
- if (!(pml5e & PG_PRESENT_MASK)) {
+ if (pg_mode & PG_MODE_LA57) {
+ /*
+ * Page table level 5
+ */
+ pte_addr = ((in->cr3 & ~0xfff) +
+ (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
+ }
+ restart_5:
+ pte = ptw_ldq(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
- if (pml5e & (rsvd_mask | PG_PSE_MASK)) {
+ if (pte & (rsvd_mask | PG_PSE_MASK)) {
goto do_fault_rsvd;
}
- if (!(pml5e & PG_ACCESSED_MASK)) {
- pml5e |= PG_ACCESSED_MASK;
- x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_5;
}
- ptep = pml5e ^ PG_NX_MASK;
+ ptep = pte ^ PG_NX_MASK;
} else {
- pml5e = cr3;
+ pte = in->cr3;
ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
}
- pml4e_addr = ((pml5e & PG_ADDRESS_MASK) +
- (((addr >> 39) & 0x1ff) << 3)) & a20_mask;
- pml4e_addr = GET_HPHYS(cs, pml4e_addr, MMU_DATA_STORE, NULL);
- pml4e = x86_ldq_phys(cs, pml4e_addr);
- if (!(pml4e & PG_PRESENT_MASK)) {
+ /*
+ * Page table level 4
+ */
+ pte_addr = ((pte & PG_ADDRESS_MASK) +
+ (((addr >> 39) & 0x1ff) << 3)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
+ }
+ restart_4:
+ pte = ptw_ldq(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
- if (pml4e & (rsvd_mask | PG_PSE_MASK)) {
+ if (pte & (rsvd_mask | PG_PSE_MASK)) {
goto do_fault_rsvd;
}
- if (!(pml4e & PG_ACCESSED_MASK)) {
- pml4e |= PG_ACCESSED_MASK;
- x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_4;
+ }
+ ptep &= pte ^ PG_NX_MASK;
+
+ /*
+ * Page table level 3
+ */
+ pte_addr = ((pte & PG_ADDRESS_MASK) +
+ (((addr >> 30) & 0x1ff) << 3)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
}
- ptep &= pml4e ^ PG_NX_MASK;
- pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
- a20_mask;
- pdpe_addr = GET_HPHYS(cs, pdpe_addr, MMU_DATA_STORE, NULL);
- pdpe = x86_ldq_phys(cs, pdpe_addr);
- if (!(pdpe & PG_PRESENT_MASK)) {
+ restart_3_lma:
+ pte = ptw_ldq(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
- if (pdpe & rsvd_mask) {
+ if (pte & rsvd_mask) {
goto do_fault_rsvd;
}
- ptep &= pdpe ^ PG_NX_MASK;
- if (!(pdpe & PG_ACCESSED_MASK)) {
- pdpe |= PG_ACCESSED_MASK;
- x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_3_lma;
}
- if (pdpe & PG_PSE_MASK) {
+ ptep &= pte ^ PG_NX_MASK;
+ if (pte & PG_PSE_MASK) {
/* 1 GB page */
- *page_size = 1024 * 1024 * 1024;
- pte_addr = pdpe_addr;
- pte = pdpe;
+ page_size = 1024 * 1024 * 1024;
goto do_check_protect;
}
} else
#endif
{
- /* XXX: load them when cr3 is loaded ? */
- pdpe_addr = ((cr3 & ~0x1f) + ((addr >> 27) & 0x18)) &
- a20_mask;
- pdpe_addr = GET_HPHYS(cs, pdpe_addr, MMU_DATA_STORE, NULL);
- pdpe = x86_ldq_phys(cs, pdpe_addr);
- if (!(pdpe & PG_PRESENT_MASK)) {
- goto do_fault;
+ /*
+ * Page table level 3
+ */
+ pte_addr = ((in->cr3 & ~0x1f) + ((addr >> 27) & 0x18)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
}
rsvd_mask |= PG_HI_USER_MASK;
- if (pdpe & (rsvd_mask | PG_NX_MASK)) {
+ restart_3_nolma:
+ pte = ptw_ldq(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
+ goto do_fault;
+ }
+ if (pte & (rsvd_mask | PG_NX_MASK)) {
goto do_fault_rsvd;
}
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_3_nolma;
+ }
ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
}
- pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) &
- a20_mask;
- pde_addr = GET_HPHYS(cs, pde_addr, MMU_DATA_STORE, NULL);
- pde = x86_ldq_phys(cs, pde_addr);
- if (!(pde & PG_PRESENT_MASK)) {
+ /*
+ * Page table level 2
+ */
+ pte_addr = ((pte & PG_ADDRESS_MASK) +
+ (((addr >> 21) & 0x1ff) << 3)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
+ }
+ restart_2_pae:
+ pte = ptw_ldq(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
- if (pde & rsvd_mask) {
+ if (pte & rsvd_mask) {
goto do_fault_rsvd;
}
- ptep &= pde ^ PG_NX_MASK;
- if (pde & PG_PSE_MASK) {
+ if (pte & PG_PSE_MASK) {
/* 2 MB page */
- *page_size = 2048 * 1024;
- pte_addr = pde_addr;
- pte = pde;
+ page_size = 2048 * 1024;
+ ptep &= pte ^ PG_NX_MASK;
goto do_check_protect;
}
- /* 4 KB page */
- if (!(pde & PG_ACCESSED_MASK)) {
- pde |= PG_ACCESSED_MASK;
- x86_stl_phys_notdirty(cs, pde_addr, pde);
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_2_pae;
+ }
+ ptep &= pte ^ PG_NX_MASK;
+
+ /*
+ * Page table level 1
+ */
+ pte_addr = ((pte & PG_ADDRESS_MASK) +
+ (((addr >> 12) & 0x1ff) << 3)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
}
- pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
- a20_mask;
- pte_addr = GET_HPHYS(cs, pte_addr, MMU_DATA_STORE, NULL);
- pte = x86_ldq_phys(cs, pte_addr);
+ pte = ptw_ldq(&pte_trans);
if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
@@ -174,54 +298,56 @@ static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_f
}
/* combine pde and pte nx, user and rw protections */
ptep &= pte ^ PG_NX_MASK;
- *page_size = 4096;
+ page_size = 4096;
} else {
- uint32_t pde;
-
- /* page directory entry */
- pde_addr = ((cr3 & ~0xfff) + ((addr >> 20) & 0xffc)) &
- a20_mask;
- pde_addr = GET_HPHYS(cs, pde_addr, MMU_DATA_STORE, NULL);
- pde = x86_ldl_phys(cs, pde_addr);
- if (!(pde & PG_PRESENT_MASK)) {
+ /*
+ * Page table level 2
+ */
+ pte_addr = ((in->cr3 & ~0xfff) + ((addr >> 20) & 0xffc)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
+ }
+ restart_2_nopae:
+ pte = ptw_ldl(&pte_trans);
+ if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
- ptep = pde | PG_NX_MASK;
+ ptep = pte | PG_NX_MASK;
/* if PSE bit is set, then we use a 4MB page */
- if ((pde & PG_PSE_MASK) && (pg_mode & PG_MODE_PSE)) {
- *page_size = 4096 * 1024;
- pte_addr = pde_addr;
-
- /* Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved.
+ if ((pte & PG_PSE_MASK) && (pg_mode & PG_MODE_PSE)) {
+ page_size = 4096 * 1024;
+ /*
+ * Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved.
* Leave bits 20-13 in place for setting accessed/dirty bits below.
*/
- pte = pde | ((pde & 0x1fe000LL) << (32 - 13));
+ pte = (uint32_t)pte | ((pte & 0x1fe000LL) << (32 - 13));
rsvd_mask = 0x200000;
goto do_check_protect_pse36;
}
-
- if (!(pde & PG_ACCESSED_MASK)) {
- pde |= PG_ACCESSED_MASK;
- x86_stl_phys_notdirty(cs, pde_addr, pde);
+ if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
+ goto restart_2_nopae;
}
- /* page directory entry */
- pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) &
- a20_mask;
- pte_addr = GET_HPHYS(cs, pte_addr, MMU_DATA_STORE, NULL);
- pte = x86_ldl_phys(cs, pte_addr);
+ /*
+ * Page table level 1
+ */
+ pte_addr = ((pte & ~0xfffu) + ((addr >> 10) & 0xffc)) & a20_mask;
+ if (!ptw_translate(&pte_trans, pte_addr)) {
+ return false;
+ }
+ pte = ptw_ldl(&pte_trans);
if (!(pte & PG_PRESENT_MASK)) {
goto do_fault;
}
/* combine pde and pte user and rw protections */
ptep &= pte | PG_NX_MASK;
- *page_size = 4096;
+ page_size = 4096;
rsvd_mask = 0;
}
do_check_protect:
- rsvd_mask |= (*page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK;
+ rsvd_mask |= (page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK;
do_check_protect_pse36:
if (pte & rsvd_mask) {
goto do_fault_rsvd;
@@ -233,17 +359,17 @@ do_check_protect_pse36:
goto do_fault_protect;
}
- *prot = 0;
- if (mmu_idx != MMU_KSMAP_IDX || !(ptep & PG_USER_MASK)) {
- *prot |= PAGE_READ;
+ int prot = 0;
+ if (in->mmu_idx != MMU_KSMAP_IDX || !(ptep & PG_USER_MASK)) {
+ prot |= PAGE_READ;
if ((ptep & PG_RW_MASK) || !(is_user || (pg_mode & PG_MODE_WP))) {
- *prot |= PAGE_WRITE;
+ prot |= PAGE_WRITE;
}
}
if (!(ptep & PG_NX_MASK) &&
- (mmu_idx == MMU_USER_IDX ||
+ (is_user ||
!((pg_mode & PG_MODE_SMEP) && (ptep & PG_USER_MASK)))) {
- *prot |= PAGE_EXEC;
+ prot |= PAGE_EXEC;
}
if (ptep & PG_USER_MASK) {
@@ -262,182 +388,246 @@ do_check_protect_pse36:
} else if (pkr_wd && (is_user || (pg_mode & PG_MODE_WP))) {
pkr_prot &= ~PAGE_WRITE;
}
-
- *prot &= pkr_prot;
- if ((pkr_prot & (1 << is_write1)) == 0) {
- assert(is_write1 != 2);
- error_code |= PG_ERROR_PK_MASK;
- goto do_fault_protect;
+ if ((pkr_prot & (1 << access_type)) == 0) {
+ goto do_fault_pk_protect;
}
+ prot &= pkr_prot;
}
- if ((*prot & (1 << is_write1)) == 0) {
+ if ((prot & (1 << access_type)) == 0) {
goto do_fault_protect;
}
/* yes, it can! */
- is_dirty = is_write && !(pte & PG_DIRTY_MASK);
- if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
- pte |= PG_ACCESSED_MASK;
- if (is_dirty) {
- pte |= PG_DIRTY_MASK;
+ {
+ uint32_t set = PG_ACCESSED_MASK;
+ if (access_type == MMU_DATA_STORE) {
+ set |= PG_DIRTY_MASK;
+ } else if (!(pte & PG_DIRTY_MASK)) {
+ /*
+ * Only set write access if already dirty...
+ * otherwise wait for dirty access.
+ */
+ prot &= ~PAGE_WRITE;
+ }
+ if (!ptw_setl(&pte_trans, pte, set)) {
+ /*
+ * We can arrive here from any of 3 levels and 2 formats.
+ * The only safe thing is to restart the entire lookup.
+ */
+ goto restart_all;
}
- x86_stl_phys_notdirty(cs, pte_addr, pte);
}
- if (!(pte & PG_DIRTY_MASK)) {
- /* only set write access if already dirty... otherwise wait
- for dirty access */
- assert(!is_write);
- *prot &= ~PAGE_WRITE;
- }
+ /* align to page_size */
+ paddr = (pte & a20_mask & PG_ADDRESS_MASK & ~(page_size - 1))
+ | (addr & (page_size - 1));
+
+ if (in->ptw_idx == MMU_NESTED_IDX) {
+ CPUTLBEntryFull *full;
+ int flags, nested_page_size;
+
+ flags = probe_access_full(env, paddr, access_type,
+ MMU_NESTED_IDX, true,
+ &pte_trans.haddr, &full, 0);
+ if (unlikely(flags & TLB_INVALID_MASK)) {
+ err->exception_index = 0; /* unused */
+ err->error_code = env->error_code;
+ err->cr2 = paddr;
+ err->stage2 = S2_GPA;
+ return false;
+ }
- pte = pte & a20_mask;
+ /* Merge stage1 & stage2 protection bits. */
+ prot &= full->prot;
- /* align to page_size */
- pte &= PG_ADDRESS_MASK & ~(*page_size - 1);
- page_offset = addr & (*page_size - 1);
- *xlat = GET_HPHYS(cs, pte + page_offset, is_write1, prot);
- return PG_ERROR_OK;
+ /* Re-verify resulting protection. */
+ if ((prot & (1 << access_type)) == 0) {
+ goto do_fault_protect;
+ }
+
+ /* Merge stage1 & stage2 addresses to final physical address. */
+ nested_page_size = 1 << full->lg_page_size;
+ paddr = (full->phys_addr & ~(nested_page_size - 1))
+ | (paddr & (nested_page_size - 1));
+
+ /*
+ * Use the larger of stage1 & stage2 page sizes, so that
+ * invalidation works.
+ */
+ if (nested_page_size > page_size) {
+ page_size = nested_page_size;
+ }
+ }
+
+ out->paddr = paddr;
+ out->prot = prot;
+ out->page_size = page_size;
+ return true;
+ int error_code;
do_fault_rsvd:
- error_code |= PG_ERROR_RSVD_MASK;
+ error_code = PG_ERROR_RSVD_MASK;
+ goto do_fault_cont;
do_fault_protect:
- error_code |= PG_ERROR_P_MASK;
+ error_code = PG_ERROR_P_MASK;
+ goto do_fault_cont;
+ do_fault_pk_protect:
+ assert(access_type != MMU_INST_FETCH);
+ error_code = PG_ERROR_PK_MASK | PG_ERROR_P_MASK;
+ goto do_fault_cont;
do_fault:
- error_code |= (is_write << PG_ERROR_W_BIT);
- if (is_user)
+ error_code = 0;
+ do_fault_cont:
+ if (is_user) {
error_code |= PG_ERROR_U_MASK;
- if (is_write1 == 2 &&
- ((pg_mode & PG_MODE_NXE) || (pg_mode & PG_MODE_SMEP)))
- error_code |= PG_ERROR_I_D_MASK;
- return error_code;
-}
-
-hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
- int *prot)
-{
- CPUX86State *env = &X86_CPU(cs)->env;
- uint64_t exit_info_1;
- int page_size;
- int next_prot;
- hwaddr hphys;
-
- if (likely(!(env->hflags2 & HF2_NPT_MASK))) {
- return gphys;
}
-
- exit_info_1 = mmu_translate(cs, gphys, NULL, env->nested_cr3,
- access_type, MMU_USER_IDX, env->nested_pg_mode,
- &hphys, &page_size, &next_prot);
- if (exit_info_1 == PG_ERROR_OK) {
- if (prot) {
- *prot &= next_prot;
+ switch (access_type) {
+ case MMU_DATA_LOAD:
+ break;
+ case MMU_DATA_STORE:
+ error_code |= PG_ERROR_W_MASK;
+ break;
+ case MMU_INST_FETCH:
+ if (pg_mode & (PG_MODE_NXE | PG_MODE_SMEP)) {
+ error_code |= PG_ERROR_I_D_MASK;
}
- return hphys;
+ break;
}
+ err->exception_index = EXCP0E_PAGE;
+ err->error_code = error_code;
+ err->cr2 = addr;
+ err->stage2 = S2_NONE;
+ return false;
+}
- x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
- gphys);
- if (prot) {
- exit_info_1 |= SVM_NPTEXIT_GPA;
- } else { /* page table access */
+static G_NORETURN void raise_stage2(CPUX86State *env, TranslateFault *err,
+ uintptr_t retaddr)
+{
+ uint64_t exit_info_1 = err->error_code;
+
+ switch (err->stage2) {
+ case S2_GPT:
exit_info_1 |= SVM_NPTEXIT_GPT;
+ break;
+ case S2_GPA:
+ exit_info_1 |= SVM_NPTEXIT_GPA;
+ break;
+ default:
+ g_assert_not_reached();
}
- cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, env->retaddr);
+
+ x86_stq_phys(env_cpu(env),
+ env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
+ err->cr2);
+ cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, retaddr);
}
-/* return value:
- * -1 = cannot handle fault
- * 0 = nothing more to do
- * 1 = generate PF fault
- */
-static int handle_mmu_fault(CPUState *cs, vaddr addr, int size,
- int is_write1, int mmu_idx)
+static bool get_physical_address(CPUX86State *env, vaddr addr,
+ MMUAccessType access_type, int mmu_idx,
+ TranslateResult *out, TranslateFault *err)
{
- X86CPU *cpu = X86_CPU(cs);
- CPUX86State *env = &cpu->env;
- int error_code = PG_ERROR_OK;
- int pg_mode, prot, page_size;
- int32_t a20_mask;
- hwaddr paddr;
- hwaddr vaddr;
-
-#if defined(DEBUG_MMU)
- printf("MMU fault: addr=%" VADDR_PRIx " w=%d mmu=%d eip=" TARGET_FMT_lx "\n",
- addr, is_write1, mmu_idx, env->eip);
-#endif
-
- if (!(env->cr[0] & CR0_PG_MASK)) {
- a20_mask = x86_get_a20_mask(env);
- paddr = addr & a20_mask;
-#ifdef TARGET_X86_64
- if (!(env->hflags & HF_LMA_MASK)) {
- /* Without long mode we can only address 32bits in real mode */
- paddr = (uint32_t)paddr;
+ TranslateParams in;
+ bool use_stage2 = env->hflags2 & HF2_NPT_MASK;
+
+ in.addr = addr;
+ in.access_type = access_type;
+
+ switch (mmu_idx) {
+ case MMU_PHYS_IDX:
+ break;
+
+ case MMU_NESTED_IDX:
+ if (likely(use_stage2)) {
+ in.cr3 = env->nested_cr3;
+ in.pg_mode = env->nested_pg_mode;
+ in.mmu_idx = MMU_USER_IDX;
+ in.ptw_idx = MMU_PHYS_IDX;
+
+ if (!mmu_translate(env, &in, out, err)) {
+ err->stage2 = S2_GPA;
+ return false;
+ }
+ return true;
}
-#endif
- prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
- page_size = 4096;
- } else {
- pg_mode = get_pg_mode(env);
- if (pg_mode & PG_MODE_LMA) {
- int32_t sext;
-
- /* test virtual address sign extension */
- sext = (int64_t)addr >> (pg_mode & PG_MODE_LA57 ? 56 : 47);
- if (sext != 0 && sext != -1) {
- env->error_code = 0;
- cs->exception_index = EXCP0D_GPF;
- return 1;
+ break;
+
+ default:
+ in.cr3 = env->cr[3];
+ in.mmu_idx = mmu_idx;
+ in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX;
+ in.pg_mode = get_pg_mode(env);
+
+ if (likely(in.pg_mode)) {
+ if (in.pg_mode & PG_MODE_LMA) {
+ /* test virtual address sign extension */
+ int shift = in.pg_mode & PG_MODE_LA57 ? 56 : 47;
+ int64_t sext = (int64_t)addr >> shift;
+ if (sext != 0 && sext != -1) {
+ err->exception_index = EXCP0D_GPF;
+ err->error_code = 0;
+ err->cr2 = addr;
+ return false;
+ }
}
+ return mmu_translate(env, &in, out, err);
}
-
- error_code = mmu_translate(cs, addr, get_hphys, env->cr[3], is_write1,
- mmu_idx, pg_mode,
- &paddr, &page_size, &prot);
+ break;
}
- if (error_code == PG_ERROR_OK) {
- /* Even if 4MB pages, we map only one 4KB page in the cache to
- avoid filling it too fast */
- vaddr = addr & TARGET_PAGE_MASK;
- paddr &= TARGET_PAGE_MASK;
-
- assert(prot & (1 << is_write1));
- tlb_set_page_with_attrs(cs, vaddr, paddr, cpu_get_mem_attrs(env),
- prot, mmu_idx, page_size);
- return 0;
- } else {
- if (env->intercept_exceptions & (1 << EXCP0E_PAGE)) {
- /* cr2 is not modified in case of exceptions */
- x86_stq_phys(cs,
- env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
- addr);
- } else {
- env->cr[2] = addr;
- }
- env->error_code = error_code;
- cs->exception_index = EXCP0E_PAGE;
- return 1;
+ /* Translation disabled. */
+ out->paddr = addr & x86_get_a20_mask(env);
+#ifdef TARGET_X86_64
+ if (!(env->hflags & HF_LMA_MASK)) {
+ /* Without long mode we can only address 32bits in real mode */
+ out->paddr = (uint32_t)out->paddr;
}
+#endif
+ out->prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+ out->page_size = TARGET_PAGE_SIZE;
+ return true;
}
bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
MMUAccessType access_type, int mmu_idx,
bool probe, uintptr_t retaddr)
{
- X86CPU *cpu = X86_CPU(cs);
- CPUX86State *env = &cpu->env;
-
- env->retaddr = retaddr;
- if (handle_mmu_fault(cs, addr, size, access_type, mmu_idx)) {
- /* FIXME: On error in get_hphys we have already jumped out. */
- g_assert(!probe);
- raise_exception_err_ra(env, cs->exception_index,
- env->error_code, retaddr);
+ CPUX86State *env = cs->env_ptr;
+ TranslateResult out;
+ TranslateFault err;
+
+ if (get_physical_address(env, addr, access_type, mmu_idx, &out, &err)) {
+ /*
+ * Even if 4MB pages, we map only one 4KB page in the cache to
+ * avoid filling it too fast.
+ */
+ assert(out.prot & (1 << access_type));
+ tlb_set_page_with_attrs(cs, addr & TARGET_PAGE_MASK,
+ out.paddr & TARGET_PAGE_MASK,
+ cpu_get_mem_attrs(env),
+ out.prot, mmu_idx, out.page_size);
+ return true;
}
- return true;
+
+ if (probe) {
+ /* This will be used if recursing for stage2 translation. */
+ env->error_code = err.error_code;
+ return false;
+ }
+
+ if (err.stage2 != S2_NONE) {
+ raise_stage2(env, &err, retaddr);
+ }
+
+ if (env->intercept_exceptions & (1 << err.exception_index)) {
+ /* cr2 is not modified in case of exceptions */
+ x86_stq_phys(cs, env->vm_vmcb +
+ offsetof(struct vmcb, control.exit_info_2),
+ err.cr2);
+ } else {
+ env->cr[2] = err.cr2;
+ }
+ raise_exception_err_ra(env, err.exception_index, err.error_code, retaddr);
}
G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
diff --git a/target/i386/tcg/sysemu/svm_helper.c b/target/i386/tcg/sysemu/svm_helper.c
index 2b6f450..8e88567 100644
--- a/target/i386/tcg/sysemu/svm_helper.c
+++ b/target/i386/tcg/sysemu/svm_helper.c
@@ -27,19 +27,19 @@
/* Secure Virtual Machine helpers */
-static inline void svm_save_seg(CPUX86State *env, hwaddr addr,
- const SegmentCache *sc)
+static void svm_save_seg(CPUX86State *env, int mmu_idx, hwaddr addr,
+ const SegmentCache *sc)
{
- CPUState *cs = env_cpu(env);
-
- x86_stw_phys(cs, addr + offsetof(struct vmcb_seg, selector),
- sc->selector);
- x86_stq_phys(cs, addr + offsetof(struct vmcb_seg, base),
- sc->base);
- x86_stl_phys(cs, addr + offsetof(struct vmcb_seg, limit),
- sc->limit);
- x86_stw_phys(cs, addr + offsetof(struct vmcb_seg, attrib),
- ((sc->flags >> 8) & 0xff) | ((sc->flags >> 12) & 0x0f00));
+ cpu_stw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, selector),
+ sc->selector, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, base),
+ sc->base, mmu_idx, 0);
+ cpu_stl_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, limit),
+ sc->limit, mmu_idx, 0);
+ cpu_stw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, attrib),
+ ((sc->flags >> 8) & 0xff)
+ | ((sc->flags >> 12) & 0x0f00),
+ mmu_idx, 0);
}
/*
@@ -52,29 +52,36 @@ static inline void svm_canonicalization(CPUX86State *env, target_ulong *seg_base
*seg_base = ((((long) *seg_base) << shift_amt) >> shift_amt);
}
-static inline void svm_load_seg(CPUX86State *env, hwaddr addr,
- SegmentCache *sc)
+static void svm_load_seg(CPUX86State *env, int mmu_idx, hwaddr addr,
+ SegmentCache *sc)
{
- CPUState *cs = env_cpu(env);
unsigned int flags;
- sc->selector = x86_lduw_phys(cs,
- addr + offsetof(struct vmcb_seg, selector));
- sc->base = x86_ldq_phys(cs, addr + offsetof(struct vmcb_seg, base));
- sc->limit = x86_ldl_phys(cs, addr + offsetof(struct vmcb_seg, limit));
- flags = x86_lduw_phys(cs, addr + offsetof(struct vmcb_seg, attrib));
+ sc->selector =
+ cpu_lduw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, selector),
+ mmu_idx, 0);
+ sc->base =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, base),
+ mmu_idx, 0);
+ sc->limit =
+ cpu_ldl_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, limit),
+ mmu_idx, 0);
+ flags =
+ cpu_lduw_mmuidx_ra(env, addr + offsetof(struct vmcb_seg, attrib),
+ mmu_idx, 0);
sc->flags = ((flags & 0xff) << 8) | ((flags & 0x0f00) << 12);
+
svm_canonicalization(env, &sc->base);
}
-static inline void svm_load_seg_cache(CPUX86State *env, hwaddr addr,
- int seg_reg)
+static void svm_load_seg_cache(CPUX86State *env, int mmu_idx,
+ hwaddr addr, int seg_reg)
{
- SegmentCache sc1, *sc = &sc1;
+ SegmentCache sc;
- svm_load_seg(env, addr, sc);
- cpu_x86_load_seg_cache(env, seg_reg, sc->selector,
- sc->base, sc->limit, sc->flags);
+ svm_load_seg(env, mmu_idx, addr, &sc);
+ cpu_x86_load_seg_cache(env, seg_reg, sc.selector,
+ sc.base, sc.limit, sc.flags);
}
static inline bool is_efer_invalid_state (CPUX86State *env)
@@ -199,13 +206,17 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
env->vm_hsave + offsetof(struct vmcb, save.rflags),
cpu_compute_eflags(env));
- svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.es),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.es),
&env->segs[R_ES]);
- svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.cs),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.cs),
&env->segs[R_CS]);
- svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.ss),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.ss),
&env->segs[R_SS]);
- svm_save_seg(env, env->vm_hsave + offsetof(struct vmcb, save.ds),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.ds),
&env->segs[R_DS]);
x86_stq_phys(cs, env->vm_hsave + offsetof(struct vmcb, save.rip),
@@ -271,6 +282,8 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
env->hflags2 |= HF2_NPT_MASK;
env->nested_pg_mode = get_pg_mode(env) & PG_MODE_SVM_MASK;
+
+ tlb_flush_by_mmuidx(cs, 1 << MMU_NESTED_IDX);
}
/* enable intercepts */
@@ -323,18 +336,18 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
save.rflags)),
~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK));
- svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.es),
- R_ES);
- svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.cs),
- R_CS);
- svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.ss),
- R_SS);
- svm_load_seg_cache(env, env->vm_vmcb + offsetof(struct vmcb, save.ds),
- R_DS);
- svm_load_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.idtr),
- &env->idt);
- svm_load_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.gdtr),
- &env->gdt);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.es), R_ES);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.cs), R_CS);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.ss), R_SS);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.ds), R_DS);
+ svm_load_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.idtr), &env->idt);
+ svm_load_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.gdtr), &env->gdt);
env->eip = x86_ldq_phys(cs,
env->vm_vmcb + offsetof(struct vmcb, save.rip));
@@ -449,9 +462,8 @@ void helper_vmmcall(CPUX86State *env)
void helper_vmload(CPUX86State *env, int aflag)
{
- CPUState *cs = env_cpu(env);
+ int mmu_idx = MMU_PHYS_IDX;
target_ulong addr;
- int prot;
cpu_svm_check_intercept_param(env, SVM_EXIT_VMLOAD, 0, GETPC());
@@ -462,43 +474,52 @@ void helper_vmload(CPUX86State *env, int aflag)
}
if (virtual_vm_load_save_enabled(env, SVM_EXIT_VMLOAD, GETPC())) {
- addr = get_hphys(cs, addr, MMU_DATA_LOAD, &prot);
+ mmu_idx = MMU_NESTED_IDX;
}
- qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmload! " TARGET_FMT_lx
- "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n",
- addr, x86_ldq_phys(cs, addr + offsetof(struct vmcb,
- save.fs.base)),
- env->segs[R_FS].base);
-
- svm_load_seg_cache(env, addr + offsetof(struct vmcb, save.fs), R_FS);
- svm_load_seg_cache(env, addr + offsetof(struct vmcb, save.gs), R_GS);
- svm_load_seg(env, addr + offsetof(struct vmcb, save.tr), &env->tr);
- svm_load_seg(env, addr + offsetof(struct vmcb, save.ldtr), &env->ldt);
+ svm_load_seg_cache(env, mmu_idx,
+ addr + offsetof(struct vmcb, save.fs), R_FS);
+ svm_load_seg_cache(env, mmu_idx,
+ addr + offsetof(struct vmcb, save.gs), R_GS);
+ svm_load_seg(env, mmu_idx,
+ addr + offsetof(struct vmcb, save.tr), &env->tr);
+ svm_load_seg(env, mmu_idx,
+ addr + offsetof(struct vmcb, save.ldtr), &env->ldt);
#ifdef TARGET_X86_64
- env->kernelgsbase = x86_ldq_phys(cs, addr + offsetof(struct vmcb,
- save.kernel_gs_base));
- env->lstar = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.lstar));
- env->cstar = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.cstar));
- env->fmask = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.sfmask));
+ env->kernelgsbase =
+ cpu_ldq_mmuidx_ra(env,
+ addr + offsetof(struct vmcb, save.kernel_gs_base),
+ mmu_idx, 0);
+ env->lstar =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.lstar),
+ mmu_idx, 0);
+ env->cstar =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.cstar),
+ mmu_idx, 0);
+ env->fmask =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sfmask),
+ mmu_idx, 0);
svm_canonicalization(env, &env->kernelgsbase);
#endif
- env->star = x86_ldq_phys(cs, addr + offsetof(struct vmcb, save.star));
- env->sysenter_cs = x86_ldq_phys(cs,
- addr + offsetof(struct vmcb, save.sysenter_cs));
- env->sysenter_esp = x86_ldq_phys(cs, addr + offsetof(struct vmcb,
- save.sysenter_esp));
- env->sysenter_eip = x86_ldq_phys(cs, addr + offsetof(struct vmcb,
- save.sysenter_eip));
-
+ env->star =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.star),
+ mmu_idx, 0);
+ env->sysenter_cs =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_cs),
+ mmu_idx, 0);
+ env->sysenter_esp =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_esp),
+ mmu_idx, 0);
+ env->sysenter_eip =
+ cpu_ldq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_eip),
+ mmu_idx, 0);
}
void helper_vmsave(CPUX86State *env, int aflag)
{
- CPUState *cs = env_cpu(env);
+ int mmu_idx = MMU_PHYS_IDX;
target_ulong addr;
- int prot;
cpu_svm_check_intercept_param(env, SVM_EXIT_VMSAVE, 0, GETPC());
@@ -509,38 +530,36 @@ void helper_vmsave(CPUX86State *env, int aflag)
}
if (virtual_vm_load_save_enabled(env, SVM_EXIT_VMSAVE, GETPC())) {
- addr = get_hphys(cs, addr, MMU_DATA_STORE, &prot);
+ mmu_idx = MMU_NESTED_IDX;
}
- qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmsave! " TARGET_FMT_lx
- "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n",
- addr, x86_ldq_phys(cs,
- addr + offsetof(struct vmcb, save.fs.base)),
- env->segs[R_FS].base);
-
- svm_save_seg(env, addr + offsetof(struct vmcb, save.fs),
+ svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.fs),
&env->segs[R_FS]);
- svm_save_seg(env, addr + offsetof(struct vmcb, save.gs),
+ svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.gs),
&env->segs[R_GS]);
- svm_save_seg(env, addr + offsetof(struct vmcb, save.tr),
+ svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.tr),
&env->tr);
- svm_save_seg(env, addr + offsetof(struct vmcb, save.ldtr),
+ svm_save_seg(env, mmu_idx, addr + offsetof(struct vmcb, save.ldtr),
&env->ldt);
#ifdef TARGET_X86_64
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.kernel_gs_base),
- env->kernelgsbase);
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.lstar), env->lstar);
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.cstar), env->cstar);
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sfmask), env->fmask);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.kernel_gs_base),
+ env->kernelgsbase, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.lstar),
+ env->lstar, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.cstar),
+ env->cstar, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sfmask),
+ env->fmask, mmu_idx, 0);
#endif
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.star), env->star);
- x86_stq_phys(cs,
- addr + offsetof(struct vmcb, save.sysenter_cs), env->sysenter_cs);
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sysenter_esp),
- env->sysenter_esp);
- x86_stq_phys(cs, addr + offsetof(struct vmcb, save.sysenter_eip),
- env->sysenter_eip);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.star),
+ env->star, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_cs),
+ env->sysenter_cs, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_esp),
+ env->sysenter_esp, mmu_idx, 0);
+ cpu_stq_mmuidx_ra(env, addr + offsetof(struct vmcb, save.sysenter_eip),
+ env->sysenter_eip, mmu_idx, 0);
}
void helper_stgi(CPUX86State *env)
@@ -720,15 +739,20 @@ void do_vmexit(CPUX86State *env)
env->vm_vmcb + offsetof(struct vmcb, control.int_state), 0);
}
env->hflags2 &= ~HF2_NPT_MASK;
+ tlb_flush_by_mmuidx(cs, 1 << MMU_NESTED_IDX);
/* Save the VM state in the vmcb */
- svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.es),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.es),
&env->segs[R_ES]);
- svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.cs),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.cs),
&env->segs[R_CS]);
- svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.ss),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.ss),
&env->segs[R_SS]);
- svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.ds),
+ svm_save_seg(env, MMU_PHYS_IDX,
+ env->vm_vmcb + offsetof(struct vmcb, save.ds),
&env->segs[R_DS]);
x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.gdtr.base),
@@ -809,14 +833,14 @@ void do_vmexit(CPUX86State *env)
~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK |
VM_MASK));
- svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.es),
- R_ES);
- svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.cs),
- R_CS);
- svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.ss),
- R_SS);
- svm_load_seg_cache(env, env->vm_hsave + offsetof(struct vmcb, save.ds),
- R_DS);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.es), R_ES);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.cs), R_CS);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.ss), R_SS);
+ svm_load_seg_cache(env, MMU_PHYS_IDX,
+ env->vm_hsave + offsetof(struct vmcb, save.ds), R_DS);
env->eip = x86_ldq_phys(cs,
env->vm_hsave + offsetof(struct vmcb, save.rip));
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 279a3ae..e19d5c1 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -23,6 +23,7 @@
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "exec/cpu_ldst.h"
#include "exec/translator.h"
@@ -86,6 +87,9 @@ typedef struct DisasContext {
int8_t override; /* -1 if no override, else R_CS, R_DS, etc */
uint8_t prefix;
+ bool has_modrm;
+ uint8_t modrm;
+
#ifndef CONFIG_USER_ONLY
uint8_t cpl; /* code priv level */
uint8_t iopl; /* i/o priv level */
@@ -99,8 +103,8 @@ typedef struct DisasContext {
uint8_t rex_r;
uint8_t rex_x;
uint8_t rex_b;
- bool rex_w;
#endif
+ bool vex_w; /* used by AVX even on 32-bit processors */
bool jmp_opt; /* use direct block chaining for direct jumps */
bool repz_opt; /* optimize jumps within repz instructions */
bool cc_op_dirty;
@@ -113,6 +117,7 @@ typedef struct DisasContext {
int cpuid_ext2_features;
int cpuid_ext3_features;
int cpuid_7_0_ebx_features;
+ int cpuid_7_0_ecx_features;
int cpuid_xsave_features;
/* TCG local temps */
@@ -124,8 +129,6 @@ typedef struct DisasContext {
/* TCG local register indexes (only used inside old micro ops) */
TCGv tmp0;
TCGv tmp4;
- TCGv_ptr ptr0;
- TCGv_ptr ptr1;
TCGv_i32 tmp2_i32;
TCGv_i32 tmp3_i32;
TCGv_i64 tmp1_i64;
@@ -177,7 +180,7 @@ typedef struct DisasContext {
#ifdef TARGET_X86_64
#define REX_PREFIX(S) (((S)->prefix & PREFIX_REX) != 0)
-#define REX_W(S) ((S)->rex_w)
+#define REX_W(S) ((S)->vex_w)
#define REX_R(S) ((S)->rex_r + 0)
#define REX_X(S) ((S)->rex_x + 0)
#define REX_B(S) ((S)->rex_b + 0)
@@ -2277,11 +2280,11 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s,
}
/* Compute the address, with a minimum number of TCG ops. */
-static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a)
+static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a, bool is_vsib)
{
TCGv ea = NULL;
- if (a.index >= 0) {
+ if (a.index >= 0 && !is_vsib) {
if (a.scale == 0) {
ea = cpu_regs[a.index];
} else {
@@ -2314,7 +2317,7 @@ static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a)
static void gen_lea_modrm(CPUX86State *env, DisasContext *s, int modrm)
{
AddressParts a = gen_lea_modrm_0(env, s, modrm);
- TCGv ea = gen_lea_modrm_1(s, a);
+ TCGv ea = gen_lea_modrm_1(s, a, false);
gen_lea_v_seg(s, s->aflag, ea, a.def_seg, s->override);
}
@@ -2327,7 +2330,8 @@ static void gen_nop_modrm(CPUX86State *env, DisasContext *s, int modrm)
static void gen_bndck(CPUX86State *env, DisasContext *s, int modrm,
TCGCond cond, TCGv_i64 bndv)
{
- TCGv ea = gen_lea_modrm_1(s, gen_lea_modrm_0(env, s, modrm));
+ AddressParts a = gen_lea_modrm_0(env, s, modrm);
+ TCGv ea = gen_lea_modrm_1(s, a, false);
tcg_gen_extu_tl_i64(s->tmp1_i64, ea);
if (!CODE64(s)) {
@@ -2425,6 +2429,31 @@ static inline uint32_t insn_get(CPUX86State *env, DisasContext *s, MemOp ot)
return ret;
}
+static target_long insn_get_signed(CPUX86State *env, DisasContext *s, MemOp ot)
+{
+ target_long ret;
+
+ switch (ot) {
+ case MO_8:
+ ret = (int8_t) x86_ldub_code(env, s);
+ break;
+ case MO_16:
+ ret = (int16_t) x86_lduw_code(env, s);
+ break;
+ case MO_32:
+ ret = (int32_t) x86_ldl_code(env, s);
+ break;
+#ifdef TARGET_X86_64
+ case MO_64:
+ ret = x86_ldq_code(env, s);
+ break;
+#endif
+ default:
+ g_assert_not_reached();
+ }
+ return ret;
+}
+
static inline int insn_const_size(MemOp ot)
{
if (ot <= MO_32) {
@@ -2868,1943 +2897,61 @@ static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align)
int mem_index = s->mem_index;
tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
MO_LEUQ | (align ? MO_ALIGN_16 : 0));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0)));
tcg_gen_addi_tl(s->tmp0, s->A0, 8);
tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1)));
}
static inline void gen_sto_env_A0(DisasContext *s, int offset, bool align)
{
int mem_index = s->mem_index;
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0)));
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0))); /* quadword 0 of the env field at offset; the next op stores it at A0 (with MO_ALIGN_16 when align is true) */
tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
MO_LEUQ | (align ? MO_ALIGN_16 : 0));
tcg_gen_addi_tl(s->tmp0, s->A0, 8);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1)));
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1))); /* quadword 1; the next op stores it at A0 + 8 (address held in tmp0) */
tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
}
-static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset)
+static void gen_ldy_env_A0(DisasContext *s, int offset, bool align) /* Load 32 bytes starting at s->A0 into the env field at offset, as four little-endian 64-bit pieces. */
{
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(0)));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(0)));
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(1)));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(1)));
-}
-
-static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset)
-{
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset);
-}
+ int mem_index = s->mem_index;
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
+ MO_LEUQ | (align ? MO_ALIGN_32 : 0)); /* only this first access carries MO_ALIGN_32, and only when align is true */
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0))); /* quadword 0 <- [A0] */
+ tcg_gen_addi_tl(s->tmp0, s->A0, 8); /* tmp0 = A0 + 8; A0 itself is left unchanged */
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1))); /* quadword 1 <- [A0 + 8] */
-static inline void gen_op_movl(DisasContext *s, int d_offset, int s_offset)
-{
- tcg_gen_ld_i32(s->tmp2_i32, cpu_env, s_offset);
- tcg_gen_st_i32(s->tmp2_i32, cpu_env, d_offset);
+ tcg_gen_addi_tl(s->tmp0, s->A0, 16);
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2))); /* quadword 2 <- [A0 + 16] */
+ tcg_gen_addi_tl(s->tmp0, s->A0, 24);
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3))); /* quadword 3 <- [A0 + 24] */
}
-static inline void gen_op_movq_env_0(DisasContext *s, int d_offset)
+static void gen_sty_env_A0(DisasContext *s, int offset, bool align) /* Store the 32-byte env field at offset to memory starting at s->A0, as four little-endian 64-bit pieces. */
{
- tcg_gen_movi_i64(s->tmp1_i64, 0);
- tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset);
+ int mem_index = s->mem_index;
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0))); /* quadword 0 -> [A0] */
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
+ MO_LEUQ | (align ? MO_ALIGN_32 : 0)); /* only this first access carries MO_ALIGN_32, and only when align is true */
+ tcg_gen_addi_tl(s->tmp0, s->A0, 8); /* tmp0 = A0 + 8; A0 itself is left unchanged */
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1))); /* quadword 1 -> [A0 + 8] */
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_addi_tl(s->tmp0, s->A0, 16);
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2))); /* quadword 2 -> [A0 + 16] */
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+ tcg_gen_addi_tl(s->tmp0, s->A0, 24);
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3))); /* quadword 3 -> [A0 + 24] */
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
}
-#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg])
-
-typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
-typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
-typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val);
-typedef void (*SSEFunc_0_epl)(TCGv_ptr env, TCGv_ptr reg, TCGv_i64 val);
-typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
-typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
- TCGv_ptr reg_c);
-typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
- TCGv_i32 val);
-typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
-typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
- TCGv val);
-
-#define SSE_OPF_CMP (1 << 1) /* does not write for first operand */
-#define SSE_OPF_SPECIAL (1 << 3) /* magic */
-#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! instruction */
-#define SSE_OPF_MMX (1 << 5) /* MMX/integer/AVX2 instruction */
-#define SSE_OPF_SCALAR (1 << 6) /* Has SSE scalar variants */
-#define SSE_OPF_SHUF (1 << 9) /* pshufx/shufpx */
-
-#define OP(op, flags, a, b, c, d) \
- {flags, {{.op = a}, {.op = b}, {.op = c}, {.op = d} } }
-
-#define MMX_OP(x) OP(op1, SSE_OPF_MMX, \
- gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL)
-
-#define SSE_FOP(name) OP(op1, SSE_OPF_SCALAR, \
- gen_helper_##name##ps##_xmm, gen_helper_##name##pd##_xmm, \
- gen_helper_##name##ss, gen_helper_##name##sd)
-#define SSE_OP(sname, dname, op, flags) OP(op, flags, \
- gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL)
-
-typedef union SSEFuncs {
- SSEFunc_0_epp op1;
- SSEFunc_0_ppi op1i;
- SSEFunc_0_eppt op1t;
-} SSEFuncs;
-
-struct SSEOpHelper_table1 {
- int flags;
- SSEFuncs fn[4];
-};
-
-#define SSE_3DNOW { SSE_OPF_3DNOW }
-#define SSE_SPECIAL { SSE_OPF_SPECIAL }
-
-static const struct SSEOpHelper_table1 sse_op_table1[256] = {
- /* 3DNow! extensions */
- [0x0e] = SSE_SPECIAL, /* femms */
- [0x0f] = SSE_3DNOW, /* pf... (sse_op_table5) */
- /* pure SSE operations */
- [0x10] = SSE_SPECIAL, /* movups, movupd, movss, movsd */
- [0x11] = SSE_SPECIAL, /* movups, movupd, movss, movsd */
- [0x12] = SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */
- [0x13] = SSE_SPECIAL, /* movlps, movlpd */
- [0x14] = SSE_OP(punpckldq, punpcklqdq, op1, 0), /* unpcklps, unpcklpd */
- [0x15] = SSE_OP(punpckhdq, punpckhqdq, op1, 0), /* unpckhps, unpckhpd */
- [0x16] = SSE_SPECIAL, /* movhps, movhpd, movshdup */
- [0x17] = SSE_SPECIAL, /* movhps, movhpd */
-
- [0x28] = SSE_SPECIAL, /* movaps, movapd */
- [0x29] = SSE_SPECIAL, /* movaps, movapd */
- [0x2a] = SSE_SPECIAL, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
- [0x2b] = SSE_SPECIAL, /* movntps, movntpd, movntss, movntsd */
- [0x2c] = SSE_SPECIAL, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
- [0x2d] = SSE_SPECIAL, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
- [0x2e] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR,
- gen_helper_ucomiss, gen_helper_ucomisd, NULL, NULL),
- [0x2f] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR,
- gen_helper_comiss, gen_helper_comisd, NULL, NULL),
- [0x50] = SSE_SPECIAL, /* movmskps, movmskpd */
- [0x51] = OP(op1, SSE_OPF_SCALAR,
- gen_helper_sqrtps_xmm, gen_helper_sqrtpd_xmm,
- gen_helper_sqrtss, gen_helper_sqrtsd),
- [0x52] = OP(op1, SSE_OPF_SCALAR,
- gen_helper_rsqrtps_xmm, NULL, gen_helper_rsqrtss, NULL),
- [0x53] = OP(op1, SSE_OPF_SCALAR,
- gen_helper_rcpps_xmm, NULL, gen_helper_rcpss, NULL),
- [0x54] = SSE_OP(pand, pand, op1, 0), /* andps, andpd */
- [0x55] = SSE_OP(pandn, pandn, op1, 0), /* andnps, andnpd */
- [0x56] = SSE_OP(por, por, op1, 0), /* orps, orpd */
- [0x57] = SSE_OP(pxor, pxor, op1, 0), /* xorps, xorpd */
- [0x58] = SSE_FOP(add),
- [0x59] = SSE_FOP(mul),
- [0x5a] = OP(op1, SSE_OPF_SCALAR,
- gen_helper_cvtps2pd_xmm, gen_helper_cvtpd2ps_xmm,
- gen_helper_cvtss2sd, gen_helper_cvtsd2ss),
- [0x5b] = OP(op1, 0,
- gen_helper_cvtdq2ps_xmm, gen_helper_cvtps2dq_xmm,
- gen_helper_cvttps2dq_xmm, NULL),
- [0x5c] = SSE_FOP(sub),
- [0x5d] = SSE_FOP(min),
- [0x5e] = SSE_FOP(div),
- [0x5f] = SSE_FOP(max),
-
- [0xc2] = SSE_FOP(cmpeq), /* sse_op_table4 */
- [0xc6] = SSE_OP(shufps, shufpd, op1i, SSE_OPF_SHUF),
-
- /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */
- [0x38] = SSE_SPECIAL,
- [0x3a] = SSE_SPECIAL,
-
- /* MMX ops and their SSE extensions */
- [0x60] = MMX_OP(punpcklbw),
- [0x61] = MMX_OP(punpcklwd),
- [0x62] = MMX_OP(punpckldq),
- [0x63] = MMX_OP(packsswb),
- [0x64] = MMX_OP(pcmpgtb),
- [0x65] = MMX_OP(pcmpgtw),
- [0x66] = MMX_OP(pcmpgtl),
- [0x67] = MMX_OP(packuswb),
- [0x68] = MMX_OP(punpckhbw),
- [0x69] = MMX_OP(punpckhwd),
- [0x6a] = MMX_OP(punpckhdq),
- [0x6b] = MMX_OP(packssdw),
- [0x6c] = OP(op1, SSE_OPF_MMX,
- NULL, gen_helper_punpcklqdq_xmm, NULL, NULL),
- [0x6d] = OP(op1, SSE_OPF_MMX,
- NULL, gen_helper_punpckhqdq_xmm, NULL, NULL),
- [0x6e] = SSE_SPECIAL, /* movd mm, ea */
- [0x6f] = SSE_SPECIAL, /* movq, movdqa, , movqdu */
- [0x70] = OP(op1i, SSE_OPF_SHUF | SSE_OPF_MMX,
- gen_helper_pshufw_mmx, gen_helper_pshufd_xmm,
- gen_helper_pshufhw_xmm, gen_helper_pshuflw_xmm),
- [0x71] = SSE_SPECIAL, /* shiftw */
- [0x72] = SSE_SPECIAL, /* shiftd */
- [0x73] = SSE_SPECIAL, /* shiftq */
- [0x74] = MMX_OP(pcmpeqb),
- [0x75] = MMX_OP(pcmpeqw),
- [0x76] = MMX_OP(pcmpeql),
- [0x77] = SSE_SPECIAL, /* emms */
- [0x78] = SSE_SPECIAL, /* extrq_i, insertq_i (sse4a) */
- [0x79] = OP(op1, 0,
- NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r),
- [0x7c] = OP(op1, 0,
- NULL, gen_helper_haddpd_xmm, NULL, gen_helper_haddps_xmm),
- [0x7d] = OP(op1, 0,
- NULL, gen_helper_hsubpd_xmm, NULL, gen_helper_hsubps_xmm),
- [0x7e] = SSE_SPECIAL, /* movd, movd, , movq */
- [0x7f] = SSE_SPECIAL, /* movq, movdqa, movdqu */
- [0xc4] = SSE_SPECIAL, /* pinsrw */
- [0xc5] = SSE_SPECIAL, /* pextrw */
- [0xd0] = OP(op1, 0,
- NULL, gen_helper_addsubpd_xmm, NULL, gen_helper_addsubps_xmm),
- [0xd1] = MMX_OP(psrlw),
- [0xd2] = MMX_OP(psrld),
- [0xd3] = MMX_OP(psrlq),
- [0xd4] = MMX_OP(paddq),
- [0xd5] = MMX_OP(pmullw),
- [0xd6] = SSE_SPECIAL,
- [0xd7] = SSE_SPECIAL, /* pmovmskb */
- [0xd8] = MMX_OP(psubusb),
- [0xd9] = MMX_OP(psubusw),
- [0xda] = MMX_OP(pminub),
- [0xdb] = MMX_OP(pand),
- [0xdc] = MMX_OP(paddusb),
- [0xdd] = MMX_OP(paddusw),
- [0xde] = MMX_OP(pmaxub),
- [0xdf] = MMX_OP(pandn),
- [0xe0] = MMX_OP(pavgb),
- [0xe1] = MMX_OP(psraw),
- [0xe2] = MMX_OP(psrad),
- [0xe3] = MMX_OP(pavgw),
- [0xe4] = MMX_OP(pmulhuw),
- [0xe5] = MMX_OP(pmulhw),
- [0xe6] = OP(op1, 0,
- NULL, gen_helper_cvttpd2dq_xmm,
- gen_helper_cvtdq2pd_xmm, gen_helper_cvtpd2dq_xmm),
- [0xe7] = SSE_SPECIAL, /* movntq, movntq */
- [0xe8] = MMX_OP(psubsb),
- [0xe9] = MMX_OP(psubsw),
- [0xea] = MMX_OP(pminsw),
- [0xeb] = MMX_OP(por),
- [0xec] = MMX_OP(paddsb),
- [0xed] = MMX_OP(paddsw),
- [0xee] = MMX_OP(pmaxsw),
- [0xef] = MMX_OP(pxor),
- [0xf0] = SSE_SPECIAL, /* lddqu */
- [0xf1] = MMX_OP(psllw),
- [0xf2] = MMX_OP(pslld),
- [0xf3] = MMX_OP(psllq),
- [0xf4] = MMX_OP(pmuludq),
- [0xf5] = MMX_OP(pmaddwd),
- [0xf6] = MMX_OP(psadbw),
- [0xf7] = OP(op1t, SSE_OPF_MMX,
- gen_helper_maskmov_mmx, gen_helper_maskmov_xmm, NULL, NULL),
- [0xf8] = MMX_OP(psubb),
- [0xf9] = MMX_OP(psubw),
- [0xfa] = MMX_OP(psubl),
- [0xfb] = MMX_OP(psubq),
- [0xfc] = MMX_OP(paddb),
- [0xfd] = MMX_OP(paddw),
- [0xfe] = MMX_OP(paddl),
-};
-#undef MMX_OP
-#undef OP
-#undef SSE_FOP
-#undef SSE_OP
-#undef SSE_SPECIAL
-
-#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm }
-
-static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = {
- [0 + 2] = MMX_OP2(psrlw),
- [0 + 4] = MMX_OP2(psraw),
- [0 + 6] = MMX_OP2(psllw),
- [8 + 2] = MMX_OP2(psrld),
- [8 + 4] = MMX_OP2(psrad),
- [8 + 6] = MMX_OP2(pslld),
- [16 + 2] = MMX_OP2(psrlq),
- [16 + 3] = { NULL, gen_helper_psrldq_xmm },
- [16 + 6] = MMX_OP2(psllq),
- [16 + 7] = { NULL, gen_helper_pslldq_xmm },
-};
-
-static const SSEFunc_0_epi sse_op_table3ai[] = {
- gen_helper_cvtsi2ss,
- gen_helper_cvtsi2sd
-};
-
-#ifdef TARGET_X86_64
-static const SSEFunc_0_epl sse_op_table3aq[] = {
- gen_helper_cvtsq2ss,
- gen_helper_cvtsq2sd
-};
-#endif
-
-static const SSEFunc_i_ep sse_op_table3bi[] = {
- gen_helper_cvttss2si,
- gen_helper_cvtss2si,
- gen_helper_cvttsd2si,
- gen_helper_cvtsd2si
-};
-
-#ifdef TARGET_X86_64
-static const SSEFunc_l_ep sse_op_table3bq[] = {
- gen_helper_cvttss2sq,
- gen_helper_cvtss2sq,
- gen_helper_cvttsd2sq,
- gen_helper_cvtsd2sq
-};
-#endif
-
-#define SSE_CMP(x) { \
- gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
- gen_helper_ ## x ## ss, gen_helper_ ## x ## sd}
-static const SSEFunc_0_epp sse_op_table4[8][4] = {
- SSE_CMP(cmpeq),
- SSE_CMP(cmplt),
- SSE_CMP(cmple),
- SSE_CMP(cmpunord),
- SSE_CMP(cmpneq),
- SSE_CMP(cmpnlt),
- SSE_CMP(cmpnle),
- SSE_CMP(cmpord),
-};
-#undef SSE_CMP
-
-static const SSEFunc_0_epp sse_op_table5[256] = {
- [0x0c] = gen_helper_pi2fw,
- [0x0d] = gen_helper_pi2fd,
- [0x1c] = gen_helper_pf2iw,
- [0x1d] = gen_helper_pf2id,
- [0x8a] = gen_helper_pfnacc,
- [0x8e] = gen_helper_pfpnacc,
- [0x90] = gen_helper_pfcmpge,
- [0x94] = gen_helper_pfmin,
- [0x96] = gen_helper_pfrcp,
- [0x97] = gen_helper_pfrsqrt,
- [0x9a] = gen_helper_pfsub,
- [0x9e] = gen_helper_pfadd,
- [0xa0] = gen_helper_pfcmpgt,
- [0xa4] = gen_helper_pfmax,
- [0xa6] = gen_helper_movq, /* pfrcpit1; no need to actually increase precision */
- [0xa7] = gen_helper_movq, /* pfrsqit1 */
- [0xaa] = gen_helper_pfsubr,
- [0xae] = gen_helper_pfacc,
- [0xb0] = gen_helper_pfcmpeq,
- [0xb4] = gen_helper_pfmul,
- [0xb6] = gen_helper_movq, /* pfrcpit2 */
- [0xb7] = gen_helper_pmulhrw_mmx,
- [0xbb] = gen_helper_pswapd,
- [0xbf] = gen_helper_pavgb_mmx,
-};
-
-struct SSEOpHelper_table6 {
- SSEFuncs fn[2];
- uint32_t ext_mask;
- int flags;
-};
-
-struct SSEOpHelper_table7 {
- union {
- SSEFunc_0_eppi op1;
- } fn[2];
- uint32_t ext_mask;
- int flags;
-};
-
-#define gen_helper_special_xmm NULL
-
-#define OP(name, op, flags, ext, mmx_name) \
- {{{.op = mmx_name}, {.op = gen_helper_ ## name ## _xmm} }, \
- CPUID_EXT_ ## ext, flags}
-#define BINARY_OP_MMX(name, ext) \
- OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx)
-#define BINARY_OP(name, ext, flags) \
- OP(name, op1, flags, ext, NULL)
-#define UNARY_OP_MMX(name, ext) \
- OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx)
-#define UNARY_OP(name, ext, flags) \
- OP(name, op1, flags, ext, NULL)
-#define BLENDV_OP(name, ext, flags) OP(name, op1, 0, ext, NULL)
-#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP, ext, NULL)
-#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL)
-
-/* prefix [66] 0f 38 */
-static const struct SSEOpHelper_table6 sse_op_table6[256] = {
- [0x00] = BINARY_OP_MMX(pshufb, SSSE3),
- [0x01] = BINARY_OP_MMX(phaddw, SSSE3),
- [0x02] = BINARY_OP_MMX(phaddd, SSSE3),
- [0x03] = BINARY_OP_MMX(phaddsw, SSSE3),
- [0x04] = BINARY_OP_MMX(pmaddubsw, SSSE3),
- [0x05] = BINARY_OP_MMX(phsubw, SSSE3),
- [0x06] = BINARY_OP_MMX(phsubd, SSSE3),
- [0x07] = BINARY_OP_MMX(phsubsw, SSSE3),
- [0x08] = BINARY_OP_MMX(psignb, SSSE3),
- [0x09] = BINARY_OP_MMX(psignw, SSSE3),
- [0x0a] = BINARY_OP_MMX(psignd, SSSE3),
- [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3),
- [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX),
- [0x14] = BLENDV_OP(blendvps, SSE41, 0),
- [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
- [0x17] = CMP_OP(ptest, SSE41),
- [0x1c] = UNARY_OP_MMX(pabsb, SSSE3),
- [0x1d] = UNARY_OP_MMX(pabsw, SSSE3),
- [0x1e] = UNARY_OP_MMX(pabsd, SSSE3),
- [0x20] = UNARY_OP(pmovsxbw, SSE41, SSE_OPF_MMX),
- [0x21] = UNARY_OP(pmovsxbd, SSE41, SSE_OPF_MMX),
- [0x22] = UNARY_OP(pmovsxbq, SSE41, SSE_OPF_MMX),
- [0x23] = UNARY_OP(pmovsxwd, SSE41, SSE_OPF_MMX),
- [0x24] = UNARY_OP(pmovsxwq, SSE41, SSE_OPF_MMX),
- [0x25] = UNARY_OP(pmovsxdq, SSE41, SSE_OPF_MMX),
- [0x28] = BINARY_OP(pmuldq, SSE41, SSE_OPF_MMX),
- [0x29] = BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX),
- [0x2a] = SPECIAL_OP(SSE41), /* movntdqa */
- [0x2b] = BINARY_OP(packusdw, SSE41, SSE_OPF_MMX),
- [0x30] = UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX),
- [0x31] = UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX),
- [0x32] = UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX),
- [0x33] = UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX),
- [0x34] = UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX),
- [0x35] = UNARY_OP(pmovzxdq, SSE41, SSE_OPF_MMX),
- [0x37] = BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX),
- [0x38] = BINARY_OP(pminsb, SSE41, SSE_OPF_MMX),
- [0x39] = BINARY_OP(pminsd, SSE41, SSE_OPF_MMX),
- [0x3a] = BINARY_OP(pminuw, SSE41, SSE_OPF_MMX),
- [0x3b] = BINARY_OP(pminud, SSE41, SSE_OPF_MMX),
- [0x3c] = BINARY_OP(pmaxsb, SSE41, SSE_OPF_MMX),
- [0x3d] = BINARY_OP(pmaxsd, SSE41, SSE_OPF_MMX),
- [0x3e] = BINARY_OP(pmaxuw, SSE41, SSE_OPF_MMX),
- [0x3f] = BINARY_OP(pmaxud, SSE41, SSE_OPF_MMX),
- [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX),
- [0x41] = UNARY_OP(phminposuw, SSE41, 0),
- [0xdb] = UNARY_OP(aesimc, AES, 0),
- [0xdc] = BINARY_OP(aesenc, AES, 0),
- [0xdd] = BINARY_OP(aesenclast, AES, 0),
- [0xde] = BINARY_OP(aesdec, AES, 0),
- [0xdf] = BINARY_OP(aesdeclast, AES, 0),
-};
-
-/* prefix [66] 0f 3a */
-static const struct SSEOpHelper_table7 sse_op_table7[256] = {
- [0x08] = UNARY_OP(roundps, SSE41, 0),
- [0x09] = UNARY_OP(roundpd, SSE41, 0),
- [0x0a] = UNARY_OP(roundss, SSE41, SSE_OPF_SCALAR),
- [0x0b] = UNARY_OP(roundsd, SSE41, SSE_OPF_SCALAR),
- [0x0c] = BINARY_OP(blendps, SSE41, 0),
- [0x0d] = BINARY_OP(blendpd, SSE41, 0),
- [0x0e] = BINARY_OP(pblendw, SSE41, SSE_OPF_MMX),
- [0x0f] = BINARY_OP_MMX(palignr, SSSE3),
- [0x14] = SPECIAL_OP(SSE41), /* pextrb */
- [0x15] = SPECIAL_OP(SSE41), /* pextrw */
- [0x16] = SPECIAL_OP(SSE41), /* pextrd/pextrq */
- [0x17] = SPECIAL_OP(SSE41), /* extractps */
- [0x20] = SPECIAL_OP(SSE41), /* pinsrb */
- [0x21] = SPECIAL_OP(SSE41), /* insertps */
- [0x22] = SPECIAL_OP(SSE41), /* pinsrd/pinsrq */
- [0x40] = BINARY_OP(dpps, SSE41, 0),
- [0x41] = BINARY_OP(dppd, SSE41, 0),
- [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX),
- [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0),
- [0x60] = CMP_OP(pcmpestrm, SSE42),
- [0x61] = CMP_OP(pcmpestri, SSE42),
- [0x62] = CMP_OP(pcmpistrm, SSE42),
- [0x63] = CMP_OP(pcmpistri, SSE42),
- [0xdf] = UNARY_OP(aeskeygenassist, AES, 0),
-};
-
-#undef OP
-#undef BINARY_OP_MMX
-#undef BINARY_OP
-#undef UNARY_OP_MMX
-#undef UNARY_OP
-#undef BLENDV_OP
-#undef SPECIAL_OP
-
-/* VEX prefix not allowed */
-#define CHECK_NO_VEX(s) do { \
- if (s->prefix & PREFIX_VEX) \
- goto illegal_op; \
- } while (0)
-
-static void gen_sse(CPUX86State *env, DisasContext *s, int b)
-{
- int b1, op1_offset, op2_offset, is_xmm, val;
- int modrm, mod, rm, reg;
- int sse_op_flags;
- SSEFuncs sse_op_fn;
- const struct SSEOpHelper_table6 *op6;
- const struct SSEOpHelper_table7 *op7;
- MemOp ot;
-
- b &= 0xff;
- if (s->prefix & PREFIX_DATA)
- b1 = 1;
- else if (s->prefix & PREFIX_REPZ)
- b1 = 2;
- else if (s->prefix & PREFIX_REPNZ)
- b1 = 3;
- else
- b1 = 0;
- sse_op_flags = sse_op_table1[b].flags;
- sse_op_fn = sse_op_table1[b].fn[b1];
- if ((sse_op_flags & (SSE_OPF_SPECIAL | SSE_OPF_3DNOW)) == 0
- && !sse_op_fn.op1) {
- goto unknown_op;
- }
- if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) {
- is_xmm = 1;
- } else {
- if (b1 == 0) {
- /* MMX case */
- is_xmm = 0;
- } else {
- is_xmm = 1;
- }
- }
- if (sse_op_flags & SSE_OPF_3DNOW) {
- if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
- goto illegal_op;
- }
- }
- /* simple MMX/SSE operation */
- if (s->flags & HF_TS_MASK) {
- gen_exception(s, EXCP07_PREX);
- return;
- }
- if (s->flags & HF_EM_MASK) {
- illegal_op:
- gen_illegal_opcode(s);
- return;
- }
- if (is_xmm
- && !(s->flags & HF_OSFXSR_MASK)
- && (b != 0x38 && b != 0x3a)) {
- goto unknown_op;
- }
- if (b == 0x0e) {
- if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
- /* If we were fully decoding this we might use illegal_op. */
- goto unknown_op;
- }
- /* femms */
- gen_helper_emms(cpu_env);
- return;
- }
- if (b == 0x77) {
- /* emms */
- gen_helper_emms(cpu_env);
- return;
- }
- /* prepare MMX state (XXX: optimize by storing fptt and fptags in
- the static cpu state) */
- if (!is_xmm) {
- gen_helper_enter_mmx(cpu_env);
- }
-
- modrm = x86_ldub_code(env, s);
- reg = ((modrm >> 3) & 7);
- if (is_xmm) {
- reg |= REX_R(s);
- }
- mod = (modrm >> 6) & 3;
- if (sse_op_flags & SSE_OPF_SPECIAL) {
- b |= (b1 << 8);
- switch(b) {
- case 0x0e7: /* movntq */
- CHECK_NO_VEX(s);
- if (mod == 3) {
- goto illegal_op;
- }
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx));
- break;
- case 0x1e7: /* movntdq */
- case 0x02b: /* movntps */
- case 0x12b: /* movntpd */
- if (mod == 3)
- goto illegal_op;
- gen_lea_modrm(env, s, modrm);
- gen_sto_env_A0(s, ZMM_OFFSET(reg), true);
- break;
- case 0x3f0: /* lddqu */
- if (mod == 3)
- goto illegal_op;
- gen_lea_modrm(env, s, modrm);
- gen_ldo_env_A0(s, ZMM_OFFSET(reg), false);
- break;
- case 0x22b: /* movntss */
- case 0x32b: /* movntsd */
- if (mod == 3)
- goto illegal_op;
- gen_lea_modrm(env, s, modrm);
- if (b1 & 1) {
- gen_stq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(0)));
- gen_op_st_v(s, MO_32, s->T0, s->A0);
- }
- break;
- case 0x6e: /* movd mm, ea */
- CHECK_NO_VEX(s);
-#ifdef TARGET_X86_64
- if (s->dflag == MO_64) {
- gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0);
- tcg_gen_st_tl(s->T0, cpu_env,
- offsetof(CPUX86State, fpregs[reg].mmx));
- } else
-#endif
- {
- gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0);
- tcg_gen_addi_ptr(s->ptr0, cpu_env,
- offsetof(CPUX86State,fpregs[reg].mmx));
- tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
- gen_helper_movl_mm_T0_mmx(s->ptr0, s->tmp2_i32);
- }
- break;
- case 0x16e: /* movd xmm, ea */
-#ifdef TARGET_X86_64
- if (s->dflag == MO_64) {
- gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
- gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0);
- } else
-#endif
- {
- gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
- tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
- gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32);
- }
- break;
- case 0x6f: /* movq mm, ea */
- CHECK_NO_VEX(s);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx));
- } else {
- rm = (modrm & 7);
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env,
- offsetof(CPUX86State,fpregs[rm].mmx));
- tcg_gen_st_i64(s->tmp1_i64, cpu_env,
- offsetof(CPUX86State,fpregs[reg].mmx));
- }
- break;
- case 0x010: /* movups */
- case 0x110: /* movupd */
- case 0x028: /* movaps */
- case 0x128: /* movapd */
- case 0x16f: /* movdqa xmm, ea */
- case 0x26f: /* movdqu xmm, ea */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldo_env_A0(s, ZMM_OFFSET(reg),
- /* movaps, movapd, movdqa */
- b == 0x028 || b == 0x128 || b == 0x16f);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm));
- }
- break;
- case 0x210: /* movss xmm, ea */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_op_ld_v(s, MO_32, s->T0, s->A0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)));
- tcg_gen_movi_tl(s->T0, 0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)));
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)));
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- tcg_gen_ld_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0)));
- tcg_gen_st_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)));
- }
- break;
- case 0x310: /* movsd xmm, ea */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- tcg_gen_movi_tl(s->T0, 0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)));
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
- offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)));
- }
- break;
- case 0x012: /* movlps */
- case 0x112: /* movlpd */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- /* movhlps */
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(1)));
- }
- break;
- case 0x212: /* movsldup */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldo_env_A0(s, ZMM_OFFSET(reg), true);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_L(0)));
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_L(2)));
- }
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0)));
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(2)));
- break;
- case 0x312: /* movddup */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0)));
- }
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0)));
- break;
- case 0x016: /* movhps */
- case 0x116: /* movhpd */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(1)));
- } else {
- /* movlhps */
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0)));
- }
- break;
- case 0x216: /* movshdup */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldo_env_A0(s, ZMM_OFFSET(reg), true);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_L(1)));
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_L(3)));
- }
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(1)));
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(3)));
- break;
- case 0x178:
- case 0x378:
- CHECK_NO_VEX(s);
- {
- int bit_index, field_length;
-
- if (b1 == 1 && reg != 0)
- goto illegal_op;
- field_length = x86_ldub_code(env, s) & 0x3F;
- bit_index = x86_ldub_code(env, s) & 0x3F;
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
- if (b1 == 1)
- gen_helper_extrq_i(cpu_env, s->ptr0,
- tcg_const_i32(bit_index),
- tcg_const_i32(field_length));
- else {
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- op2_offset = offsetof(CPUX86State, xmm_t0);
- gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_D(0)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- op2_offset = ZMM_OFFSET(rm);
- }
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- gen_helper_insertq_i(cpu_env, s->ptr0, s->ptr1,
- tcg_const_i32(bit_index),
- tcg_const_i32(field_length));
- }
- }
- break;
- case 0x7e: /* movd ea, mm */
- CHECK_NO_VEX(s);
-#ifdef TARGET_X86_64
- if (s->dflag == MO_64) {
- tcg_gen_ld_i64(s->T0, cpu_env,
- offsetof(CPUX86State,fpregs[reg].mmx));
- gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 1);
- } else
-#endif
- {
- tcg_gen_ld32u_tl(s->T0, cpu_env,
- offsetof(CPUX86State,fpregs[reg].mmx.MMX_L(0)));
- gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 1);
- }
- break;
- case 0x17e: /* movd ea, xmm */
-#ifdef TARGET_X86_64
- if (s->dflag == MO_64) {
- tcg_gen_ld_i64(s->T0, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0)));
- gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 1);
- } else
-#endif
- {
- tcg_gen_ld32u_tl(s->T0, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0)));
- gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 1);
- }
- break;
- case 0x27e: /* movq xmm, ea */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0)));
- }
- gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)));
- break;
- case 0x7f: /* movq ea, mm */
- CHECK_NO_VEX(s);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx));
- } else {
- rm = (modrm & 7);
- gen_op_movq(s, offsetof(CPUX86State, fpregs[rm].mmx),
- offsetof(CPUX86State,fpregs[reg].mmx));
- }
- break;
- case 0x011: /* movups */
- case 0x111: /* movupd */
- case 0x029: /* movaps */
- case 0x129: /* movapd */
- case 0x17f: /* movdqa ea, xmm */
- case 0x27f: /* movdqu ea, xmm */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_sto_env_A0(s, ZMM_OFFSET(reg),
- /* movaps, movapd, movdqa */
- b == 0x029 || b == 0x129 || b == 0x17f);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movo(s, ZMM_OFFSET(rm), ZMM_OFFSET(reg));
- }
- break;
- case 0x211: /* movss ea, xmm */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- tcg_gen_ld32u_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)));
- gen_op_st_v(s, MO_32, s->T0, s->A0);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movl(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0)));
- }
- break;
- case 0x311: /* movsd ea, xmm */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0)));
- }
- break;
- case 0x013: /* movlps */
- case 0x113: /* movlpd */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- goto illegal_op;
- }
- break;
- case 0x017: /* movhps */
- case 0x117: /* movhpd */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(1)));
- } else {
- goto illegal_op;
- }
- break;
- case 0x71: /* shift mm, im */
- case 0x72:
- case 0x73:
- case 0x171: /* shift xmm, im */
- case 0x172:
- case 0x173:
- val = x86_ldub_code(env, s);
- if (is_xmm) {
- tcg_gen_movi_tl(s->T0, val);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
- tcg_gen_movi_tl(s->T0, 0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_t0.ZMM_L(1)));
- op1_offset = offsetof(CPUX86State,xmm_t0);
- } else {
- CHECK_NO_VEX(s);
- tcg_gen_movi_tl(s->T0, val);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, mmx_t0.MMX_L(0)));
- tcg_gen_movi_tl(s->T0, 0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, mmx_t0.MMX_L(1)));
- op1_offset = offsetof(CPUX86State,mmx_t0);
- }
- assert(b1 < 2);
- SSEFunc_0_epp fn = sse_op_table2[((b - 1) & 3) * 8 +
- (((modrm >> 3)) & 7)][b1];
- if (!fn) {
- goto unknown_op;
- }
- if (is_xmm) {
- rm = (modrm & 7) | REX_B(s);
- op2_offset = ZMM_OFFSET(rm);
- } else {
- rm = (modrm & 7);
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op1_offset);
- fn(cpu_env, s->ptr0, s->ptr1);
- break;
- case 0x050: /* movmskps */
- rm = (modrm & 7) | REX_B(s);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm));
- gen_helper_movmskps_xmm(s->tmp2_i32, cpu_env, s->ptr0);
- tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
- break;
- case 0x150: /* movmskpd */
- rm = (modrm & 7) | REX_B(s);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm));
- gen_helper_movmskpd_xmm(s->tmp2_i32, cpu_env, s->ptr0);
- tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
- break;
- case 0x02a: /* cvtpi2ps */
- case 0x12a: /* cvtpi2pd */
- CHECK_NO_VEX(s);
- gen_helper_enter_mmx(cpu_env);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- op2_offset = offsetof(CPUX86State,mmx_t0);
- gen_ldq_env_A0(s, op2_offset);
- } else {
- rm = (modrm & 7);
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
- }
- op1_offset = ZMM_OFFSET(reg);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- switch(b >> 8) {
- case 0x0:
- gen_helper_cvtpi2ps(cpu_env, s->ptr0, s->ptr1);
- break;
- default:
- case 0x1:
- gen_helper_cvtpi2pd(cpu_env, s->ptr0, s->ptr1);
- break;
- }
- break;
- case 0x22a: /* cvtsi2ss */
- case 0x32a: /* cvtsi2sd */
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- op1_offset = ZMM_OFFSET(reg);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- if (ot == MO_32) {
- SSEFunc_0_epi sse_fn_epi = sse_op_table3ai[(b >> 8) & 1];
- tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
- sse_fn_epi(cpu_env, s->ptr0, s->tmp2_i32);
- } else {
-#ifdef TARGET_X86_64
- SSEFunc_0_epl sse_fn_epl = sse_op_table3aq[(b >> 8) & 1];
- sse_fn_epl(cpu_env, s->ptr0, s->T0);
-#else
- goto illegal_op;
-#endif
- }
- break;
- case 0x02c: /* cvttps2pi */
- case 0x12c: /* cvttpd2pi */
- case 0x02d: /* cvtps2pi */
- case 0x12d: /* cvtpd2pi */
- CHECK_NO_VEX(s);
- gen_helper_enter_mmx(cpu_env);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- op2_offset = offsetof(CPUX86State,xmm_t0);
- /* FIXME: should be 64-bit access if b1 == 0. */
- gen_ldo_env_A0(s, op2_offset, !!b1);
- } else {
- rm = (modrm & 7) | REX_B(s);
- op2_offset = ZMM_OFFSET(rm);
- }
- op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- switch(b) {
- case 0x02c:
- gen_helper_cvttps2pi(cpu_env, s->ptr0, s->ptr1);
- break;
- case 0x12c:
- gen_helper_cvttpd2pi(cpu_env, s->ptr0, s->ptr1);
- break;
- case 0x02d:
- gen_helper_cvtps2pi(cpu_env, s->ptr0, s->ptr1);
- break;
- case 0x12d:
- gen_helper_cvtpd2pi(cpu_env, s->ptr0, s->ptr1);
- break;
- }
- break;
- case 0x22c: /* cvttss2si */
- case 0x32c: /* cvttsd2si */
- case 0x22d: /* cvtss2si */
- case 0x32d: /* cvtsd2si */
- ot = mo_64_32(s->dflag);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- if ((b >> 8) & 1) {
- gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_Q(0)));
- } else {
- gen_op_ld_v(s, MO_32, s->T0, s->A0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
- }
- op2_offset = offsetof(CPUX86State,xmm_t0);
- } else {
- rm = (modrm & 7) | REX_B(s);
- op2_offset = ZMM_OFFSET(rm);
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset);
- if (ot == MO_32) {
- SSEFunc_i_ep sse_fn_i_ep =
- sse_op_table3bi[((b >> 7) & 2) | (b & 1)];
- sse_fn_i_ep(s->tmp2_i32, cpu_env, s->ptr0);
- tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
- } else {
-#ifdef TARGET_X86_64
- SSEFunc_l_ep sse_fn_l_ep =
- sse_op_table3bq[((b >> 7) & 2) | (b & 1)];
- sse_fn_l_ep(s->T0, cpu_env, s->ptr0);
-#else
- goto illegal_op;
-#endif
- }
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- break;
- case 0xc4: /* pinsrw */
- case 0x1c4:
- s->rip_offset = 1;
- gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0);
- val = x86_ldub_code(env, s);
- if (b1) {
- val &= 7;
- tcg_gen_st16_tl(s->T0, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg].ZMM_W(val)));
- } else {
- CHECK_NO_VEX(s);
- val &= 3;
- tcg_gen_st16_tl(s->T0, cpu_env,
- offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val)));
- }
- break;
- case 0xc5: /* pextrw */
- case 0x1c5:
- if (mod != 3)
- goto illegal_op;
- ot = mo_64_32(s->dflag);
- val = x86_ldub_code(env, s);
- if (b1) {
- val &= 7;
- rm = (modrm & 7) | REX_B(s);
- tcg_gen_ld16u_tl(s->T0, cpu_env,
- offsetof(CPUX86State,xmm_regs[rm].ZMM_W(val)));
- } else {
- val &= 3;
- rm = (modrm & 7);
- tcg_gen_ld16u_tl(s->T0, cpu_env,
- offsetof(CPUX86State,fpregs[rm].mmx.MMX_W(val)));
- }
- reg = ((modrm >> 3) & 7) | REX_R(s);
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- break;
- case 0x1d6: /* movq ea, xmm */
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- gen_stq_env_A0(s, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(0)));
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)),
- offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0)));
- gen_op_movq_env_0(s,
- offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(1)));
- }
- break;
- case 0x2d6: /* movq2dq */
- CHECK_NO_VEX(s);
- gen_helper_enter_mmx(cpu_env);
- rm = (modrm & 7);
- gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
- offsetof(CPUX86State,fpregs[rm].mmx));
- gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)));
- break;
- case 0x3d6: /* movdq2q */
- CHECK_NO_VEX(s);
- gen_helper_enter_mmx(cpu_env);
- rm = (modrm & 7) | REX_B(s);
- gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx),
- offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0)));
- break;
- case 0xd7: /* pmovmskb */
- case 0x1d7:
- if (mod != 3)
- goto illegal_op;
- if (b1) {
- rm = (modrm & 7) | REX_B(s);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm));
- gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0);
- } else {
- CHECK_NO_VEX(s);
- rm = (modrm & 7);
- tcg_gen_addi_ptr(s->ptr0, cpu_env,
- offsetof(CPUX86State, fpregs[rm].mmx));
- gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr0);
- }
- reg = ((modrm >> 3) & 7) | REX_R(s);
- tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
- break;
-
- case 0x138:
- case 0x038:
- b = modrm;
- if ((b & 0xf0) == 0xf0) {
- goto do_0f_38_fx;
- }
- modrm = x86_ldub_code(env, s);
- rm = modrm & 7;
- reg = ((modrm >> 3) & 7) | REX_R(s);
- mod = (modrm >> 6) & 3;
-
- assert(b1 < 2);
- op6 = &sse_op_table6[b];
- if (op6->ext_mask == 0) {
- goto unknown_op;
- }
- if (!(s->cpuid_ext_features & op6->ext_mask)) {
- goto illegal_op;
- }
-
- if (b1) {
- op1_offset = ZMM_OFFSET(reg);
- if (mod == 3) {
- op2_offset = ZMM_OFFSET(rm | REX_B(s));
- } else {
- op2_offset = offsetof(CPUX86State,xmm_t0);
- gen_lea_modrm(env, s, modrm);
- switch (b) {
- case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */
- case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */
- case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */
- gen_ldq_env_A0(s, op2_offset +
- offsetof(ZMMReg, ZMM_Q(0)));
- break;
- case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */
- case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */
- tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0,
- s->mem_index, MO_LEUL);
- tcg_gen_st_i32(s->tmp2_i32, cpu_env, op2_offset +
- offsetof(ZMMReg, ZMM_L(0)));
- break;
- case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
- tcg_gen_qemu_ld_tl(s->tmp0, s->A0,
- s->mem_index, MO_LEUW);
- tcg_gen_st16_tl(s->tmp0, cpu_env, op2_offset +
- offsetof(ZMMReg, ZMM_W(0)));
- break;
- case 0x2a: /* movntdqa */
- gen_ldo_env_A0(s, op1_offset, true);
- return;
- default:
- gen_ldo_env_A0(s, op2_offset, true);
- }
- }
- if (!op6->fn[b1].op1) {
- goto illegal_op;
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- op6->fn[b1].op1(cpu_env, s->ptr0, s->ptr1);
- } else {
- CHECK_NO_VEX(s);
- if ((op6->flags & SSE_OPF_MMX) == 0) {
- goto unknown_op;
- }
- op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
- if (mod == 3) {
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
- } else {
- op2_offset = offsetof(CPUX86State,mmx_t0);
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, op2_offset);
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- op6->fn[0].op1(cpu_env, s->ptr0, s->ptr1);
- }
-
- if (op6->flags & SSE_OPF_CMP) {
- set_cc_op(s, CC_OP_EFLAGS);
- }
- break;
-
- case 0x238:
- case 0x338:
- do_0f_38_fx:
- /* Various integer extensions at 0f 38 f[0-f]. */
- b = modrm | (b1 << 8);
- modrm = x86_ldub_code(env, s);
- reg = ((modrm >> 3) & 7) | REX_R(s);
-
- switch (b) {
- case 0x3f0: /* crc32 Gd,Eb */
- case 0x3f1: /* crc32 Gd,Ey */
- do_crc32:
- CHECK_NO_VEX(s);
- if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) {
- goto illegal_op;
- }
- if ((b & 0xff) == 0xf0) {
- ot = MO_8;
- } else if (s->dflag != MO_64) {
- ot = (s->prefix & PREFIX_DATA ? MO_16 : MO_32);
- } else {
- ot = MO_64;
- }
-
- tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[reg]);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- gen_helper_crc32(s->T0, s->tmp2_i32,
- s->T0, tcg_const_i32(8 << ot));
-
- ot = mo_64_32(s->dflag);
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- break;
-
- case 0x1f0: /* crc32 or movbe */
- case 0x1f1:
- CHECK_NO_VEX(s);
- /* For these insns, the f3 prefix is supposed to have priority
- over the 66 prefix, but that's not what we implement above
- setting b1. */
- if (s->prefix & PREFIX_REPNZ) {
- goto do_crc32;
- }
- /* FALLTHRU */
- case 0x0f0: /* movbe Gy,My */
- case 0x0f1: /* movbe My,Gy */
- CHECK_NO_VEX(s);
- if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) {
- goto illegal_op;
- }
- if (s->dflag != MO_64) {
- ot = (s->prefix & PREFIX_DATA ? MO_16 : MO_32);
- } else {
- ot = MO_64;
- }
-
- gen_lea_modrm(env, s, modrm);
- if ((b & 1) == 0) {
- tcg_gen_qemu_ld_tl(s->T0, s->A0,
- s->mem_index, ot | MO_BE);
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- } else {
- tcg_gen_qemu_st_tl(cpu_regs[reg], s->A0,
- s->mem_index, ot | MO_BE);
- }
- break;
-
- case 0x0f2: /* andn Gy, By, Ey */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- tcg_gen_andc_tl(s->T0, s->T0, cpu_regs[s->vex_v]);
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- gen_op_update1_cc(s);
- set_cc_op(s, CC_OP_LOGICB + ot);
- break;
-
- case 0x0f7: /* bextr Gy, Ey, By */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- {
- TCGv bound, zero;
-
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- /* Extract START, and shift the operand.
- Shifts larger than operand size get zeros. */
- tcg_gen_ext8u_tl(s->A0, cpu_regs[s->vex_v]);
- tcg_gen_shr_tl(s->T0, s->T0, s->A0);
-
- bound = tcg_const_tl(ot == MO_64 ? 63 : 31);
- zero = tcg_const_tl(0);
- tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound,
- s->T0, zero);
- tcg_temp_free(zero);
-
- /* Extract the LEN into a mask. Lengths larger than
- operand size get all ones. */
- tcg_gen_extract_tl(s->A0, cpu_regs[s->vex_v], 8, 8);
- tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->A0, bound,
- s->A0, bound);
- tcg_temp_free(bound);
- tcg_gen_movi_tl(s->T1, 1);
- tcg_gen_shl_tl(s->T1, s->T1, s->A0);
- tcg_gen_subi_tl(s->T1, s->T1, 1);
- tcg_gen_and_tl(s->T0, s->T0, s->T1);
-
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- gen_op_update1_cc(s);
- set_cc_op(s, CC_OP_LOGICB + ot);
- }
- break;
-
- case 0x0f5: /* bzhi Gy, Ey, By */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- tcg_gen_ext8u_tl(s->T1, cpu_regs[s->vex_v]);
- {
- TCGv bound = tcg_const_tl(ot == MO_64 ? 63 : 31);
- /* Note that since we're using BMILG (in order to get O
- cleared) we need to store the inverse into C. */
- tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
- s->T1, bound);
- tcg_gen_movcond_tl(TCG_COND_GT, s->T1, s->T1,
- bound, bound, s->T1);
- tcg_temp_free(bound);
- }
- tcg_gen_movi_tl(s->A0, -1);
- tcg_gen_shl_tl(s->A0, s->A0, s->T1);
- tcg_gen_andc_tl(s->T0, s->T0, s->A0);
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- gen_op_update1_cc(s);
- set_cc_op(s, CC_OP_BMILGB + ot);
- break;
-
- case 0x3f6: /* mulx By, Gy, rdx, Ey */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- switch (ot) {
- default:
- tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
- tcg_gen_trunc_tl_i32(s->tmp3_i32, cpu_regs[R_EDX]);
- tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32,
- s->tmp2_i32, s->tmp3_i32);
- tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32);
- tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp3_i32);
- break;
-#ifdef TARGET_X86_64
- case MO_64:
- tcg_gen_mulu2_i64(s->T0, s->T1,
- s->T0, cpu_regs[R_EDX]);
- tcg_gen_mov_i64(cpu_regs[s->vex_v], s->T0);
- tcg_gen_mov_i64(cpu_regs[reg], s->T1);
- break;
-#endif
- }
- break;
-
- case 0x3f5: /* pdep Gy, By, Ey */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- /* Note that by zero-extending the source operand, we
- automatically handle zero-extending the result. */
- if (ot == MO_64) {
- tcg_gen_mov_tl(s->T1, cpu_regs[s->vex_v]);
- } else {
- tcg_gen_ext32u_tl(s->T1, cpu_regs[s->vex_v]);
- }
- gen_helper_pdep(cpu_regs[reg], s->T1, s->T0);
- break;
-
- case 0x2f5: /* pext Gy, By, Ey */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- /* Note that by zero-extending the source operand, we
- automatically handle zero-extending the result. */
- if (ot == MO_64) {
- tcg_gen_mov_tl(s->T1, cpu_regs[s->vex_v]);
- } else {
- tcg_gen_ext32u_tl(s->T1, cpu_regs[s->vex_v]);
- }
- gen_helper_pext(cpu_regs[reg], s->T1, s->T0);
- break;
-
- case 0x1f6: /* adcx Gy, Ey */
- case 0x2f6: /* adox Gy, Ey */
- CHECK_NO_VEX(s);
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
- goto illegal_op;
- } else {
- TCGv carry_in, carry_out, zero;
- int end_op;
-
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
-
- /* Re-use the carry-out from a previous round. */
- carry_in = NULL;
- carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
- switch (s->cc_op) {
- case CC_OP_ADCX:
- if (b == 0x1f6) {
- carry_in = cpu_cc_dst;
- end_op = CC_OP_ADCX;
- } else {
- end_op = CC_OP_ADCOX;
- }
- break;
- case CC_OP_ADOX:
- if (b == 0x1f6) {
- end_op = CC_OP_ADCOX;
- } else {
- carry_in = cpu_cc_src2;
- end_op = CC_OP_ADOX;
- }
- break;
- case CC_OP_ADCOX:
- end_op = CC_OP_ADCOX;
- carry_in = carry_out;
- break;
- default:
- end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
- break;
- }
- /* If we can't reuse carry-out, get it out of EFLAGS. */
- if (!carry_in) {
- if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
- gen_compute_eflags(s);
- }
- carry_in = s->tmp0;
- tcg_gen_extract_tl(carry_in, cpu_cc_src,
- ctz32(b == 0x1f6 ? CC_C : CC_O), 1);
- }
-
- switch (ot) {
-#ifdef TARGET_X86_64
- case MO_32:
- /* If we know TL is 64-bit, and we want a 32-bit
- result, just do everything in 64-bit arithmetic. */
- tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
- tcg_gen_ext32u_i64(s->T0, s->T0);
- tcg_gen_add_i64(s->T0, s->T0, cpu_regs[reg]);
- tcg_gen_add_i64(s->T0, s->T0, carry_in);
- tcg_gen_ext32u_i64(cpu_regs[reg], s->T0);
- tcg_gen_shri_i64(carry_out, s->T0, 32);
- break;
-#endif
- default:
- /* Otherwise compute the carry-out in two steps. */
- zero = tcg_const_tl(0);
- tcg_gen_add2_tl(s->T0, carry_out,
- s->T0, zero,
- carry_in, zero);
- tcg_gen_add2_tl(cpu_regs[reg], carry_out,
- cpu_regs[reg], carry_out,
- s->T0, zero);
- tcg_temp_free(zero);
- break;
- }
- set_cc_op(s, end_op);
- }
- break;
-
- case 0x1f7: /* shlx Gy, Ey, By */
- case 0x2f7: /* sarx Gy, Ey, By */
- case 0x3f7: /* shrx Gy, Ey, By */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- if (ot == MO_64) {
- tcg_gen_andi_tl(s->T1, cpu_regs[s->vex_v], 63);
- } else {
- tcg_gen_andi_tl(s->T1, cpu_regs[s->vex_v], 31);
- }
- if (b == 0x1f7) {
- tcg_gen_shl_tl(s->T0, s->T0, s->T1);
- } else if (b == 0x2f7) {
- if (ot != MO_64) {
- tcg_gen_ext32s_tl(s->T0, s->T0);
- }
- tcg_gen_sar_tl(s->T0, s->T0, s->T1);
- } else {
- if (ot != MO_64) {
- tcg_gen_ext32u_tl(s->T0, s->T0);
- }
- tcg_gen_shr_tl(s->T0, s->T0, s->T1);
- }
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- break;
-
- case 0x0f3:
- case 0x1f3:
- case 0x2f3:
- case 0x3f3: /* Group 17 */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
-
- tcg_gen_mov_tl(cpu_cc_src, s->T0);
- switch (reg & 7) {
- case 1: /* blsr By,Ey */
- tcg_gen_subi_tl(s->T1, s->T0, 1);
- tcg_gen_and_tl(s->T0, s->T0, s->T1);
- break;
- case 2: /* blsmsk By,Ey */
- tcg_gen_subi_tl(s->T1, s->T0, 1);
- tcg_gen_xor_tl(s->T0, s->T0, s->T1);
- break;
- case 3: /* blsi By, Ey */
- tcg_gen_neg_tl(s->T1, s->T0);
- tcg_gen_and_tl(s->T0, s->T0, s->T1);
- break;
- default:
- goto unknown_op;
- }
- tcg_gen_mov_tl(cpu_cc_dst, s->T0);
- gen_op_mov_reg_v(s, ot, s->vex_v, s->T0);
- set_cc_op(s, CC_OP_BMILGB + ot);
- break;
-
- default:
- goto unknown_op;
- }
- break;
-
- case 0x03a:
- case 0x13a:
- b = modrm;
- modrm = x86_ldub_code(env, s);
- rm = modrm & 7;
- reg = ((modrm >> 3) & 7) | REX_R(s);
- mod = (modrm >> 6) & 3;
-
- assert(b1 < 2);
- op7 = &sse_op_table7[b];
- if (op7->ext_mask == 0) {
- goto unknown_op;
- }
- if (!(s->cpuid_ext_features & op7->ext_mask)) {
- goto illegal_op;
- }
-
- s->rip_offset = 1;
-
- if (op7->flags & SSE_OPF_SPECIAL) {
- /* None of the "special" ops are valid on mmx registers */
- if (b1 == 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- rm = (modrm & 7) | REX_B(s);
- if (mod != 3)
- gen_lea_modrm(env, s, modrm);
- reg = ((modrm >> 3) & 7) | REX_R(s);
- val = x86_ldub_code(env, s);
- switch (b) {
- case 0x14: /* pextrb */
- tcg_gen_ld8u_tl(s->T0, cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_B(val & 15)));
- if (mod == 3) {
- gen_op_mov_reg_v(s, ot, rm, s->T0);
- } else {
- tcg_gen_qemu_st_tl(s->T0, s->A0,
- s->mem_index, MO_UB);
- }
- break;
- case 0x15: /* pextrw */
- tcg_gen_ld16u_tl(s->T0, cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_W(val & 7)));
- if (mod == 3) {
- gen_op_mov_reg_v(s, ot, rm, s->T0);
- } else {
- tcg_gen_qemu_st_tl(s->T0, s->A0,
- s->mem_index, MO_LEUW);
- }
- break;
- case 0x16:
- if (ot == MO_32) { /* pextrd */
- tcg_gen_ld_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(val & 3)));
- if (mod == 3) {
- tcg_gen_extu_i32_tl(cpu_regs[rm], s->tmp2_i32);
- } else {
- tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0,
- s->mem_index, MO_LEUL);
- }
- } else { /* pextrq */
-#ifdef TARGET_X86_64
- tcg_gen_ld_i64(s->tmp1_i64, cpu_env,
- offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(val & 1)));
- if (mod == 3) {
- tcg_gen_mov_i64(cpu_regs[rm], s->tmp1_i64);
- } else {
- tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0,
- s->mem_index, MO_LEUQ);
- }
-#else
- goto illegal_op;
-#endif
- }
- break;
- case 0x17: /* extractps */
- tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(val & 3)));
- if (mod == 3) {
- gen_op_mov_reg_v(s, ot, rm, s->T0);
- } else {
- tcg_gen_qemu_st_tl(s->T0, s->A0,
- s->mem_index, MO_LEUL);
- }
- break;
- case 0x20: /* pinsrb */
- if (mod == 3) {
- gen_op_mov_v_reg(s, MO_32, s->T0, rm);
- } else {
- tcg_gen_qemu_ld_tl(s->T0, s->A0,
- s->mem_index, MO_UB);
- }
- tcg_gen_st8_tl(s->T0, cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_B(val & 15)));
- break;
- case 0x21: /* insertps */
- if (mod == 3) {
- tcg_gen_ld_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State,xmm_regs[rm]
- .ZMM_L((val >> 6) & 3)));
- } else {
- tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0,
- s->mem_index, MO_LEUL);
- }
- tcg_gen_st_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg]
- .ZMM_L((val >> 4) & 3)));
- if ((val >> 0) & 1)
- tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
- cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(0)));
- if ((val >> 1) & 1)
- tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
- cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(1)));
- if ((val >> 2) & 1)
- tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
- cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(2)));
- if ((val >> 3) & 1)
- tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
- cpu_env, offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(3)));
- break;
- case 0x22:
- if (ot == MO_32) { /* pinsrd */
- if (mod == 3) {
- tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[rm]);
- } else {
- tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0,
- s->mem_index, MO_LEUL);
- }
- tcg_gen_st_i32(s->tmp2_i32, cpu_env,
- offsetof(CPUX86State,
- xmm_regs[reg].ZMM_L(val & 3)));
- } else { /* pinsrq */
-#ifdef TARGET_X86_64
- if (mod == 3) {
- gen_op_mov_v_reg(s, ot, s->tmp1_i64, rm);
- } else {
- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0,
- s->mem_index, MO_LEUQ);
- }
- tcg_gen_st_i64(s->tmp1_i64, cpu_env,
- offsetof(CPUX86State,
- xmm_regs[reg].ZMM_Q(val & 1)));
-#else
- goto illegal_op;
-#endif
- }
- break;
- }
- return;
- }
-
- if (b1 == 0) {
- CHECK_NO_VEX(s);
- /* MMX */
- if ((op7->flags & SSE_OPF_MMX) == 0) {
- goto illegal_op;
- }
- op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
- if (mod == 3) {
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
- } else {
- op2_offset = offsetof(CPUX86State,mmx_t0);
- gen_lea_modrm(env, s, modrm);
- gen_ldq_env_A0(s, op2_offset);
- }
- val = x86_ldub_code(env, s);
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-
- /* We only actually have one MMX instuction (palignr) */
- assert(b == 0x0f);
-
- op7->fn[0].op1(cpu_env, s->ptr0, s->ptr1,
- tcg_const_i32(val));
- break;
- }
-
- /* SSE */
- op1_offset = ZMM_OFFSET(reg);
- if (mod == 3) {
- op2_offset = ZMM_OFFSET(rm | REX_B(s));
- } else {
- op2_offset = offsetof(CPUX86State, xmm_t0);
- gen_lea_modrm(env, s, modrm);
- gen_ldo_env_A0(s, op2_offset, true);
- }
-
- val = x86_ldub_code(env, s);
- if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
- set_cc_op(s, CC_OP_EFLAGS);
-
- if (s->dflag == MO_64) {
- /* The helper must use entire 64-bit gp registers */
- val |= 1 << 8;
- }
- }
-
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- op7->fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val));
- if (op7->flags & SSE_OPF_CMP) {
- set_cc_op(s, CC_OP_EFLAGS);
- }
- break;
-
- case 0x33a:
- /* Various integer extensions at 0f 3a f[0-f]. */
- b = modrm | (b1 << 8);
- modrm = x86_ldub_code(env, s);
- reg = ((modrm >> 3) & 7) | REX_R(s);
-
- switch (b) {
- case 0x3f0: /* rorx Gy,Ey, Ib */
- if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
- || !(s->prefix & PREFIX_VEX)
- || s->vex_l != 0) {
- goto illegal_op;
- }
- ot = mo_64_32(s->dflag);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- b = x86_ldub_code(env, s);
- if (ot == MO_64) {
- tcg_gen_rotri_tl(s->T0, s->T0, b & 63);
- } else {
- tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
- tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b & 31);
- tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
- }
- gen_op_mov_reg_v(s, ot, reg, s->T0);
- break;
-
- default:
- goto unknown_op;
- }
- break;
-
- default:
- unknown_op:
- gen_unknown_opcode(env, s);
- return;
- }
- } else {
- /* generic MMX or SSE operation */
- switch(b) {
- case 0x70: /* pshufx insn */
- case 0xc6: /* pshufx insn */
- case 0xc2: /* compare insns */
- s->rip_offset = 1;
- break;
- default:
- break;
- }
- if (is_xmm) {
- op1_offset = ZMM_OFFSET(reg);
- if (mod != 3) {
- int sz = 4;
-
- gen_lea_modrm(env, s, modrm);
- op2_offset = offsetof(CPUX86State, xmm_t0);
-
- if (sse_op_flags & SSE_OPF_SCALAR) {
- if (sse_op_flags & SSE_OPF_CMP) {
- /* ucomis[sd], comis[sd] */
- if (b1 == 0) {
- sz = 2;
- } else {
- sz = 3;
- }
- } else {
- /* Most sse scalar operations. */
- if (b1 == 2) {
- sz = 2;
- } else if (b1 == 3) {
- sz = 3;
- }
- }
- }
-
- switch (sz) {
- case 2:
- /* 32 bit access */
- gen_op_ld_v(s, MO_32, s->T0, s->A0);
- tcg_gen_st32_tl(s->T0, cpu_env,
- offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
- break;
- case 3:
- /* 64 bit access */
- gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_D(0)));
- break;
- default:
- /* 128 bit access */
- gen_ldo_env_A0(s, op2_offset, true);
- break;
- }
- } else {
- rm = (modrm & 7) | REX_B(s);
- op2_offset = ZMM_OFFSET(rm);
- }
- } else {
- CHECK_NO_VEX(s);
- op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm);
- op2_offset = offsetof(CPUX86State,mmx_t0);
- gen_ldq_env_A0(s, op2_offset);
- } else {
- rm = (modrm & 7);
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
- }
- if (sse_op_flags & SSE_OPF_3DNOW) {
- /* 3DNow! data insns */
- val = x86_ldub_code(env, s);
- SSEFunc_0_epp op_3dnow = sse_op_table5[val];
- if (!op_3dnow) {
- goto unknown_op;
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- op_3dnow(cpu_env, s->ptr0, s->ptr1);
- return;
- }
- }
- tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
- if (sse_op_flags & SSE_OPF_SHUF) {
- val = x86_ldub_code(env, s);
- sse_op_fn.op1i(s->ptr0, s->ptr1, tcg_const_i32(val));
- } else if (b == 0xf7) {
- /* maskmov : we must prepare A0 */
- if (mod != 3) {
- goto illegal_op;
- }
- tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]);
- gen_extu(s->aflag, s->A0);
- gen_add_A0_ds_seg(s);
- sse_op_fn.op1t(cpu_env, s->ptr0, s->ptr1, s->A0);
- } else if (b == 0xc2) {
- /* compare insns, bits 7:3 (7:5 for AVX) are ignored */
- val = x86_ldub_code(env, s) & 7;
- sse_op_table4[val][b1](cpu_env, s->ptr0, s->ptr1);
- } else {
- sse_op_fn.op1(cpu_env, s->ptr0, s->ptr1);
- }
-
- if (sse_op_flags & SSE_OPF_CMP) {
- set_cc_op(s, CC_OP_EFLAGS);
- }
- }
-}
+#include "decode-new.h"
+#include "emit.c.inc"
+#include "decode-new.c.inc"
/* convert one instruction. s->base.is_jmp is set if the translation must
be stopped. Return the next pc value */
@@ -4817,11 +2964,11 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
int modrm, reg, rm, mod, op, opreg, val;
bool orig_cc_op_dirty = s->cc_op_dirty;
CCOp orig_cc_op = s->cc_op;
+ target_ulong orig_pc_save = s->pc_save;
s->pc = s->base.pc_next;
s->override = -1;
#ifdef TARGET_X86_64
- s->rex_w = false;
s->rex_r = 0;
s->rex_x = 0;
s->rex_b = 0;
@@ -4829,6 +2976,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
s->rip_offset = 0; /* for relative ip address */
s->vex_l = 0;
s->vex_v = 0;
+ s->vex_w = false;
switch (sigsetjmp(s->jmpbuf, 0)) {
case 0:
break;
@@ -4838,8 +2986,15 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
case 2:
/* Restore state that may affect the next instruction. */
s->pc = s->base.pc_next;
+ /*
+ * TODO: These save/restore can be removed after the table-based
+ * decoder is complete; we will be decoding the insn completely
+ * before any code generation that might affect these variables.
+ */
s->cc_op_dirty = orig_cc_op_dirty;
s->cc_op = orig_cc_op;
+ s->pc_save = orig_pc_save;
+ /* END TODO */
s->base.num_insns--;
tcg_remove_ops_after(s->prev_insn_end);
s->base.is_jmp = DISAS_TOO_MANY;
@@ -4851,9 +3006,15 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
prefixes = 0;
next_byte:
+ s->prefix = prefixes;
b = x86_ldub_code(env, s);
/* Collect prefixes. */
switch (b) {
+ default:
+ break;
+ case 0x0f:
+ b = x86_ldub_code(env, s) + 0x100;
+ break;
case 0xf3:
prefixes |= PREFIX_REPZ;
prefixes &= ~PREFIX_REPNZ;
@@ -4894,7 +3055,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
if (CODE64(s)) {
/* REX prefix */
prefixes |= PREFIX_REX;
- s->rex_w = (b >> 3) & 1;
+ s->vex_w = (b >> 3) & 1;
s->rex_r = (b & 0x4) << 1;
s->rex_x = (b & 0x2) << 2;
s->rex_b = (b & 0x1) << 3;
@@ -4904,58 +3065,17 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
#endif
case 0xc5: /* 2-byte VEX */
case 0xc4: /* 3-byte VEX */
- /* VEX prefixes cannot be used except in 32-bit mode.
- Otherwise the instruction is LES or LDS. */
if (CODE32(s) && !VM86(s)) {
- static const int pp_prefix[4] = {
- 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
- };
- int vex3, vex2 = x86_ldub_code(env, s);
+ int vex2 = x86_ldub_code(env, s);
+ s->pc--; /* rewind the advance_pc() x86_ldub_code() did */
if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
/* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
otherwise the instruction is LES or LDS. */
- s->pc--; /* rewind the advance_pc() x86_ldub_code() did */
break;
}
-
- /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
- if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
- | PREFIX_LOCK | PREFIX_DATA | PREFIX_REX)) {
- goto illegal_op;
- }
-#ifdef TARGET_X86_64
- s->rex_r = (~vex2 >> 4) & 8;
-#endif
- if (b == 0xc5) {
- /* 2-byte VEX prefix: RVVVVlpp, implied 0f leading opcode byte */
- vex3 = vex2;
- b = x86_ldub_code(env, s) | 0x100;
- } else {
- /* 3-byte VEX prefix: RXBmmmmm wVVVVlpp */
- vex3 = x86_ldub_code(env, s);
-#ifdef TARGET_X86_64
- s->rex_x = (~vex2 >> 3) & 8;
- s->rex_b = (~vex2 >> 2) & 8;
- s->rex_w = (vex3 >> 7) & 1;
-#endif
- switch (vex2 & 0x1f) {
- case 0x01: /* Implied 0f leading opcode bytes. */
- b = x86_ldub_code(env, s) | 0x100;
- break;
- case 0x02: /* Implied 0f 38 leading opcode bytes. */
- b = 0x138;
- break;
- case 0x03: /* Implied 0f 3a leading opcode bytes. */
- b = 0x13a;
- break;
- default: /* Reserved for future use. */
- goto unknown_op;
- }
- }
- s->vex_v = (~vex3 >> 3) & 0xf;
- s->vex_l = (vex3 >> 2) & 1;
- prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX;
+ disas_insn_new(s, cpu, b);
+ return s->pc;
}
break;
}
@@ -4988,14 +3108,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
s->dflag = dflag;
/* now check op code */
- reswitch:
- switch(b) {
- case 0x0f:
- /**************************/
- /* extended op code */
- b = x86_ldub_code(env, s) | 0x100;
- goto reswitch;
-
+ switch (b) {
/**************************/
/* arith & logic */
case 0x00 ... 0x05:
@@ -5931,7 +4044,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
reg = ((modrm >> 3) & 7) | REX_R(s);
{
AddressParts a = gen_lea_modrm_0(env, s, modrm);
- TCGv ea = gen_lea_modrm_1(s, a);
+ TCGv ea = gen_lea_modrm_1(s, a, false);
gen_lea_v_seg(s, s->aflag, ea, -1, -1);
gen_op_mov_reg_v(s, dflag, reg, s->A0);
}
@@ -6154,7 +4267,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
if (mod != 3) {
/* memory op */
AddressParts a = gen_lea_modrm_0(env, s, modrm);
- TCGv ea = gen_lea_modrm_1(s, a);
+ TCGv ea = gen_lea_modrm_1(s, a, false);
TCGv last_addr = tcg_temp_new();
bool update_fdp = true;
@@ -7149,7 +5262,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
gen_exts(ot, s->T1);
tcg_gen_sari_tl(s->tmp0, s->T1, 3 + ot);
tcg_gen_shli_tl(s->tmp0, s->tmp0, ot);
- tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a), s->tmp0);
+ tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a, false), s->tmp0);
gen_lea_v_seg(s, s->aflag, s->A0, a.def_seg, s->override);
if (!(s->prefix & PREFIX_LOCK)) {
gen_op_ld_v(s, ot, s->T0, s->A0);
@@ -8198,7 +6311,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
/* rip-relative generates #ud */
goto illegal_op;
}
- tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, a));
+ tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, a, false));
if (!CODE64(s)) {
tcg_gen_ext32u_tl(s->A0, s->A0);
}
@@ -8622,11 +6735,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
set_cc_op(s, CC_OP_POPCNT);
break;
- case 0x10e ... 0x10f:
- /* 3DNow! instructions, ignore prefixes */
- s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA);
- /* fall through */
- case 0x110 ... 0x117:
+ case 0x10e ... 0x117:
case 0x128 ... 0x12f:
case 0x138 ... 0x13a:
case 0x150 ... 0x179:
@@ -8634,7 +6743,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
case 0x1c2:
case 0x1c4 ... 0x1c6:
case 0x1d0 ... 0x1fe:
- gen_sse(env, s, b);
+ disas_insn_new(s, cpu, b);
break;
default:
goto unknown_op;
@@ -8780,6 +6889,7 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
+ dc->cpuid_7_0_ecx_features = env->features[FEAT_7_0_ECX];
dc->cpuid_xsave_features = env->features[FEAT_XSAVE];
dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) ||
(flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
@@ -8799,8 +6909,6 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
dc->tmp2_i32 = tcg_temp_new_i32();
dc->tmp3_i32 = tcg_temp_new_i32();
dc->tmp4 = tcg_temp_new();
- dc->ptr0 = tcg_temp_new_ptr();
- dc->ptr1 = tcg_temp_new_ptr();
dc->cc_srcT = tcg_temp_local_new();
}
diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
index 3273aa8..81831ca 100644
--- a/tests/tcg/i386/Makefile.target
+++ b/tests/tcg/i386/Makefile.target
@@ -107,7 +107,7 @@ run-test-mmx: QEMU_OPTS += -cpu max
run-plugin-test-mmx: QEMU_OPTS += -cpu max
test-mmx: test-mmx.h
-test-avx: CFLAGS += -masm=intel -O -I.
+test-avx: CFLAGS += -mavx -masm=intel -O -I.
run-test-avx: QEMU_OPTS += -cpu max
run-plugin-test-avx: QEMU_OPTS += -cpu max
test-avx: test-avx.h
diff --git a/tests/tcg/i386/test-avx.c b/tests/tcg/i386/test-avx.c
index 23c170d..953e290 100644
--- a/tests/tcg/i386/test-avx.c
+++ b/tests/tcg/i386/test-avx.c
@@ -6,18 +6,18 @@
typedef void (*testfn)(void);
typedef struct {
- uint64_t q0, q1;
-} __attribute__((aligned(16))) v2di;
+ uint64_t q0, q1, q2, q3;
+} __attribute__((aligned(32))) v4di;
typedef struct {
uint64_t mm[8];
- v2di xmm[16];
+ v4di ymm[16];
uint64_t r[16];
uint64_t flags;
uint32_t ff;
uint64_t pad;
- v2di mem[4];
- v2di mem0[4];
+ v4di mem[4];
+ v4di mem0[4];
} reg_state;
typedef struct {
@@ -31,20 +31,20 @@ reg_state initI;
reg_state initF32;
reg_state initF64;
-static void dump_xmm(const char *name, int n, const v2di *r, int ff)
+static void dump_ymm(const char *name, int n, const v4di *r, int ff)
{
- printf("%s%d = %016lx %016lx\n",
- name, n, r->q1, r->q0);
+ printf("%s%d = %016lx %016lx %016lx %016lx\n",
+ name, n, r->q3, r->q2, r->q1, r->q0);
if (ff == 64) {
- double v[2];
+ double v[4];
memcpy(v, r, sizeof(v));
- printf(" %16g %16g\n",
- v[1], v[0]);
+ printf(" %16g %16g %16g %16g\n",
+ v[3], v[2], v[1], v[0]);
} else if (ff == 32) {
- float v[4];
+ float v[8];
memcpy(v, r, sizeof(v));
- printf(" %8g %8g %8g %8g\n",
- v[3], v[2], v[1], v[0]);
+ printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n",
+ v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
}
}
@@ -53,10 +53,10 @@ static void dump_regs(reg_state *s)
int i;
for (i = 0; i < 16; i++) {
- dump_xmm("xmm", i, &s->xmm[i], 0);
+ dump_ymm("ymm", i, &s->ymm[i], 0);
}
for (i = 0; i < 4; i++) {
- dump_xmm("mem", i, &s->mem0[i], 0);
+ dump_ymm("mem", i, &s->mem0[i], 0);
}
}
@@ -74,13 +74,13 @@ static void compare_state(const reg_state *a, const reg_state *b)
}
}
for (i = 0; i < 16; i++) {
- if (memcmp(&a->xmm[i], &b->xmm[i], 16)) {
- dump_xmm("xmm", i, &b->xmm[i], a->ff);
+ if (memcmp(&a->ymm[i], &b->ymm[i], 32)) {
+ dump_ymm("ymm", i, &b->ymm[i], a->ff);
}
}
for (i = 0; i < 4; i++) {
- if (memcmp(&a->mem0[i], &a->mem[i], 16)) {
- dump_xmm("mem", i, &a->mem[i], a->ff);
+ if (memcmp(&a->mem0[i], &a->mem[i], 32)) {
+ dump_ymm("mem", i, &a->mem[i], a->ff);
}
}
if (a->flags != b->flags) {
@@ -89,9 +89,9 @@ static void compare_state(const reg_state *a, const reg_state *b)
}
#define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t"
-#define LOADXMM(r, o) "movdqa " #r ", " #o "[%0]\n\t"
+#define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t"
#define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t"
-#define STOREXMM(r, o) "movdqa " #o "[%1], " #r "\n\t"
+#define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t"
#define MMREG(F) \
F(mm0, 0x00) \
F(mm1, 0x08) \
@@ -101,39 +101,39 @@ static void compare_state(const reg_state *a, const reg_state *b)
F(mm5, 0x28) \
F(mm6, 0x30) \
F(mm7, 0x38)
-#define XMMREG(F) \
- F(xmm0, 0x040) \
- F(xmm1, 0x050) \
- F(xmm2, 0x060) \
- F(xmm3, 0x070) \
- F(xmm4, 0x080) \
- F(xmm5, 0x090) \
- F(xmm6, 0x0a0) \
- F(xmm7, 0x0b0) \
- F(xmm8, 0x0c0) \
- F(xmm9, 0x0d0) \
- F(xmm10, 0x0e0) \
- F(xmm11, 0x0f0) \
- F(xmm12, 0x100) \
- F(xmm13, 0x110) \
- F(xmm14, 0x120) \
- F(xmm15, 0x130)
+#define YMMREG(F) \
+ F(ymm0, 0x040) \
+ F(ymm1, 0x060) \
+ F(ymm2, 0x080) \
+ F(ymm3, 0x0a0) \
+ F(ymm4, 0x0c0) \
+ F(ymm5, 0x0e0) \
+ F(ymm6, 0x100) \
+ F(ymm7, 0x120) \
+ F(ymm8, 0x140) \
+ F(ymm9, 0x160) \
+ F(ymm10, 0x180) \
+ F(ymm11, 0x1a0) \
+ F(ymm12, 0x1c0) \
+ F(ymm13, 0x1e0) \
+ F(ymm14, 0x200) \
+ F(ymm15, 0x220)
#define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t"
#define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t"
#define REG(F) \
- F(rbx, 0x148) \
- F(rcx, 0x150) \
- F(rdx, 0x158) \
- F(rsi, 0x160) \
- F(rdi, 0x168) \
- F(r8, 0x180) \
- F(r9, 0x188) \
- F(r10, 0x190) \
- F(r11, 0x198) \
- F(r12, 0x1a0) \
- F(r13, 0x1a8) \
- F(r14, 0x1b0) \
- F(r15, 0x1b8) \
+ F(rbx, 0x248) \
+ F(rcx, 0x250) \
+ F(rdx, 0x258) \
+ F(rsi, 0x260) \
+ F(rdi, 0x268) \
+ F(r8, 0x280) \
+ F(r9, 0x288) \
+ F(r10, 0x290) \
+ F(r11, 0x298) \
+ F(r12, 0x2a0) \
+ F(r13, 0x2a8) \
+ F(r14, 0x2b0) \
+ F(r15, 0x2b8) \
static void run_test(const TestDef *t)
{
@@ -143,7 +143,7 @@ static void run_test(const TestDef *t)
printf("%5d %s\n", t->n, t->s);
asm volatile(
MMREG(LOADMM)
- XMMREG(LOADXMM)
+ YMMREG(LOADYMM)
"sub rsp, 128\n\t"
"push rax\n\t"
"push rbx\n\t"
@@ -156,26 +156,26 @@ static void run_test(const TestDef *t)
"pop rbx\n\t"
"shr rbx, 8\n\t"
"shl rbx, 8\n\t"
- "mov rcx, 0x1c0[rax]\n\t"
+ "mov rcx, 0x2c0[rax]\n\t"
"and rcx, 0xff\n\t"
"or rbx, rcx\n\t"
"push rbx\n\t"
"popf\n\t"
REG(LOADREG)
- "mov rax, 0x140[rax]\n\t"
+ "mov rax, 0x240[rax]\n\t"
"call [rsp]\n\t"
"mov [rsp], rax\n\t"
"mov rax, 8[rsp]\n\t"
REG(STOREREG)
"mov rbx, [rsp]\n\t"
- "mov 0x140[rax], rbx\n\t"
+ "mov 0x240[rax], rbx\n\t"
"mov rbx, 0\n\t"
- "mov 0x170[rax], rbx\n\t"
- "mov 0x178[rax], rbx\n\t"
+ "mov 0x270[rax], rbx\n\t"
+ "mov 0x278[rax], rbx\n\t"
"pushf\n\t"
"pop rbx\n\t"
"and rbx, 0xff\n\t"
- "mov 0x1c0[rax], rbx\n\t"
+ "mov 0x2c0[rax], rbx\n\t"
"add rsp, 16\n\t"
"pop rdx\n\t"
"pop rcx\n\t"
@@ -183,15 +183,15 @@ static void run_test(const TestDef *t)
"pop rax\n\t"
"add rsp, 128\n\t"
MMREG(STOREMM)
- XMMREG(STOREXMM)
+ YMMREG(STOREYMM)
: : "r"(init), "r"(&result), "r"(t->fn)
: "memory", "cc",
"rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
- "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
- "xmm12", "xmm13", "xmm14", "xmm15"
+ "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+ "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+ "ymm12", "ymm13", "ymm14", "ymm15"
);
compare_state(init, &result);
}
@@ -223,22 +223,30 @@ static void run_all(void)
float val_f32[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5, 8.3};
double val_f64[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5};
-v2di val_i64[] = {
- {0x3d6b3b6a9e4118f2lu, 0x355ae76d2774d78clu},
- {0xd851c54a56bf1f29lu, 0x4a84d1d50bf4c4fflu},
- {0x5826475e2c5fd799lu, 0xfd32edc01243f5e9lu},
+v4di val_i64[] = {
+ {0x3d6b3b6a9e4118f2lu, 0x355ae76d2774d78clu,
+ 0xac3ff76c4daa4b28lu, 0xe7fabd204cb54083lu},
+ {0xd851c54a56bf1f29lu, 0x4a84d1d50bf4c4fflu,
+ 0x56621e553d52b56clu, 0xd0069553da8f584alu},
+ {0x5826475e2c5fd799lu, 0xfd32edc01243f5e9lu,
+ 0x738ba2c66d3fe126lu, 0x5707219c6e6c26b4lu},
};
-v2di deadbeef = {0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull};
-v2di indexq = {0x000000000000001full, 0x000000000000008full};
-v2di indexd = {0x00000002000000efull, 0xfffffff500000010ull};
+v4di deadbeef = {0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull,
+ 0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull};
+v4di indexq = {0x000000000000001full, 0x000000000000008full,
+ 0xffffffffffffffffull, 0xffffffffffffff5full};
+v4di indexd = {0x00000002000000efull, 0xfffffff500000010ull,
+ 0x0000000afffffff0ull, 0x000000000000000eull};
-void init_f32reg(v2di *r)
+v4di gather_mem[0x20];
+
+void init_f32reg(v4di *r)
{
static int n;
- float v[4];
+ float v[8];
int i;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 8; i++) {
v[i] = val_f32[n++];
if (n == ARRAY_LEN(val_f32)) {
n = 0;
@@ -247,12 +255,12 @@ void init_f32reg(v2di *r)
memcpy(r, v, sizeof(*r));
}
-void init_f64reg(v2di *r)
+void init_f64reg(v4di *r)
{
static int n;
- double v[2];
+ double v[4];
int i;
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 4; i++) {
v[i] = val_f64[n++];
if (n == ARRAY_LEN(val_f64)) {
n = 0;
@@ -261,13 +269,15 @@ void init_f64reg(v2di *r)
memcpy(r, v, sizeof(*r));
}
-void init_intreg(v2di *r)
+void init_intreg(v4di *r)
{
static uint64_t mask;
static int n;
r->q0 = val_i64[n].q0 ^ mask;
r->q1 = val_i64[n].q1 ^ mask;
+ r->q2 = val_i64[n].q2 ^ mask;
+ r->q3 = val_i64[n].q3 ^ mask;
n++;
if (n == ARRAY_LEN(val_i64)) {
n = 0;
@@ -280,46 +290,53 @@ static void init_all(reg_state *s)
int i;
s->r[3] = (uint64_t)&s->mem[0]; /* rdx */
+ s->r[4] = (uint64_t)&gather_mem[ARRAY_LEN(gather_mem) / 2]; /* rsi */
s->r[5] = (uint64_t)&s->mem[2]; /* rdi */
s->flags = 2;
- for (i = 0; i < 8; i++) {
- s->xmm[i] = deadbeef;
+ for (i = 0; i < 16; i++) {
+ s->ymm[i] = deadbeef;
}
- s->xmm[13] = indexd;
- s->xmm[14] = indexq;
- for (i = 0; i < 2; i++) {
+ s->ymm[13] = indexd;
+ s->ymm[14] = indexq;
+ for (i = 0; i < 4; i++) {
s->mem0[i] = deadbeef;
}
}
int main(int argc, char *argv[])
{
+ int i;
+
init_all(&initI);
- init_intreg(&initI.xmm[10]);
- init_intreg(&initI.xmm[11]);
- init_intreg(&initI.xmm[12]);
+ init_intreg(&initI.ymm[10]);
+ init_intreg(&initI.ymm[11]);
+ init_intreg(&initI.ymm[12]);
init_intreg(&initI.mem0[1]);
printf("Int:\n");
dump_regs(&initI);
init_all(&initF32);
- init_f32reg(&initF32.xmm[10]);
- init_f32reg(&initF32.xmm[11]);
- init_f32reg(&initF32.xmm[12]);
+ init_f32reg(&initF32.ymm[10]);
+ init_f32reg(&initF32.ymm[11]);
+ init_f32reg(&initF32.ymm[12]);
init_f32reg(&initF32.mem0[1]);
initF32.ff = 32;
printf("F32:\n");
dump_regs(&initF32);
init_all(&initF64);
- init_f64reg(&initF64.xmm[10]);
- init_f64reg(&initF64.xmm[11]);
- init_f64reg(&initF64.xmm[12]);
+ init_f64reg(&initF64.ymm[10]);
+ init_f64reg(&initF64.ymm[11]);
+ init_f64reg(&initF64.ymm[12]);
init_f64reg(&initF64.mem0[1]);
initF64.ff = 64;
printf("F64:\n");
dump_regs(&initF64);
+ for (i = 0; i < ARRAY_LEN(gather_mem); i++) {
+ init_intreg(&gather_mem[i]);
+ }
+
if (argc > 1) {
int n = atoi(argv[1]);
run_test(&test_table[n]);
diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py
index e16a3d8..0298232 100755
--- a/tests/tcg/i386/test-avx.py
+++ b/tests/tcg/i386/test-avx.py
@@ -8,6 +8,7 @@ from fnmatch import fnmatch
archs = [
"SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2",
+ "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX",
]
ignore = set(["FISTTP",
@@ -42,7 +43,7 @@ imask = {
'vROUND[PS][SD]': 0x7,
'vSHUFPD': 0x0f,
'vSHUFPS': 0xff,
- 'vAESKEYGENASSIST': 0,
+ 'vAESKEYGENASSIST': 0xff,
'VEXTRACT[FI]128': 0x01,
'VINSERT[FI]128': 0x01,
'VPBLENDD': 0xff,
@@ -85,7 +86,7 @@ def mem_w(w):
else:
raise Exception()
- return t + " PTR 16[rdx]"
+ return t + " PTR 32[rdx]"
class XMMArg():
isxmm = True