Diffstat:
 -rw-r--r--  target/i386/ops_sse.h             | 188
 -rw-r--r--  target/i386/ops_sse_header.h      |  19
 -rw-r--r--  target/i386/tcg/decode-new.c.inc  | 112
 -rw-r--r--  target/i386/tcg/decode-new.h      |   6
 -rw-r--r--  target/i386/tcg/emit.c.inc        | 205
 -rw-r--r--  target/i386/tcg/translate.c       |   2
 6 files changed, 524 insertions, 8 deletions
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index cb8909a..e3cc694 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2382,6 +2382,36 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#endif
#if SHIFT >= 1
+void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ uint64_t r0, r1;
+ int i;
+
+ for (i = 0; i < 1 << SHIFT; i += 2) {
+ r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
+ r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
+ d->Q(i) = r0;
+ d->Q(i+1) = r1;
+ }
+}
+
+void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ uint32_t r0, r1, r2, r3;
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ r0 = v->L(i + (s->L(i) & 3));
+ r1 = v->L(i + (s->L(i+1) & 3));
+ r2 = v->L(i + (s->L(i+2) & 3));
+ r3 = v->L(i + (s->L(i+3) & 3));
+ d->L(i) = r0;
+ d->L(i+1) = r1;
+ d->L(i+2) = r2;
+ d->L(i+3) = r3;
+ }
+}
+
void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
{
uint64_t r0, r1;
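Note: the variable-control forms added above read the shuffle selector from a third vector rather than from an immediate. Each destination element takes the source element picked by the low bits of the corresponding control element (bits 1:0 for dwords, bit 1 for qwords), always within its own 128-bit lane. A minimal standalone sketch of one VPERMILPS lane, illustrative only and not the helper itself:

    #include <stdint.h>
    #include <stdio.h>

    /* Permute one 128-bit lane the way helper_vpermilps does: dword i of
     * the result is source dword (ctl[i] & 3).  All sources are read into
     * temporaries first so dst may alias src, mirroring r0..r3 above. */
    static void permilps_lane(uint32_t dst[4], const uint32_t src[4],
                              const uint32_t ctl[4])
    {
        uint32_t tmp[4];
        for (int i = 0; i < 4; i++) {
            tmp[i] = src[ctl[i] & 3];
        }
        for (int i = 0; i < 4; i++) {
            dst[i] = tmp[i];
        }
    }

    int main(void)
    {
        uint32_t src[4] = {10, 11, 12, 13};
        uint32_t ctl[4] = {3, 2, 1, 0};
        permilps_lane(src, src, ctl);           /* aliasing is safe */
        printf("%u %u %u %u\n", src[0], src[1], src[2], src[3]); /* 13 12 11 10 */
        return 0;
    }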
@@ -2414,6 +2444,150 @@ void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
}
}
+#if SHIFT == 1
+#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
+#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
+#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31))
+#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
+#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
+#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
+#endif
+
+SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
+SSE_HELPER_L(helper_vpsravd, FPSRAVD)
+SSE_HELPER_L(helper_vpsllvd, FPSLLVD)
+
+SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
+SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
+SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
+
+void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ uint32_t zf = 0, cf = 0;
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i++) {
+ zf |= (s->L(i) & d->L(i));
+ cf |= (s->L(i) & ~d->L(i));
+ }
+ CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
+}
+
+void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ uint64_t zf = 0, cf = 0;
+ int i;
+
+ for (i = 0; i < 1 << SHIFT; i++) {
+ zf |= (s->Q(i) & d->Q(i));
+ cf |= (s->Q(i) & ~d->Q(i));
+ }
+ CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
+}
+
+void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
+ }
+ }
+}
+
+void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
+ }
+ }
+}
+
+void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
+ }
+}
+
+void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ int i;
+
+ for (i = 0; i < (1 << SHIFT); i++) {
+ d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
+ }
+}
+
+void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ target_ulong addr = a0
+ + ((target_ulong)(int32_t)s->L(i) << scale);
+ d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+ }
+ v->L(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ target_ulong addr = a0
+ + ((target_ulong)(int32_t)s->L(i) << scale);
+ d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
+ }
+ v->Q(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ target_ulong addr = a0
+ + ((target_ulong)(int64_t)s->Q(i) << scale);
+ d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+ }
+ v->L(i) = 0;
+ }
+ for (i /= 2; i < 1 << SHIFT; i++) {
+ d->Q(i) = 0;
+ v->Q(i) = 0;
+ }
+}
+
+void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ target_ulong addr = a0
+ + ((target_ulong)(int64_t)s->Q(i) << scale);
+ d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
+ }
+ v->Q(i) = 0;
+ }
+}
+#endif
+
#if SHIFT >= 2
void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
{
@@ -2473,7 +2647,19 @@ void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
d->Q(2) = r2;
d->Q(3) = r3;
}
-#endif
+
+void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
+{
+ uint32_t r[8];
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ r[i] = s->L(v->L(i) & 7);
+ }
+ for (i = 0; i < 8; i++) {
+ d->L(i) = r[i];
+ }
+}
#endif
#undef SSE_HELPER_S
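Note: the FPSRLV*/FPSRAV*/FPSLLV* macros above encode the AVX2 variable-shift rule: the count comes per element from the second source, logical shifts produce 0 once the count reaches the element width, and arithmetic shifts clamp the count to width - 1 so the sign bit is replicated. A standalone scalar model for 32-bit elements (illustrative; like QEMU it assumes >> on a negative int32_t is an arithmetic shift):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t srlv32(uint32_t x, uint32_t c) { return c < 32 ? x >> c : 0; }
    static uint32_t sllv32(uint32_t x, uint32_t c) { return c < 32 ? x << c : 0; }
    static int32_t  srav32(int32_t  x, uint32_t c) { return x >> (c < 32 ? c : 31); }

    int main(void)
    {
        printf("%u\n", srlv32(0x80000000u, 40)); /* 0: count >= width */
        printf("%d\n", srav32(INT32_MIN, 40));   /* -1: count clamped to 31 */
        printf("%u\n", sllv32(1, 31));           /* 2147483648 */
        return 0;
    }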
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 1afc4ff..dd8dceb 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -413,9 +413,28 @@ DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32)
/* AVX helpers */
#if SHIFT >= 1
+DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg)
DEF_HELPER_3(glue(vpermilpd_imm, SUFFIX), void, Reg, Reg, i32)
DEF_HELPER_3(glue(vpermilps_imm, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_6(glue(vpgatherdd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherdq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherqd, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
+DEF_HELPER_6(glue(vpgatherqq, SUFFIX), void, env, Reg, Reg, Reg, tl, i32)
#if SHIFT == 2
+DEF_HELPER_3(vpermd_ymm, void, Reg, Reg, Reg)
DEF_HELPER_4(vpermdq_ymm, void, Reg, Reg, Reg, i32)
DEF_HELPER_3(vpermq_ymm, void, Reg, Reg, i32)
#endif
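Note: the DEF_HELPER_n macros above generate both the C prototype and the TCG call-site glue for each helper. Under this header's type mappings (Reg passed by pointer, tl as target_ulong, i32 as uint32_t), the vpgatherdd entry should expand to roughly the following prototype for SHIFT == 1 (SUFFIX == _xmm); the exact text is produced by QEMU's helper-proto machinery, so treat this as an approximation:

    void helper_vpgatherdd_xmm(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                               target_ulong a0, uint32_t scale);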
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 3330085..e148edd 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -92,6 +92,7 @@
#define mmx .special = X86_SPECIAL_MMX,
#define zext0 .special = X86_SPECIAL_ZExtOp0,
#define zext2 .special = X86_SPECIAL_ZExtOp2,
+#define avx_movx .special = X86_SPECIAL_AVXExtMov,
#define vex1 .vex_class = 1,
#define vex1_rep3 .vex_class = 1, .vex_special = X86_VEX_REPScalar,
@@ -302,6 +303,105 @@ static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
}
static const X86OpEntry opcodes_0F38_00toEF[240] = {
+ [0x00] = X86_OP_ENTRY3(PSHUFB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x01] = X86_OP_ENTRY3(PHADDW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x02] = X86_OP_ENTRY3(PHADDD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x03] = X86_OP_ENTRY3(PHADDSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x04] = X86_OP_ENTRY3(PMADDUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x05] = X86_OP_ENTRY3(PHSUBW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x06] = X86_OP_ENTRY3(PHSUBD, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x07] = X86_OP_ENTRY3(PHSUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+
+ [0x10] = X86_OP_ENTRY2(PBLENDVB, V,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x14] = X86_OP_ENTRY2(BLENDVPS, V,x, W,x, vex4 cpuid(SSE41) p_66),
+ [0x15] = X86_OP_ENTRY2(BLENDVPD, V,x, W,x, vex4 cpuid(SSE41) p_66),
+ /* Listed incorrectly as type 4 */
+    [0x16] = X86_OP_ENTRY3(VPERMD,     V,qq, H,qq,      W,qq,       vex6 cpuid(AVX2) p_66), /* vpermps */
+ [0x17] = X86_OP_ENTRY3(VPTEST, None,None, V,x, W,x, vex4 cpuid(SSE41) p_66),
+
+ /*
+ * Source operand listed as Mq/Ux and similar in the manual; incorrectly listed
+ * as 128-bit only in 2-17.
+ */
+ [0x20] = X86_OP_ENTRY3(VPMOVSXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x21] = X86_OP_ENTRY3(VPMOVSXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x22] = X86_OP_ENTRY3(VPMOVSXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x23] = X86_OP_ENTRY3(VPMOVSXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x24] = X86_OP_ENTRY3(VPMOVSXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x25] = X86_OP_ENTRY3(VPMOVSXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+
+ /* Same as PMOVSX. */
+ [0x30] = X86_OP_ENTRY3(VPMOVZXBW, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x31] = X86_OP_ENTRY3(VPMOVZXBD, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x32] = X86_OP_ENTRY3(VPMOVZXBQ, V,x, None,None, W,w, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x33] = X86_OP_ENTRY3(VPMOVZXWD, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x34] = X86_OP_ENTRY3(VPMOVZXWQ, V,x, None,None, W,d, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x35] = X86_OP_ENTRY3(VPMOVZXDQ, V,x, None,None, W,q, vex5 cpuid(SSE41) avx_movx avx2_256 p_66),
+ [0x36] = X86_OP_ENTRY3(VPERMD, V,qq, H,qq, W,qq, vex6 cpuid(AVX2) p_66),
+ [0x37] = X86_OP_ENTRY3(PCMPGTQ, V,x, H,x, W,x, vex4 cpuid(SSE42) avx2_256 p_66),
+
+ [0x40] = X86_OP_ENTRY3(PMULLD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x41] = X86_OP_ENTRY3(VPHMINPOSUW, V,dq, None,None, W,dq, vex4 cpuid(SSE41) p_66),
+ /* Listed incorrectly as type 4 */
+ [0x45] = X86_OP_ENTRY3(VPSRLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+ [0x46] = X86_OP_ENTRY3(VPSRAV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+ [0x47] = X86_OP_ENTRY3(VPSLLV, V,x, H,x, W,x, vex6 cpuid(AVX2) p_66),
+
+ [0x90] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vpgatherdd/q */
+ [0x91] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vpgatherqd/q */
+ [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */
+ [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */
+
+ [0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0b] = X86_OP_ENTRY3(PMULHRSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x0c] = X86_OP_ENTRY3(VPERMILPS, V,x, H,x, W,x, vex4 cpuid(AVX) p_00_66),
+ [0x0d] = X86_OP_ENTRY3(VPERMILPD, V,x, H,x, W,x, vex4 cpuid(AVX) p_66),
+ [0x0e] = X86_OP_ENTRY3(VTESTPS, None,None, V,x, W,x, vex4 cpuid(AVX) p_66),
+ [0x0f] = X86_OP_ENTRY3(VTESTPD, None,None, V,x, W,x, vex4 cpuid(AVX) p_66),
+
+ [0x18] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX) p_66), /* vbroadcastss */
+ [0x19] = X86_OP_ENTRY3(VPBROADCASTQ, V,qq, None,None, W,q, vex6 cpuid(AVX) p_66), /* vbroadcastsd */
+ [0x1a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX) p_66),
+ [0x1c] = X86_OP_ENTRY3(PABSB, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x1d] = X86_OP_ENTRY3(PABSW, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+ [0x1e] = X86_OP_ENTRY3(PABSD, V,x, None,None, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
+
+ [0x28] = X86_OP_ENTRY3(PMULDQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x29] = X86_OP_ENTRY3(PCMPEQQ, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x2a] = X86_OP_ENTRY3(MOVDQ, V,x, None,None, WM,x, vex1 cpuid(SSE41) avx2_256 p_66), /* movntdqa */
+ [0x2b] = X86_OP_ENTRY3(VPACKUSDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x2c] = X86_OP_ENTRY3(VMASKMOVPS, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66),
+ [0x2d] = X86_OP_ENTRY3(VMASKMOVPD, V,x, H,x, WM,x, vex6 cpuid(AVX) p_66),
+ /* Incorrectly listed as Mx,Hx,Vx in the manual */
+ [0x2e] = X86_OP_ENTRY3(VMASKMOVPS_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66),
+ [0x2f] = X86_OP_ENTRY3(VMASKMOVPD_st, M,x, V,x, H,x, vex6 cpuid(AVX) p_66),
+
+ [0x38] = X86_OP_ENTRY3(PMINSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x39] = X86_OP_ENTRY3(PMINSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3a] = X86_OP_ENTRY3(PMINUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3b] = X86_OP_ENTRY3(PMINUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3c] = X86_OP_ENTRY3(PMAXSB, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3d] = X86_OP_ENTRY3(PMAXSD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3e] = X86_OP_ENTRY3(PMAXUW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x3f] = X86_OP_ENTRY3(PMAXUD, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+
+ [0x58] = X86_OP_ENTRY3(VPBROADCASTD, V,x, None,None, W,d, vex6 cpuid(AVX2) p_66),
+ [0x59] = X86_OP_ENTRY3(VPBROADCASTQ, V,x, None,None, W,q, vex6 cpuid(AVX2) p_66),
+ [0x5a] = X86_OP_ENTRY3(VBROADCASTx128, V,qq, None,None, WM,dq,vex6 cpuid(AVX2) p_66),
+
+ [0x78] = X86_OP_ENTRY3(VPBROADCASTB, V,x, None,None, W,b, vex6 cpuid(AVX2) p_66),
+ [0x79] = X86_OP_ENTRY3(VPBROADCASTW, V,x, None,None, W,w, vex6 cpuid(AVX2) p_66),
+
+ [0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66),
+ [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66),
+
+ [0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66),
+ [0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xde] = X86_OP_ENTRY3(VAESDEC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+ [0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
};
/* five rows for no prefix, 66, F3, F2, 66+F2 */
@@ -431,8 +531,8 @@ static const X86OpEntry opcodes_0F3A[256] = {
[0x0b] = X86_OP_ENTRY4(VROUNDSD, V,x, H,x, W,sd, vex3 cpuid(SSE41) p_66),
[0x0c] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
[0x0d] = X86_OP_ENTRY4(VBLENDPD, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
- [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
- [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx p_00_66),
+ [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
[0x18] = X86_OP_ENTRY4(VINSERTx128, V,qq, H,qq, W,qq, vex6 cpuid(AVX) p_66),
[0x19] = X86_OP_ENTRY3(VEXTRACTx128, W,dq, V,qq, I,b, vex6 cpuid(AVX) p_66),
@@ -1428,6 +1528,14 @@ static void disas_insn_new(DisasContext *s, CPUState *cpu, int b)
}
break;
+ case X86_SPECIAL_AVXExtMov:
+ if (!decode.op[2].has_ea) {
+ decode.op[2].ot = s->vex_l ? MO_256 : MO_128;
+ } else if (s->vex_l) {
+ decode.op[2].ot++;
+ }
+ break;
+
case X86_SPECIAL_MMX:
if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA))) {
gen_helper_enter_mmx(cpu_env);
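Note: the new X86_SPECIAL_AVXExtMov case handles a quirk of the PMOVSX/PMOVZX family: a register source is decoded at full vector width, while a memory source keeps the narrow size from the opcode table and merely doubles when VEX.L=1. A hypothetical restatement of that sizing rule, with a locally defined stand-in for QEMU's MemOp (each enum step doubles the size):

    #include <stdbool.h>

    enum mem_op { MO_8, MO_16, MO_32, MO_64, MO_128, MO_256 };

    /* Mirror of the X86_SPECIAL_AVXExtMov sizing logic above. */
    static enum mem_op pmovx_src_size(enum mem_op narrow, bool is_memory,
                                      bool vex_l)
    {
        if (!is_memory) {
            return vex_l ? MO_256 : MO_128;  /* whole xmm/ymm register */
        }
        return vex_l ? narrow + 1 : narrow;  /* e.g. MO_64 -> MO_128 for vpmovsxbw */
    }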
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index ef318a0..2f0b0e4 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -143,6 +143,12 @@ typedef enum X86InsnSpecial {
X86_SPECIAL_ZExtOp2,
/*
+ * Register operand 2 is extended to full width, while a memory operand
+ * is doubled in size if VEX.L=1.
+ */
+ X86_SPECIAL_AVXExtMov,
+
+ /*
* MMX instruction exists with no prefix; if there is no prefix, V/H/W/U operands
* become P/P/Q/N, and size "x" becomes "q".
*/
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index ad93094..e33688f 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -19,6 +19,9 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
+typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+ TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale);
+
static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
{
return tcg_constant_i32(val);
@@ -484,15 +487,20 @@ static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86Decoded
fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3);
tcg_temp_free_ptr(ptr3);
}
-#define TERNARY_SSE(uvname, lname) \
+#define TERNARY_SSE(uname, uvname, lname) \
static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4, \
gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
+} \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_ternary_sse(s, env, decode, 0, \
+ gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
}
-TERNARY_SSE(VBLENDVPS, blendvps)
-TERNARY_SSE(VBLENDVPD, blendvpd)
-TERNARY_SSE(VPBLENDVB, pblendvb)
+TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
+TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
+TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
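Note: the split above lets one generator serve both encodings. The legacy SSE4.1 forms (BLENDVPS/BLENDVPD/PBLENDVB) take their mask from the implicit register XMM0, while the VEX forms encode the mask register in bits 7:4 of the immediate byte, which is exactly the `(uint8_t)decode->immediate >> 4` versus `0` distinction above. In miniature (illustrative, not QEMU code):

    #include <stdint.h>
    #include <stdbool.h>

    /* Which register number supplies the blend mask. */
    static int blend_mask_reg(bool is_vex, uint8_t imm8)
    {
        return is_vex ? imm8 >> 4 : 0;   /* VEX: imm8[7:4]; SSE: XMM0 */
    }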
@@ -522,6 +530,25 @@ BINARY_IMM_SSE(VDDPD, dppd)
BINARY_IMM_SSE(VMPSADBW, mpsadbw)
BINARY_IMM_SSE(PCLMULQDQ, pclmulqdq)
+
+#define UNARY_INT_GVEC(uname, func, ...) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ int vec_len = vector_len(s, decode); \
+ \
+ func(__VA_ARGS__, decode->op[0].offset, \
+ decode->op[2].offset, vec_len, vec_len); \
+}
+UNARY_INT_GVEC(PABSB, tcg_gen_gvec_abs, MO_8)
+UNARY_INT_GVEC(PABSW, tcg_gen_gvec_abs, MO_16)
+UNARY_INT_GVEC(PABSD, tcg_gen_gvec_abs, MO_32)
+UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
+UNARY_INT_GVEC(VPBROADCASTB, tcg_gen_gvec_dup_mem, MO_8)
+UNARY_INT_GVEC(VPBROADCASTW, tcg_gen_gvec_dup_mem, MO_16)
+UNARY_INT_GVEC(VPBROADCASTD, tcg_gen_gvec_dup_mem, MO_32)
+UNARY_INT_GVEC(VPBROADCASTQ, tcg_gen_gvec_dup_mem, MO_64)
+
#define BINARY_INT_GVEC(uname, func, ...) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
@@ -544,14 +571,25 @@ BINARY_INT_GVEC(PAND, tcg_gen_gvec_and, MO_64)
BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
+BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
+BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
+BINARY_INT_GVEC(PMAXSB, tcg_gen_gvec_smax, MO_8)
BINARY_INT_GVEC(PMAXSW, tcg_gen_gvec_smax, MO_16)
+BINARY_INT_GVEC(PMAXSD, tcg_gen_gvec_smax, MO_32)
BINARY_INT_GVEC(PMAXUB, tcg_gen_gvec_umax, MO_8)
+BINARY_INT_GVEC(PMAXUW, tcg_gen_gvec_umax, MO_16)
+BINARY_INT_GVEC(PMAXUD, tcg_gen_gvec_umax, MO_32)
+BINARY_INT_GVEC(PMINSB, tcg_gen_gvec_smin, MO_8)
BINARY_INT_GVEC(PMINSW, tcg_gen_gvec_smin, MO_16)
+BINARY_INT_GVEC(PMINSD, tcg_gen_gvec_smin, MO_32)
BINARY_INT_GVEC(PMINUB, tcg_gen_gvec_umin, MO_8)
+BINARY_INT_GVEC(PMINUW, tcg_gen_gvec_umin, MO_16)
+BINARY_INT_GVEC(PMINUD, tcg_gen_gvec_umin, MO_32)
BINARY_INT_GVEC(PMULLW, tcg_gen_gvec_mul, MO_16)
+BINARY_INT_GVEC(PMULLD, tcg_gen_gvec_mul, MO_32)
BINARY_INT_GVEC(POR, tcg_gen_gvec_or, MO_64)
BINARY_INT_GVEC(PSUBB, tcg_gen_gvec_sub, MO_8)
BINARY_INT_GVEC(PSUBW, tcg_gen_gvec_sub, MO_16)
@@ -626,6 +664,19 @@ BINARY_INT_MMX(PSRLQ_r, psrlq)
BINARY_INT_MMX(PSRAW_r, psraw)
BINARY_INT_MMX(PSRAD_r, psrad)
+BINARY_INT_MMX(PHADDW, phaddw)
+BINARY_INT_MMX(PHADDSW, phaddsw)
+BINARY_INT_MMX(PHADDD, phaddd)
+BINARY_INT_MMX(PHSUBW, phsubw)
+BINARY_INT_MMX(PHSUBSW, phsubsw)
+BINARY_INT_MMX(PHSUBD, phsubd)
+BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
+BINARY_INT_MMX(PSHUFB, pshufb)
+BINARY_INT_MMX(PSIGNB, psignb)
+BINARY_INT_MMX(PSIGNW, psignw)
+BINARY_INT_MMX(PSIGND, psignd)
+BINARY_INT_MMX(PMULHRSW, pmulhrsw)
+
/* Instructions with no MMX equivalent. */
#define BINARY_INT_SSE(uname, lname) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
@@ -636,8 +687,35 @@ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
gen_helper_##lname##_ymm); \
}
BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
+BINARY_INT_SSE(VPACKUSDW, packusdw)
+BINARY_INT_SSE(VPERMILPS, vpermilps)
+BINARY_INT_SSE(VPERMILPD, vpermilpd)
+BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
+BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)
+
+BINARY_INT_SSE(PMULDQ, pmuldq)
+
+BINARY_INT_SSE(VAESDEC, aesdec)
+BINARY_INT_SSE(VAESDECLAST, aesdeclast)
+BINARY_INT_SSE(VAESENC, aesenc)
+BINARY_INT_SSE(VAESENCLAST, aesenclast)
+
+#define UNARY_CMP_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ if (!s->vex_l) { \
+ gen_helper_##lname##_xmm(cpu_env, OP_PTR1, OP_PTR2); \
+ } else { \
+ gen_helper_##lname##_ymm(cpu_env, OP_PTR1, OP_PTR2); \
+ } \
+ set_cc_op(s, CC_OP_EFLAGS); \
+}
+UNARY_CMP_SSE(VPTEST, ptest)
+UNARY_CMP_SSE(VTESTPS, vtestps)
+UNARY_CMP_SSE(VTESTPD, vtestpd)
static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
@@ -657,6 +735,20 @@ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
gen_helper_##lname##_ymm); \
}
+UNARY_INT_SSE(VPMOVSXBW, pmovsxbw)
+UNARY_INT_SSE(VPMOVSXBD, pmovsxbd)
+UNARY_INT_SSE(VPMOVSXBQ, pmovsxbq)
+UNARY_INT_SSE(VPMOVSXWD, pmovsxwd)
+UNARY_INT_SSE(VPMOVSXWQ, pmovsxwq)
+UNARY_INT_SSE(VPMOVSXDQ, pmovsxdq)
+
+UNARY_INT_SSE(VPMOVZXBW, pmovzxbw)
+UNARY_INT_SSE(VPMOVZXBD, pmovzxbd)
+UNARY_INT_SSE(VPMOVZXBQ, pmovzxbq)
+UNARY_INT_SSE(VPMOVZXWD, pmovzxwd)
+UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq)
+UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq)
+
UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
@@ -714,6 +806,64 @@ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
UNARY_IMM_FP_SSE(VROUNDPS, roundps)
UNARY_IMM_FP_SSE(VROUNDPD, roundpd)
+static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
+ SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
+{
+ SSEFunc_0_eppp d = s->vex_l ? d_ymm : d_xmm;
+ SSEFunc_0_eppp q = s->vex_l ? q_ymm : q_xmm;
+ SSEFunc_0_eppp fn = s->vex_w ? q : d;
+ fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
+/* VEX.W affects whether to operate on 32- or 64-bit elements. */
+#define VEXW_AVX(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_vexw_avx(s, env, decode, \
+ gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
+ gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
+}
+VEXW_AVX(VPSLLV, vpsllv)
+VEXW_AVX(VPSRLV, vpsrlv)
+VEXW_AVX(VPSRAV, vpsrav)
+VEXW_AVX(VPMASKMOV, vpmaskmov)
+
+/* Same as above, but with extra arguments to the helper. */
+static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
+ SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
+{
+ SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
+ SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
+ SSEFunc_0_epppti fn = s->vex_w ? q : d;
+ TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
+ TCGv_ptr index = tcg_temp_new_ptr();
+
+ /* Pass third input as (index, base, scale) */
+ tcg_gen_addi_ptr(index, cpu_env, ZMM_OFFSET(decode->mem.index));
+ fn(cpu_env, OP_PTR0, OP_PTR1, index, s->A0, scale);
+
+ /*
+ * There are two output operands, so zero OP1's high 128 bits
+ * in the VEX.128 case.
+ */
+ if (!s->vex_l) {
+ int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1);
+ tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0);
+ }
+ tcg_temp_free_ptr(index);
+}
+#define VSIB_AVX(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ gen_vsib_avx(s, env, decode, \
+ gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
+ gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
+}
+VSIB_AVX(VPGATHERD, vpgatherd)
+VSIB_AVX(VPGATHERQ, vpgatherq)
+
static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
{
TCGv carry_in = NULL;
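Note: each gather helper receives the base address in a0, the index vector by pointer, and the scale exponent, and computes element addresses exactly as VSIB prescribes: base plus the sign-extended index shifted left by scale. A small model of the per-element computation, assuming 64-bit guest addresses and dword indices:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t vsib_addr(uint64_t base, int32_t index, unsigned scale)
    {
        return base + ((uint64_t)(int64_t)index << scale);
    }

    int main(void)
    {
        /* index -2 with scale 2 (x4) reaches 8 bytes below the base */
        printf("%#llx\n", (unsigned long long)vsib_addr(0x1000, -2, 2)); /* 0xff8 */
        return 0;
    }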
@@ -1484,6 +1634,12 @@ static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *de
gen_helper_aeskeygenassist_xmm(cpu_env, OP_PTR0, OP_PTR1, imm);
}
+static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(!s->vex_l);
+ gen_helper_aesimc_xmm(cpu_env, OP_PTR0, OP_PTR2);
+}
+
static void gen_VCVTfp2fp(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
gen_unary_fp_sse(s, env, decode,
@@ -1567,6 +1723,41 @@ static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *d
decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
}
+static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
+{
+ if (!s->vex_l) {
+ xmm(cpu_env, OP_PTR2, OP_PTR1, s->A0);
+ } else {
+ ymm(cpu_env, OP_PTR2, OP_PTR1, s->A0);
+ }
+}
+
+static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_maskmov(s, env, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
+}
+
+static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
+}
+
+static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ if (s->vex_w) {
+ gen_VMASKMOVPD_st(s, env, decode);
+ } else {
+ gen_VMASKMOVPS_st(s, env, decode);
+ }
+}
+
+static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(s->vex_l);
+ gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
+}
+
static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -1574,6 +1765,12 @@ static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *de
gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
}
+static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ assert(!s->vex_l);
+ gen_helper_phminposuw_xmm(cpu_env, OP_PTR0, OP_PTR2);
+}
+
static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
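Note: VPMASKMOV stores are emitted as helper calls rather than plain vector stores because each element must be written (and may fault) only when its mask bit is set; elements with a clear mask bit leave memory untouched. An illustrative scalar model of the dword-store semantics, operating on host pointers instead of guest addresses:

    #include <stdint.h>

    /* Store element i only if the sign bit of mask[i] is set,
     * mirroring helper_vpmaskmovd_st above. */
    static void maskmov_store32(uint32_t *mem, const uint32_t *mask,
                                const uint32_t *src, int nelem)
    {
        for (int i = 0; i < nelem; i++) {
            if (mask[i] >> 31) {
                mem[i] = src[i];
            }
        }
    }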
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 8570088..0e269e9 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4782,7 +4782,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
use_new &= b <= limit;
#endif
if (use_new &&
- (b == 0x13a ||
+ (b == 0x138 || b == 0x13a ||
(b >= 0x150 && b <= 0x17f) ||
(b >= 0x1d0 && b <= 0x1ff))) {
disas_insn_new(s, cpu, b + 0x100);