commit    3d2bb5cc81ca52dcff854172625a3bb33987495c
tree      464c866b0d1337bb232669cbdb7591880926ff20
parent    61e8a923646903d76a6d952019716b417d42eedc
parent    6399ab3325b7d4f77441c8a00fa9dae98bb0ac43
author    Peter Maydell <peter.maydell@linaro.org>  2014-02-20 15:02:06 +0000
committer Peter Maydell <peter.maydell@linaro.org>  2014-02-20 15:02:07 +0000
Merge remote-tracking branch 'remotes/rth/tcg-next' into staging
* remotes/rth/tcg-next:
tcg/i386: Use SHLX/SHRX/SARX instructions
tcg/i386: Use ANDN instruction
tcg/i386: Add tcg_out_vex_modrm
tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
tcg/optimize: Add more identity simplifications
tcg/optimize: Optimize ANDC X,Y,Y to MOV X,0
tcg/optimize: Simplify some logical ops to NOT
tcg/optimize: Handle known-zeros masks for ANDC
tcg/optimize: add known-zero bits compute for load ops
tcg/optimize: improve known-zero bits for 32-bit ops
tcg/optimize: fix known-zero bits optimization
tcg/optimize: fix known-zero bits for right shift ops
tcg-arm: The shift count of op_rotl_i32 is in args[2] not args[1].
TCG: Fix 32-bit host allocation typo
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
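
The i386 backend and disassembler changes below all revolve around the VEX-encoded BMI/BMI2 instructions (ANDN, SHLX/SHRX/SARX). As a rough, standalone sketch of the three-byte VEX prefix layout these patches emit and decode (illustrative only; the helper name and calling convention are invented for this example, the real emitter is tcg_out_vex_modrm in tcg/i386/tcg-target.c):

```c
#include <stdint.h>

/* Illustrative only: pack a 3-byte VEX prefix (escape byte 0xC4).
 * r/x/b are 4-bit register numbers whose high bits feed the inverted
 * VEX.R/X/B fields; vvvv is the extra source register, also stored
 * inverted; mmmmm selects the opcode map; pp the implied prefix. */
static int emit_vex3(uint8_t *p, int rexw, int r, int x, int b,
                     int mmmmm, int vvvv, int pp)
{
    p[0] = 0xc4;                          /* 3-byte VEX escape */
    p[1] = ((r & 8) ? 0 : 0x80)           /* VEX.R (inverted)  */
         | ((x & 8) ? 0 : 0x40)           /* VEX.X (inverted)  */
         | ((b & 8) ? 0 : 0x20)           /* VEX.B (inverted)  */
         | (mmmmm & 0x1f);                /* 1=0F, 2=0F38, 3=0F3A */
    p[2] = (rexw ? 0x80 : 0)              /* VEX.W              */
         | ((~vvvv & 15) << 3)            /* VEX.vvvv (inverted) */
         | (pp & 3);                      /* 0=none, 1=66, 2=F3, 3=F2; VEX.L=0 */
    return 3;
}
```

With this layout, SHLX is encoded via map 0F38 (mmmmm=2), implied 66 prefix (pp=1), opcode 0xF7, with the shift count supplied through VEX.vvvv, which is exactly what the OPC_SHLX definition and the new 'C' register constraint below exploit to avoid pinning the count to ECX.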
-rw-r--r--  disas/i386.c          | 146
-rw-r--r--  tcg/arm/tcg-target.c  |   2
-rw-r--r--  tcg/i386/tcg-target.c | 156
-rw-r--r--  tcg/i386/tcg-target.h |   9
-rw-r--r--  tcg/optimize.c        | 165
-rw-r--r--  tcg/tcg.c             |   2

6 files changed, 414 insertions, 66 deletions
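
Several of the tcg/optimize changes in this series extend the per-temporary "known-zero bits" mask. A minimal standalone illustration of the propagation rules added for andc and for constant right shifts follows (the helper names are invented for the example; the real logic lives in tcg_constant_folding in tcg/optimize.c):

```c
#include <stdbool.h>
#include <stdint.h>

/* 'mask' tracks which bits of a temporary may still be non-zero;
 * ~mask are the bits known to be zero. */

/* andc dst, a, c with constant c: dst = a & ~c, so every bit set in c
 * is guaranteed zero in the result. */
static uint64_t mask_after_andc_const(uint64_t a_mask, uint64_t c)
{
    return a_mask & ~c;
}

/* Logical right shift by a constant shifts known zeros in from the top;
 * 32-bit ops operate on the truncated value. */
static uint64_t mask_after_shr_const(uint64_t a_mask, unsigned sh, bool op32)
{
    return op32 ? (uint32_t)a_mask >> sh : a_mask >> sh;
}

/* Arithmetic right shift replicates the (possibly set) sign bit, so the
 * mask must be shifted with sign extension as well. */
static uint64_t mask_after_sar_const(uint64_t a_mask, unsigned sh, bool op32)
{
    return op32 ? (uint64_t)(uint32_t)((int32_t)a_mask >> sh)
                : (uint64_t)((int64_t)a_mask >> sh);
}
```

The 32-bit variants truncate before shifting because, as the patch itself notes, 32-bit ops only ever produce 32-bit results, so the upper half of the mask can be cleared.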
diff --git a/disas/i386.c b/disas/i386.c index 044e02c..00ceca9 100644 --- a/disas/i386.c +++ b/disas/i386.c @@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis static void print_displacement (char *, bfd_vma); static void OP_E (int, int); static void OP_G (int, int); +static void OP_vvvv (int, int); static bfd_vma get64 (void); static bfd_signed_vma get32 (void); static bfd_signed_vma get32s (void); @@ -264,6 +265,9 @@ static int rex_used; current instruction. */ static int used_prefixes; +/* The VEX.vvvv register, unencoded. */ +static int vex_reg; + /* Flags stored in PREFIXES. */ #define PREFIX_REPZ 1 #define PREFIX_REPNZ 2 @@ -278,6 +282,10 @@ static int used_prefixes; #define PREFIX_ADDR 0x400 #define PREFIX_FWAIT 0x800 +#define PREFIX_VEX_0F 0x1000 +#define PREFIX_VEX_0F38 0x2000 +#define PREFIX_VEX_0F3A 0x4000 + /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive) to ADDR (exclusive) are valid. Returns 1 for success, longjmps on error. */ @@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define XX { NULL, 0 } +#define Bv { OP_vvvv, v_mode } #define Eb { OP_E, b_mode } #define Ev { OP_E, v_mode } #define Ed { OP_E, d_mode } @@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } } #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } } #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } } - +#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } } +#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } } #define X86_64_0 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } } #define X86_64_1 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } } @@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = { { "(bad)", { XX } }, }, + /* PREGRP105 */ + { + { "andnS", { Gv, Bv, Ev } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + }, + + /* PREGRP106 */ + { + { "bextrS", { Gv, Ev, Bv } }, + { "sarxS", { Gv, Ev, Bv } }, + { "shlxS", { Gv, Ev, Bv } }, + { "shrxS", { Gv, Ev, Bv } }, + }, + }; static const struct dis386 x86_64_table[][2] = { @@ 
-3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = { /* f0 */ { PREGRP87 }, { PREGRP88 }, + { PREGRP105 }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, - { "(bad)", { XX } }, - { "(bad)", { XX } }, + { PREGRP106 }, /* f8 */ { "(bad)", { XX } }, { "(bad)", { XX } }, @@ -3477,6 +3503,74 @@ ckprefix (void) } } +static void +ckvexprefix (void) +{ + int op, vex2, vex3, newrex = 0, newpfx = prefixes; + + if (address_mode == mode_16bit) { + return; + } + + fetch_data(the_info, codep + 1); + op = *codep; + + if (op != 0xc4 && op != 0xc5) { + return; + } + + fetch_data(the_info, codep + 2); + vex2 = codep[1]; + + if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) { + return; + } + + if (op == 0xc4) { + /* Three byte VEX prefix. */ + fetch_data(the_info, codep + 3); + vex3 = codep[2]; + + newrex |= (vex2 & 0x80 ? 0 : REX_R); + newrex |= (vex2 & 0x40 ? 0 : REX_X); + newrex |= (vex2 & 0x20 ? 0 : REX_B); + newrex |= (vex3 & 0x80 ? REX_W : 0); + switch (vex2 & 0x1f) { /* VEX.m-mmmm */ + case 1: + newpfx |= PREFIX_VEX_0F; + break; + case 2: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38; + break; + case 3: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A; + break; + } + vex2 = vex3; + codep += 3; + } else { + /* Two byte VEX prefix. */ + newrex |= (vex2 & 0x80 ? 0 : REX_R); + codep += 2; + } + + vex_reg = (~vex2 >> 3) & 15; /* VEX.vvvv */ + switch (vex2 & 3) { /* VEX.pp */ + case 1: + newpfx |= PREFIX_DATA; /* 0x66 */ + break; + case 2: + newpfx |= PREFIX_REPZ; /* 0xf3 */ + break; + case 3: + newpfx |= PREFIX_REPNZ; /* 0xf2 */ + break; + } + + rex = newrex; + prefixes = newpfx; +} + /* Return the name of the prefix byte PREF, or NULL if PREF is not a prefix byte. */ @@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info) const char *p; struct dis_private priv; unsigned char op; + unsigned char threebyte; if (info->mach == bfd_mach_x86_64_intel_syntax || info->mach == bfd_mach_x86_64) @@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info) obufp = obuf; ckprefix (); + ckvexprefix (); insn_codep = codep; sizeflag = priv.orig_sizeflag; @@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info) } op = 0; + if (prefixes & PREFIX_VEX_0F) + { + used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A; + if (prefixes & PREFIX_VEX_0F38) + threebyte = 0x38; + else if (prefixes & PREFIX_VEX_0F3A) + threebyte = 0x3a; + else + threebyte = *codep++; + goto vex_opcode; + } if (*codep == 0x0f) { - unsigned char threebyte; fetch_data(info, codep + 2); - threebyte = *++codep; + threebyte = codep[1]; + codep += 2; + vex_opcode: dp = &dis386_twobyte[threebyte]; - need_modrm = twobyte_has_modrm[*codep]; - uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep]; - uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep]; - uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep]; - uses_LOCK_prefix = (*codep & ~0x02) == 0x20; - codep++; + need_modrm = twobyte_has_modrm[threebyte]; + uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte]; + uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte]; + uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte]; + uses_LOCK_prefix = (threebyte & ~0x02) == 0x20; if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE) { fetch_data(info, codep + 2); @@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag) } } +static void +OP_vvvv (int bytemode, int sizeflags) +{ + USED_REX (REX_W); + if (rex & REX_W) { + oappend(names64[vex_reg]); + } else { + oappend(names32[vex_reg]); + } +} + 
static bfd_vma get64 (void) { diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 82658a1..c8884b3 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1866,7 +1866,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, SHIFT_IMM_ROR((0x20 - args[2]) & 0x1f) : SHIFT_IMM_LSL(0)); } else { - tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[1], 0x20); + tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[2], 0x20); tcg_out_dat_reg(s, COND_AL, ARITH_MOV, args[0], 0, args[1], SHIFT_REG_ROR(TCG_REG_TMP)); } diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 5d4cf93..fef1717 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -88,6 +88,11 @@ static const int tcg_target_call_oarg_regs[] = { #endif }; +/* Constants we accept. */ +#define TCG_CT_CONST_S32 0x100 +#define TCG_CT_CONST_U32 0x200 +#define TCG_CT_CONST_I32 0x400 + /* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. */ @@ -124,6 +129,16 @@ static bool have_movbe; # define have_movbe 0 #endif +/* We need this symbol in tcg-target.h, and we can't properly conditionalize + it there. Therefore we always define the variable. */ +bool have_bmi1; + +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) +static bool have_bmi2; +#else +# define have_bmi2 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -166,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); break; case 'c': + case_c: ct->ct |= TCG_CT_REG; tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); break; @@ -194,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xf); break; case 'r': + case_r: ct->ct |= TCG_CT_REG; if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(ct->u.regs, 0, 0xffff); @@ -201,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xff); } break; + case 'C': + /* With SHRX et al, we need not use ECX as shift count register. */ + if (have_bmi2) { + goto case_r; + } else { + goto case_c; + } /* qemu_ld/st address constraint */ case 'L': @@ -220,6 +244,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) case 'Z': ct->ct |= TCG_CT_CONST_U32; break; + case 'I': + ct->ct |= TCG_CT_CONST_I32; + break; default: return -1; @@ -243,6 +270,9 @@ static inline int tcg_target_const_match(tcg_target_long val, if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { return 1; } + if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { + return 1; + } return 0; } @@ -268,10 +298,13 @@ static inline int tcg_target_const_match(tcg_target_long val, # define P_REXB_RM 0 # define P_GS 0 #endif +#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ +#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... 
plus (ARITH_FOO << 3) */ +#define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) #define OPC_BSWAP (0xc8 | P_EXT) #define OPC_CALL_Jz (0xe8) @@ -309,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_SHIFT_1 (0xd1) #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) +#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) +#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_XCHG_ax_r32 (0x90) @@ -398,9 +434,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) rex = 0; rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ - rex |= (r & 8) >> 1; /* REX.R */ - rex |= (x & 8) >> 2; /* REX.X */ - rex |= (rm & 8) >> 3; /* REX.B */ + rex |= (r & 8) >> 1; /* REX.R */ + rex |= (x & 8) >> 2; /* REX.X */ + rex |= (rm & 8) >> 3; /* REX.B */ /* P_REXB_{R,RM} indicates that the given register is the low byte. For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, @@ -449,6 +485,48 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +{ + int tmp; + + if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) { + /* Three byte VEX prefix. */ + tcg_out8(s, 0xc4); + + /* VEX.m-mmmm */ + if (opc & P_EXT38) { + tmp = 2; + } else if (opc & P_EXT) { + tmp = 1; + } else { + tcg_abort(); + } + tmp |= 0x40; /* VEX.X */ + tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ + tcg_out8(s, tmp); + + tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ + } else { + /* Two byte VEX prefix. */ + tcg_out8(s, 0xc5); + + tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + } + /* VEX.pp */ + if (opc & P_DATA16) { + tmp |= 1; /* 0x66 */ + } else if (opc & P_SIMDF3) { + tmp |= 2; /* 0xf3 */ + } else if (opc & P_SIMDF2) { + tmp |= 3; /* 0xf2 */ + } + tmp |= (~v & 15) << 3; /* VEX.vvvv */ + tcg_out8(s, tmp); + tcg_out8(s, opc); + tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); +} + /* Output an opcode with a full "rm + (index<<shift) + offset" address mode. We handle either RM and INDEX missing with a negative value. In 64-bit mode for absolute addresses, ~RM is the size of the immediate operand @@ -1638,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) { - int c, rexw = 0; + int c, vexop, rexw = 0; #if TCG_TARGET_REG_BITS == 64 # define OP_32_64(x) \ @@ -1774,6 +1852,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; + OP_32_64(andc): + if (const_args[2]) { + tcg_out_mov(s, rexw ? 
TCG_TYPE_I64 : TCG_TYPE_I32, + args[0], args[1]); + tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0); + } else { + tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]); + } + break; + OP_32_64(mul): if (const_args[2]) { int32_t val; @@ -1799,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, OP_32_64(shl): c = SHIFT_SHL; - goto gen_shift; + vexop = OPC_SHLX; + goto gen_shift_maybe_vex; OP_32_64(shr): c = SHIFT_SHR; - goto gen_shift; + vexop = OPC_SHRX; + goto gen_shift_maybe_vex; OP_32_64(sar): c = SHIFT_SAR; - goto gen_shift; + vexop = OPC_SARX; + goto gen_shift_maybe_vex; OP_32_64(rotl): c = SHIFT_ROL; goto gen_shift; OP_32_64(rotr): c = SHIFT_ROR; goto gen_shift; + gen_shift_maybe_vex: + if (have_bmi2 && !const_args[2]) { + tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); + break; + } + /* FALLTHRU */ gen_shift: if (const_args[2]) { tcg_out_shifti(s, c + rexw, args[0], args[2]); @@ -2002,10 +2099,11 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i32, { "r", "0", "ri" } }, { INDEX_op_or_i32, { "r", "0", "ri" } }, { INDEX_op_xor_i32, { "r", "0", "ri" } }, + { INDEX_op_andc_i32, { "r", "r", "ri" } }, - { INDEX_op_shl_i32, { "r", "0", "ci" } }, - { INDEX_op_shr_i32, { "r", "0", "ci" } }, - { INDEX_op_sar_i32, { "r", "0", "ci" } }, + { INDEX_op_shl_i32, { "r", "0", "Ci" } }, + { INDEX_op_shr_i32, { "r", "0", "Ci" } }, + { INDEX_op_sar_i32, { "r", "0", "Ci" } }, { INDEX_op_rotl_i32, { "r", "0", "ci" } }, { INDEX_op_rotr_i32, { "r", "0", "ci" } }, @@ -2059,10 +2157,11 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i64, { "r", "0", "reZ" } }, { INDEX_op_or_i64, { "r", "0", "re" } }, { INDEX_op_xor_i64, { "r", "0", "re" } }, + { INDEX_op_andc_i64, { "r", "r", "rI" } }, - { INDEX_op_shl_i64, { "r", "0", "ci" } }, - { INDEX_op_shr_i64, { "r", "0", "ci" } }, - { INDEX_op_sar_i64, { "r", "0", "ci" } }, + { INDEX_op_shl_i64, { "r", "0", "Ci" } }, + { INDEX_op_shr_i64, { "r", "0", "Ci" } }, + { INDEX_op_sar_i64, { "r", "0", "Ci" } }, { INDEX_op_rotl_i64, { "r", "0", "ci" } }, { INDEX_op_rotr_i64, { "r", "0", "ci" } }, @@ -2196,25 +2295,34 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { -#if !(defined(have_cmov) && defined(have_movbe)) - { - unsigned a, b, c, d; - int ret = __get_cpuid(1, &a, &b, &c, &d); + unsigned a, b, c, d; + int max = __get_cpuid_max(0, 0); -# ifndef have_cmov + if (max >= 1) { + __cpuid(1, a, b, c, d); +#ifndef have_cmov /* For 32-bit, 99% certainty that we're running on hardware that supports cmov, but we still need to check. In case cmov is not available, we'll use a small forward branch. */ - have_cmov = ret && (d & bit_CMOV); -# endif - -# ifndef have_movbe + have_cmov = (d & bit_CMOV) != 0; +#endif +#ifndef have_movbe /* MOVBE is only available on Intel Atom and Haswell CPUs, so we need to probe for it. */ - have_movbe = ret && (c & bit_MOVBE); -# endif + have_movbe = (c & bit_MOVBE) != 0; +#endif } + + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
*/ + __cpuid_count(7, 0, a, b, c, d); +#ifdef bit_BMI + have_bmi1 = (b & bit_BMI) != 0; #endif +#ifndef have_bmi2 + have_bmi2 = (b & bit_BMI2) != 0; +#endif + } if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff); diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 92c0fcd..bdf2222 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -64,9 +64,6 @@ typedef enum { TCG_REG_RDI = TCG_REG_EDI, } TCGReg; -#define TCG_CT_CONST_S32 0x100 -#define TCG_CT_CONST_U32 0x200 - /* used for function call generation */ #define TCG_REG_CALL_STACK TCG_REG_ESP #define TCG_TARGET_STACK_ALIGN 16 @@ -76,6 +73,8 @@ typedef enum { #define TCG_TARGET_CALL_STACK_OFFSET 0 #endif +extern bool have_bmi1; + /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 #define TCG_TARGET_HAS_rot_i32 1 @@ -87,7 +86,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap32_i32 1 #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 -#define TCG_TARGET_HAS_andc_i32 0 +#define TCG_TARGET_HAS_andc_i32 have_bmi1 #define TCG_TARGET_HAS_orc_i32 0 #define TCG_TARGET_HAS_eqv_i32 0 #define TCG_TARGET_HAS_nand_i32 0 @@ -115,7 +114,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 -#define TCG_TARGET_HAS_andc_i64 0 +#define TCG_TARGET_HAS_andc_i64 have_bmi1 #define TCG_TARGET_HAS_orc_i64 0 #define TCG_TARGET_HAS_eqv_i64 0 #define TCG_TARGET_HAS_nand_i64 0 diff --git a/tcg/optimize.c b/tcg/optimize.c index 89e2d6a..7777743 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -655,11 +655,68 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, } } break; + CASE_OP_32_64(xor): + CASE_OP_32_64(nand): + if (temps[args[1]].state != TCG_TEMP_CONST + && temps[args[2]].state == TCG_TEMP_CONST + && temps[args[2]].val == -1) { + i = 1; + goto try_not; + } + break; + CASE_OP_32_64(nor): + if (temps[args[1]].state != TCG_TEMP_CONST + && temps[args[2]].state == TCG_TEMP_CONST + && temps[args[2]].val == 0) { + i = 1; + goto try_not; + } + break; + CASE_OP_32_64(andc): + if (temps[args[2]].state != TCG_TEMP_CONST + && temps[args[1]].state == TCG_TEMP_CONST + && temps[args[1]].val == -1) { + i = 2; + goto try_not; + } + break; + CASE_OP_32_64(orc): + CASE_OP_32_64(eqv): + if (temps[args[2]].state != TCG_TEMP_CONST + && temps[args[1]].state == TCG_TEMP_CONST + && temps[args[1]].val == 0) { + i = 2; + goto try_not; + } + break; + try_not: + { + TCGOpcode not_op; + bool have_not; + + if (def->flags & TCG_OPF_64BIT) { + not_op = INDEX_op_not_i64; + have_not = TCG_TARGET_HAS_not_i64; + } else { + not_op = INDEX_op_not_i32; + have_not = TCG_TARGET_HAS_not_i32; + } + if (!have_not) { + break; + } + s->gen_opc_buf[op_index] = not_op; + reset_temp(args[0]); + gen_args[0] = args[0]; + gen_args[1] = args[i]; + args += 3; + gen_args += 2; + continue; + } default: break; } - /* Simplify expression for "op r, a, 0 => mov r, a" cases */ + /* Simplify expression for "op r, a, const => mov r, a" cases */ switch (op) { CASE_OP_32_64(add): CASE_OP_32_64(sub): @@ -670,28 +727,38 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, CASE_OP_32_64(rotr): CASE_OP_32_64(or): CASE_OP_32_64(xor): - if (temps[args[1]].state == TCG_TEMP_CONST) { - /* Proceed with possible constant folding. 
*/ - break; - } - if (temps[args[2]].state == TCG_TEMP_CONST + CASE_OP_32_64(andc): + if (temps[args[1]].state != TCG_TEMP_CONST + && temps[args[2]].state == TCG_TEMP_CONST && temps[args[2]].val == 0) { - if (temps_are_copies(args[0], args[1])) { - s->gen_opc_buf[op_index] = INDEX_op_nop; - } else { - s->gen_opc_buf[op_index] = op_to_mov(op); - tcg_opt_gen_mov(s, gen_args, args[0], args[1]); - gen_args += 2; - } - args += 3; - continue; + goto do_mov3; + } + break; + CASE_OP_32_64(and): + CASE_OP_32_64(orc): + CASE_OP_32_64(eqv): + if (temps[args[1]].state != TCG_TEMP_CONST + && temps[args[2]].state == TCG_TEMP_CONST + && temps[args[2]].val == -1) { + goto do_mov3; } break; + do_mov3: + if (temps_are_copies(args[0], args[1])) { + s->gen_opc_buf[op_index] = INDEX_op_nop; + } else { + s->gen_opc_buf[op_index] = op_to_mov(op); + tcg_opt_gen_mov(s, gen_args, args[0], args[1]); + gen_args += 2; + } + args += 3; + continue; default: break; } - /* Simplify using known-zero bits */ + /* Simplify using known-zero bits. Currently only ops with a single + output argument is supported. */ mask = -1; affected = -1; switch (op) { @@ -726,16 +793,36 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, mask = temps[args[1]].mask & mask; break; - CASE_OP_32_64(sar): + CASE_OP_32_64(andc): + /* Known-zeros does not imply known-ones. Therefore unless + args[2] is constant, we can't infer anything from it. */ if (temps[args[2]].state == TCG_TEMP_CONST) { - mask = ((tcg_target_long)temps[args[1]].mask - >> temps[args[2]].val); + mask = ~temps[args[2]].mask; + goto and_const; } + /* But we certainly know nothing outside args[1] may be set. */ + mask = temps[args[1]].mask; break; - CASE_OP_32_64(shr): + case INDEX_op_sar_i32: if (temps[args[2]].state == TCG_TEMP_CONST) { - mask = temps[args[1]].mask >> temps[args[2]].val; + mask = (int32_t)temps[args[1]].mask >> temps[args[2]].val; + } + break; + case INDEX_op_sar_i64: + if (temps[args[2]].state == TCG_TEMP_CONST) { + mask = (int64_t)temps[args[1]].mask >> temps[args[2]].val; + } + break; + + case INDEX_op_shr_i32: + if (temps[args[2]].state == TCG_TEMP_CONST) { + mask = (uint32_t)temps[args[1]].mask >> temps[args[2]].val; + } + break; + case INDEX_op_shr_i64: + if (temps[args[2]].state == TCG_TEMP_CONST) { + mask = (uint64_t)temps[args[1]].mask >> temps[args[2]].val; } break; @@ -769,10 +856,40 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, mask = temps[args[3]].mask | temps[args[4]].mask; break; + CASE_OP_32_64(ld8u): + case INDEX_op_qemu_ld8u: + mask = 0xff; + break; + CASE_OP_32_64(ld16u): + case INDEX_op_qemu_ld16u: + mask = 0xffff; + break; + case INDEX_op_ld32u_i64: +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_qemu_ld32u: +#endif + mask = 0xffffffffu; + break; + + CASE_OP_32_64(qemu_ld): + { + TCGMemOp mop = args[def->nb_oargs + def->nb_iargs]; + if (!(mop & MO_SIGN)) { + mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1; + } + } + break; + default: break; } + /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit + results */ + if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) { + mask &= 0xffffffffu; + } + if (mask == 0) { assert(def->nb_oargs == 1); s->gen_opc_buf[op_index] = op_to_movi(op); @@ -839,6 +956,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, /* Simplify expression for "op r, a, a => movi r, 0" cases */ switch (op) { + CASE_OP_32_64(andc): CASE_OP_32_64(sub): CASE_OP_32_64(xor): if (temps_are_copies(args[1], args[2])) { @@ -1140,6 
+1258,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, } else { for (i = 0; i < def->nb_oargs; i++) { reset_temp(args[i]); + /* Save the corresponding known-zero bits mask for the + first output argument (only one supported so far). */ + if (i == 0) { + temps[args[i]].mask = mask; + } } } for (i = 0; i < def->nb_args; i++) { @@ -526,7 +526,7 @@ static inline int tcg_temp_new_internal(TCGType type, int temp_local) ts->temp_local = temp_local; ts->name = NULL; ts++; - ts->base_type = TCG_TYPE_I32; + ts->base_type = type; ts->type = TCG_TYPE_I32; ts->temp_allocated = 1; ts->temp_local = temp_local; |
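
The tcg_target_init hunk above gates have_bmi1/have_bmi2 on CPUID leaf 7. A minimal standalone probe along the same lines might look like this (GCC/Clang only; raw EBX bit positions are used here instead of the bit_BMI/bit_BMI2 macros, which the patch has to guard against being absent from older <cpuid.h> headers):

```c
#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    unsigned a, b, c, d;
    bool bmi1 = false, bmi2 = false;

    /* Leaf 7, sub-leaf 0: structured extended feature flags. */
    if (__get_cpuid_max(0, 0) >= 7) {
        __cpuid_count(7, 0, a, b, c, d);
        bmi1 = (b & (1u << 3)) != 0;   /* EBX bit 3: BMI1 (ANDN, BEXTR, ...)  */
        bmi2 = (b & (1u << 8)) != 0;   /* EBX bit 8: BMI2 (SHLX/SHRX/SARX, ...) */
    }
    printf("BMI1: %d  BMI2: %d\n", bmi1, bmi2);
    return 0;
}
```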