diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2025-09-05 09:51:27 +0200 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2025-09-05 09:51:27 +0200 |
commit | 6a9fa5ef3230a7d51e0d953a59ee9ef10af705b8 (patch) | |
tree | fef48da5efa7cc3b5450a848ded462271ff20fd4 /tcg | |
parent | baa79455fa92984ff0f4b9ae94bed66823177a27 (diff) | |
parent | cb2540979264c8d3984e26c5dd90a840e47ec5dd (diff) | |
download | qemu-master.zip qemu-master.tar.gz qemu-master.tar.bz2 |
tcg/arm: Fix tgen_deposit
tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts
# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmi6lgYdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV9zUggAjXoSFDgMz3yr959F
# e6pSGkV+UIAYZ+fm9TAFQuKccUlEjX6Sq6sxV1my2ODnUnwFF1sV6rx8TG1VHFL/
# GxADQuwY3/6tsiZ24drU8oaocxISi91Km+5P7xwrAbdhSGVMJakzQqTPS178l1Fw
# pXRWN9Offz74gKKUxk6AiPyCUPZutUiM6Hwe5wZSwWIxSoEQWwnAoH8lTPrzAD/Z
# Bo0Cs/LHzmeantok7BRKTlQT4wpvCwRIunkD1V28zdFN63Ny6qTsbxtbRxmKvYC7
# UKli29d/KxFad1ccTNGo9DpFKBB9xHb7W4gBzSrJm9D1bWKcL4wLTmp29Z9aWWpW
# TnsyaQ==
# =8WbV
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 05 Sep 2025 09:49:26 AM CEST
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]
* tag 'pull-tcg-20250905' of https://gitlab.com/rth7680/qemu:
tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts
tcg/i386: Add INDEX_op_x86_vgf2p8affineqb_vec
tcg/i386: Use canonical operand ordering in expand_vec_sari
tcg/i386: Expand sari of bits-1 as pcmpgt
cpuinfo/i386: Detect GFNI as an AVX extension
tcg/arm: Fix tgen_deposit
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'tcg')
-rw-r--r-- | tcg/arm/tcg-target.c.inc | 3 | ||||
-rw-r--r-- | tcg/i386/tcg-target-opc.h.inc | 1 | ||||
-rw-r--r-- | tcg/i386/tcg-target.c.inc | 91 |
3 files changed, 88 insertions, 7 deletions
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 836894b..338c57b 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -975,7 +975,8 @@ static void tgen_deposit(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1, TCGReg a2, unsigned ofs, unsigned len) { /* bfi/bfc */ - tcg_out32(s, 0x07c00010 | (COND_AL << 28) | (a0 << 12) | a1 + tcg_debug_assert(a0 == a1); + tcg_out32(s, 0x07c00010 | (COND_AL << 28) | (a0 << 12) | a2 | (ofs << 7) | ((ofs + len - 1) << 16)); } diff --git a/tcg/i386/tcg-target-opc.h.inc b/tcg/i386/tcg-target-opc.h.inc index 8cc0dba..8a5cb34 100644 --- a/tcg/i386/tcg-target-opc.h.inc +++ b/tcg/i386/tcg-target-opc.h.inc @@ -35,3 +35,4 @@ DEF(x86_punpckh_vec, 1, 2, 0, TCG_OPF_VECTOR) DEF(x86_vpshldi_vec, 1, 2, 1, TCG_OPF_VECTOR) DEF(x86_vpshldv_vec, 1, 3, 0, TCG_OPF_VECTOR) DEF(x86_vpshrdv_vec, 1, 3, 0, TCG_OPF_VECTOR) +DEF(x86_vgf2p8affineqb_vec, 1, 2, 1, TCG_OPF_VECTOR) diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index 088c6c9..ee27266 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -451,6 +451,7 @@ static bool tcg_target_const_match(int64_t val, int ct, #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) +#define OPC_VGF2P8AFFINEQB (0xce | P_EXT3A | P_DATA16 | P_VEXW) #define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX) #define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX) #define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX) @@ -4084,6 +4085,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, insn = vpshldi_insn[vece]; sub = args[3]; goto gen_simd_imm8; + case INDEX_op_x86_vgf2p8affineqb_vec: + insn = OPC_VGF2P8AFFINEQB; + sub = args[3]; + goto gen_simd_imm8; case INDEX_op_not_vec: insn = OPC_VPTERNLOGQ; @@ -4188,6 +4193,7 @@ tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags) case INDEX_op_x86_punpckl_vec: case INDEX_op_x86_punpckh_vec: case INDEX_op_x86_vpshldi_vec: + case INDEX_op_x86_vgf2p8affineqb_vec: #if TCG_TARGET_REG_BITS == 32 case INDEX_op_dup2_vec: #endif @@ -4336,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) } } +static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0, + TCGv_vec v1, uint64_t matrix) +{ + vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8, + tcgv_vec_arg(v0), tcgv_vec_arg(v1), + tcgv_vec_arg(tcg_constant_vec(type, MO_64, matrix)), 0); +} + static void expand_vec_shi(TCGType type, unsigned vece, bool right, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { + static const uint64_t gf2_shi[2][8] = { + /* left shift */ + { 0, + 0x0001020408102040ull, + 0x0000010204081020ull, + 0x0000000102040810ull, + 0x0000000001020408ull, + 0x0000000000010204ull, + 0x0000000000000102ull, + 0x0000000000000001ull }, + /* right shift */ + { 0, + 0x0204081020408000ull, + 0x0408102040800000ull, + 0x0810204080000000ull, + 0x1020408000000000ull, + 0x2040800000000000ull, + 0x4080000000000000ull, + 0x8000000000000000ull } + }; uint8_t mask; tcg_debug_assert(vece == MO_8); + + if (cpuinfo & CPUINFO_GFNI) { + gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]); + return; + } + if (right) { mask = 0xff >> imm; tcg_gen_shri_vec(MO_16, v0, v1, imm); @@ -4355,10 +4395,31 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right, static void expand_vec_sari(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { + static const uint64_t gf2_sar[8] = { + 0, + 0x0204081020408080ull, + 0x0408102040808080ull, + 0x0810204080808080ull, + 0x1020408080808080ull, + 0x2040808080808080ull, + 0x4080808080808080ull, + 0x8080808080808080ull, + }; TCGv_vec t1, t2; + if (imm >= (8 << vece) - 1) { + tcg_gen_cmp_vec(TCG_COND_LT, vece, v0, v1, + tcg_constant_vec(type, MO_64, 0)); + return; + } + switch (vece) { case MO_8: + if (cpuinfo & CPUINFO_GFNI) { + gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]); + break; + } + /* Unpack to 16-bit, shift, and repack. */ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); @@ -4393,8 +4454,8 @@ static void expand_vec_sari(TCGType type, unsigned vece, /* Otherwise we will need to use a compare vs 0 to produce * the sign-extend, shift and merge. */ - tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, - tcg_constant_vec(type, MO_64, 0), v1); + tcg_gen_cmp_vec(TCG_COND_LT, MO_64, t1, v1, + tcg_constant_vec(type, MO_64, 0)); tcg_gen_shri_vec(MO_64, v0, v1, imm); tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); tcg_gen_or_vec(MO_64, v0, v0, t1); @@ -4410,12 +4471,30 @@ static void expand_vec_sari(TCGType type, unsigned vece, static void expand_vec_rotli(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { + static const uint64_t gf2_rol[8] = { + 0, + 0x8001020408102040ull, + 0x4080010204081020ull, + 0x2040800102040810ull, + 0x1020408001020408ull, + 0x0810204080010204ull, + 0x0408102040800102ull, + 0x0204081020408001ull, + }; TCGv_vec t; - if (vece != MO_8 && have_avx512vbmi2) { - vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, - tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); - return; + if (vece == MO_8) { + if (cpuinfo & CPUINFO_GFNI) { + gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]); + return; + } + } else { + if (have_avx512vbmi2) { + vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(v1), + tcgv_vec_arg(v1), imm); + return; + } } t = tcg_temp_new_vec(type); |