Merge tag 'pull-tcg-20250905' of https://gitlab.com/rth7680/qemu into stagingHEAD staging master

tcg/arm: Fix tgen_deposit tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts # -----BEGIN PGP SIGNATURE----- # # iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmi6lgYdHHJpY2hhcmQu # aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV9zUggAjXoSFDgMz3yr959F # e6pSGkV+UIAYZ+fm9TAFQuKccUlEjX6Sq6sxV1my2ODnUnwFF1sV6rx8TG1VHFL/ # GxADQuwY3/6tsiZ24drU8oaocxISi91Km+5P7xwrAbdhSGVMJakzQqTPS178l1Fw # pXRWN9Offz74gKKUxk6AiPyCUPZutUiM6Hwe5wZSwWIxSoEQWwnAoH8lTPrzAD/Z # Bo0Cs/LHzmeantok7BRKTlQT4wpvCwRIunkD1V28zdFN63Ny6qTsbxtbRxmKvYC7 # UKli29d/KxFad1ccTNGo9DpFKBB9xHb7W4gBzSrJm9D1bWKcL4wLTmp29Z9aWWpW # TnsyaQ== # =8WbV # -----END PGP SIGNATURE----- # gpg: Signature made Fri 05 Sep 2025 09:49:26 AM CEST # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate] * tag 'pull-tcg-20250905' of https://gitlab.com/rth7680/qemu: tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts tcg/i386: Add INDEX_op_x86_vgf2p8affineqb_vec tcg/i386: Use canonical operand ordering in expand_vec_sari tcg/i386: Expand sari of bits-1 as pcmpgt cpuinfo/i386: Detect GFNI as an AVX extension tcg/arm: Fix tgen_deposit Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
author: Richard Henderson <richard.henderson@linaro.org> 2025-09-05 09:51:27 +0200
committer: Richard Henderson <richard.henderson@linaro.org> 2025-09-05 09:51:27 +0200
commit: 6a9fa5ef3230a7d51e0d953a59ee9ef10af705b8 (patch)
tree: fef48da5efa7cc3b5450a848ded462271ff20fd4 /tcg
parent: baa79455fa92984ff0f4b9ae94bed66823177a27 (diff)
parent: cb2540979264c8d3984e26c5dd90a840e47ec5dd (diff)
download: qemu-master.zip
qemu-master.tar.gz
qemu-master.tar.bz2
3 files changed, 88 insertions, 7 deletions
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 836894b..338c57b 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -975,7 +975,8 @@ static void tgen_deposit(TCGContext *s, TCGType type, TCGReg a0, TCGReg a1,
                          TCGReg a2, unsigned ofs, unsigned len)
 {
     /* bfi/bfc */
-    tcg_out32(s, 0x07c00010 | (COND_AL << 28) | (a0 << 12) | a1
+    tcg_debug_assert(a0 == a1);
+    tcg_out32(s, 0x07c00010 | (COND_AL << 28) | (a0 << 12) | a2
               | (ofs << 7) | ((ofs + len - 1) << 16));
 }
 
diff --git a/tcg/i386/tcg-target-opc.h.inc b/tcg/i386/tcg-target-opc.h.inc
index 8cc0dba..8a5cb34 100644
--- a/tcg/i386/tcg-target-opc.h.inc
+++ b/tcg/i386/tcg-target-opc.h.inc
@@ -35,3 +35,4 @@ DEF(x86_punpckh_vec, 1, 2, 0, TCG_OPF_VECTOR)
 DEF(x86_vpshldi_vec, 1, 2, 1, TCG_OPF_VECTOR)
 DEF(x86_vpshldv_vec, 1, 3, 0, TCG_OPF_VECTOR)
 DEF(x86_vpshrdv_vec, 1, 3, 0, TCG_OPF_VECTOR)
+DEF(x86_vgf2p8affineqb_vec, 1, 2, 1, TCG_OPF_VECTOR)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 088c6c9..ee27266 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -451,6 +451,7 @@ static bool tcg_target_const_match(int64_t val, int ct,
 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VGF2P8AFFINEQB (0xce | P_EXT3A | P_DATA16 | P_VEXW)
 #define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
 #define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 #define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
@@ -4084,6 +4085,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         insn = vpshldi_insn[vece];
         sub = args[3];
         goto gen_simd_imm8;
+    case INDEX_op_x86_vgf2p8affineqb_vec:
+        insn = OPC_VGF2P8AFFINEQB;
+        sub = args[3];
+        goto gen_simd_imm8;
 
     case INDEX_op_not_vec:
         insn = OPC_VPTERNLOGQ;
@@ -4188,6 +4193,7 @@ tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags)
     case INDEX_op_x86_punpckl_vec:
     case INDEX_op_x86_punpckh_vec:
     case INDEX_op_x86_vpshldi_vec:
+    case INDEX_op_x86_vgf2p8affineqb_vec:
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_dup2_vec:
 #endif
@@ -4336,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     }
 }
 
+static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0,
+                                TCGv_vec v1, uint64_t matrix)
+{
+    vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8,
+              tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+              tcgv_vec_arg(tcg_constant_vec(type, MO_64, matrix)), 0);
+}
+
 static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_shi[2][8] = {
+        /* left shift */
+        { 0,
+          0x0001020408102040ull,
+          0x0000010204081020ull,
+          0x0000000102040810ull,
+          0x0000000001020408ull,
+          0x0000000000010204ull,
+          0x0000000000000102ull,
+          0x0000000000000001ull },
+        /* right shift */
+        { 0,
+          0x0204081020408000ull,
+          0x0408102040800000ull,
+          0x0810204080000000ull,
+          0x1020408000000000ull,
+          0x2040800000000000ull,
+          0x4080000000000000ull,
+          0x8000000000000000ull }
+    };
     uint8_t mask;
 
     tcg_debug_assert(vece == MO_8);
+
+    if (cpuinfo & CPUINFO_GFNI) {
+        gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]);
+        return;
+    }
+
     if (right) {
         mask = 0xff >> imm;
         tcg_gen_shri_vec(MO_16, v0, v1, imm);
@@ -4355,10 +4395,31 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right,
 static void expand_vec_sari(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_sar[8] = {
+        0,
+        0x0204081020408080ull,
+        0x0408102040808080ull,
+        0x0810204080808080ull,
+        0x1020408080808080ull,
+        0x2040808080808080ull,
+        0x4080808080808080ull,
+        0x8080808080808080ull,
+    };
     TCGv_vec t1, t2;
 
+    if (imm >= (8 << vece) - 1) {
+        tcg_gen_cmp_vec(TCG_COND_LT, vece, v0, v1,
+                        tcg_constant_vec(type, MO_64, 0));
+        return;
+    }
+
     switch (vece) {
     case MO_8:
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]);
+            break;
+        }
+
         /* Unpack to 16-bit, shift, and repack.  */
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
@@ -4393,8 +4454,8 @@ static void expand_vec_sari(TCGType type, unsigned vece,
             /* Otherwise we will need to use a compare vs 0 to produce
              * the sign-extend, shift and merge.
              */
-            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
-                            tcg_constant_vec(type, MO_64, 0), v1);
+            tcg_gen_cmp_vec(TCG_COND_LT, MO_64, t1, v1,
+                            tcg_constant_vec(type, MO_64, 0));
             tcg_gen_shri_vec(MO_64, v0, v1, imm);
             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
             tcg_gen_or_vec(MO_64, v0, v0, t1);
@@ -4410,12 +4471,30 @@ static void expand_vec_sari(TCGType type, unsigned vece,
 static void expand_vec_rotli(TCGType type, unsigned vece,
                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_rol[8] = {
+        0,
+        0x8001020408102040ull,
+        0x4080010204081020ull,
+        0x2040800102040810ull,
+        0x1020408001020408ull,
+        0x0810204080010204ull,
+        0x0408102040800102ull,
+        0x0204081020408001ull,
+    };
     TCGv_vec t;
 
-    if (vece != MO_8 && have_avx512vbmi2) {
-        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
-                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
-        return;
+    if (vece == MO_8) {
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]);
+            return;
+        }
+    } else {
+        if (have_avx512vbmi2) {
+            vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
+                      tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+                      tcgv_vec_arg(v1), imm);
+            return;
+        }
     }
 
     t = tcg_temp_new_vec(type);
author	Richard Henderson <richard.henderson@linaro.org>	2025-09-05 09:51:27 +0200
committer	Richard Henderson <richard.henderson@linaro.org>	2025-09-05 09:51:27 +0200
commit	6a9fa5ef3230a7d51e0d953a59ee9ef10af705b8 (patch)
tree	fef48da5efa7cc3b5450a848ded462271ff20fd4 /tcg
parent	baa79455fa92984ff0f4b9ae94bed66823177a27 (diff)
parent	cb2540979264c8d3984e26c5dd90a840e47ec5dd (diff)
download	qemu-master.zip qemu-master.tar.gz qemu-master.tar.bz2