target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

Moved VPRTYBW and VPRTYBD to use gvec and both of them and VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and .fni8, respectively. vprtybw: rept loop master patch 8 12500 0,01198900 0,00703100 (-41.4%) 25 4000 0,01070100 0,00571400 (-46.6%) 100 1000 0,01123300 0,00678200 (-39.6%) 500 200 0,01601500 0,01535600 (-4.1%) 2500 40 0,03872900 0,05562100 (43.6%) 8000 12 0,10047000 0,16643000 (65.7%) vprtybd: rept loop master patch 8 12500 0,00757700 0,00788100 (4.0%) 25 4000 0,00652500 0,00669600 (2.6%) 100 1000 0,00714400 0,00825400 (15.5%) 500 200 0,01211000 0,01903700 (57.2%) 2500 40 0,03483800 0,07021200 (101.5%) 8000 12 0,09591800 0,21036200 (119.3%) vprtybq: rept loop master patch 8 12500 0,00675600 0,00667200 (-1.2%) 25 4000 0,00619400 0,00643200 (3.8%) 100 1000 0,00707100 0,00751100 (6.2%) 500 200 0,01199300 0,01342000 (11.9%) 2500 40 0,03490900 0,04092900 (17.2%) 8000 12 0,09588200 0,11465100 (19.6%) I wasn't expecting such a performance lost in both VPRTYBD and VPRTYBQ, I'm not sure if it's worth to move those instructions. Comparing the assembly of the helper with the TCGop they are pretty similar, so I'm not sure why vprtybd took so much more time. Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Message-Id: <20221019125040.48028-6-lucas.araujo@eldorado.org.br> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
author: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br> 2022-10-19 09:50:33 -0300
committer: Daniel Henrique Barboza <danielhb413@gmail.com> 2022-10-28 13:15:22 -0300
commit: d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882 (patch)
tree: c6753cae39af4f88408d20bb5222779948d747dd
parent: 90b5aadb09b1063dfc34636efe212c6862cbe32f (diff)
download: qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.zip
qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.tar.gz
qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.tar.bz2
5 files changed, 71 insertions, 33 deletions
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b2e910b..a06193b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2fafde0..b05c89e 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM          000100 ..... ..... ..... 11111000100    @VX
 VPDEPD          000100 ..... ..... ..... 10111001101    @VX
 VPEXTD          000100 ..... ..... ..... 10110001101    @VX
 
+VPRTYBD         000100 ..... 01001 ..... 11000000010    @VX_tb
+VPRTYBQ         000100 ..... 01010 ..... 11000000010    @VX_tb
+VPRTYBW         000100 ..... 01000 ..... 11000000010    @VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX      000100 ..... ..... ..... ..... 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c7fd0d1..c6ce466 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env)
     env->vscr_sat.u32[0] = 1;
 }
 
-/* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
-{
-    int i;
-    for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-        uint64_t res = b->u32[i] ^ (b->u32[i] >> 16);
-        res ^= res >> 8;
-        r->u32[i] = res & 1;
-    }
-}
-
-/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
-{
-    int i;
-    for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-        uint64_t res = b->u64[i] ^ (b->u64[i] >> 32);
-        res ^= res >> 16;
-        res ^= res >> 8;
-        r->u64[i] = res & 1;
-    }
-}
-
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
     uint64_t res = b->u64[0] ^ b->u64[1];
     res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index b9a9e83..cbb2a3e 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,71 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 24);
-GEN_VXFORM_NOA(vprtybd, 1, 24);
-GEN_VXFORM_NOA(vprtybq, 1, 24);
+
+static void gen_vprtyb_vec(unsigned vece, TCGv_vec t, TCGv_vec b)
+{
+    int i;
+    TCGv_vec tmp = tcg_temp_new_vec_matching(b);
+    /* MO_32 is 2, so 2 iteractions for MO_32 and 3 for MO_64 */
+    for (i = 0; i < vece; i++) {
+        tcg_gen_shri_vec(vece, tmp, b, (4 << (vece - i)));
+        tcg_gen_xor_vec(vece, b, tmp, b);
+    }
+    tcg_gen_and_vec(vece, t, b, tcg_constant_vec_matching(t, vece, 1));
+    tcg_temp_free_vec(tmp);
+}
+
+/* vprtybw */
+static void gen_vprtyb_i32(TCGv_i32 t, TCGv_i32 b)
+{
+    tcg_gen_ctpop_i32(t, b);
+    tcg_gen_and_i32(t, t, tcg_constant_i32(1));
+}
+
+/* vprtybd */
+static void gen_vprtyb_i64(TCGv_i64 t, TCGv_i64 b)
+{
+    tcg_gen_ctpop_i64(t, b);
+    tcg_gen_and_i64(t, t, tcg_constant_i64(1));
+}
+
+static bool do_vx_vprtyb(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+    static const TCGOpcode vecop_list[] = {
+        INDEX_op_shri_vec, 0
+    };
+
+    static const GVecGen2 op[] = {
+        {
+            .fniv = gen_vprtyb_vec,
+            .fni4 = gen_vprtyb_i32,
+            .opt_opc = vecop_list,
+            .vece = MO_32
+        },
+        {
+            .fniv = gen_vprtyb_vec,
+            .fni8 = gen_vprtyb_i64,
+            .opt_opc = vecop_list,
+            .vece = MO_64
+        },
+        {
+            .fno = gen_helper_VPRTYBQ,
+            .vece = MO_128
+        },
+    };
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+    REQUIRE_VECTOR(ctx);
+
+    tcg_gen_gvec_2(avr_full_offset(a->vrt), avr_full_offset(a->vrb),
+                   16, 16, &op[vece - MO_32]);
+
+    return true;
+}
+
+TRANS(VPRTYBW, do_vx_vprtyb, MO_32)
+TRANS(VPRTYBD, do_vx_vprtyb, MO_64)
+TRANS(VPRTYBQ, do_vx_vprtyb, MO_128)
 
 static void gen_vsplt(DisasContext *ctx, int vece)
 {
diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc
index 2790853..46a620a 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -106,9 +106,6 @@ GEN_VXFORM_300(vsrv, 2, 28),
 GEN_VXFORM_300(vslv, 2, 29),
 GEN_VXFORM(vslo, 6, 16),
 GEN_VXFORM(vsro, 6, 17),
-GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300),
-GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300),
-GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300),
 
 GEN_VXFORM(xpnd04_1, 0, 22),
 GEN_VXFORM_300(bcdsr, 0, 23),
author	Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>	2022-10-19 09:50:33 -0300
committer	Daniel Henrique Barboza <danielhb413@gmail.com>	2022-10-28 13:15:22 -0300
commit	d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882 (patch)
tree	c6753cae39af4f88408d20bb5222779948d747dd
parent	90b5aadb09b1063dfc34636efe212c6862cbe32f (diff)
download	qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.zip qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.tar.gz qemu-d57fbd8fd9ea4f559cf4ed6c8fb6f064b73d6882.tar.bz2