From d14f33976282a8744ca1bf1d64e73996c145aa3f Mon Sep 17 00:00:00 2001 From: "Maxiwell S. Garcia" Date: Thu, 11 Jul 2019 16:47:02 -0300 Subject: migration: Do not re-read the clock on pre_save in case of paused guest Re-read the timebase before migrate was ported from x86 commit: 6053a86fe7bd: kvmclock: reduce kvmclock difference on migration The clock move makes the guest knows about the paused time between the stop and migrate commands. This is an issue in an already-paused VM because some side effects, like process stalls, could happen after migration. So, this patch checks the runstate of guest in the pre_save handler and do not re-reads the timebase in case of paused state (cold migration). Signed-off-by: Maxiwell S. Garcia Message-Id: <20190711194702.26598-1-maxiwell@linux.ibm.com> Signed-off-by: David Gibson --- target/ppc/cpu-qom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'target/ppc') diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h index a2f202f..5769fb7 100644 --- a/target/ppc/cpu-qom.h +++ b/target/ppc/cpu-qom.h @@ -201,6 +201,7 @@ typedef struct PowerPCCPUClass { typedef struct PPCTimebase { uint64_t guest_timebase; int64_t time_of_the_day_ns; + bool runstate_paused; } PPCTimebase; extern const VMStateDescription vmstate_ppc_timebase; -- cgit v1.1 From 1cc792698e568626b5bf833e15e641ad0a81c6bb Mon Sep 17 00:00:00 2001 From: Stefan Brankovic Date: Mon, 15 Jul 2019 16:22:47 +0200 Subject: target/ppc: Optimize emulation of lvsl and lvsr instructions Adding simple macro that is calling tcg implementation of appropriate instruction if altivec support is active. Optimization of altivec instruction lvsl (Load Vector for Shift Left). Place bytes sh:sh+15 of value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F in destination register. Sh is calculated by adding 2 source registers and getting bits 60-63 of result. First, the bits [28-31] are placed from EA to variable sh. After that, the bytes are created in the following way: sh:(sh+7) of X(from description) by multiplying sh with 0x0101010101010101 followed by addition of the result with 0x0001020304050607. Value obtained is placed in higher doubleword element of vD. (sh+8):(sh+15) by adding the result of previous multiplication with 0x08090a0b0c0d0e0f. Value obtained is placed in lower doubleword element of vD. Optimization of altivec instruction lvsr (Load Vector for Shift Right). Place bytes 16-sh:31-sh of value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F in destination register. Sh is calculated by adding 2 source registers and getting bits 60-63 of result. First, the bits [28-31] are placed from EA to variable sh. After that, the bytes are created in the following way: sh:(sh+7) of X(from description) by multiplying sh with 0x0101010101010101 followed by substraction of the result from 0x1011121314151617. Value obtained is placed in higher doubleword element of vD. (sh+8):(sh+15) by substracting the result of previous multiplication from 0x18191a1b1c1d1e1f. Value obtained is placed in lower doubleword element of vD. 
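For illustration only (not part of this patch): a minimal host-side C sketch of the arithmetic described above, so the constants are easy to check against the description. The function and variable names here are made up and do not appear in the patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Doublewords of vD for lvsl: bytes sh..sh+15 of X = 0x00 || 0x01 || ... || 0x1F. */
static void lvsl_doublewords(unsigned sh, uint64_t *hi, uint64_t *lo)
{
    uint64_t rep = (uint64_t)sh * 0x0101010101010101ULL;  /* sh in every byte */

    *hi = rep + 0x0001020304050607ULL;  /* bytes sh .. sh+7   */
    *lo = rep + 0x08090a0b0c0d0e0fULL;  /* bytes sh+8 .. sh+15 */
}

/* Doublewords of vD for lvsr: bytes (16-sh)..(31-sh) of X. */
static void lvsr_doublewords(unsigned sh, uint64_t *hi, uint64_t *lo)
{
    uint64_t rep = (uint64_t)sh * 0x0101010101010101ULL;

    *hi = 0x1011121314151617ULL - rep;  /* bytes 16-sh .. 23-sh */
    *lo = 0x18191a1b1c1d1e1fULL - rep;  /* bytes 24-sh .. 31-sh */
}

int main(void)
{
    uint64_t hi, lo;

    lvsl_doublewords(3, &hi, &lo);
    /* Prints 030405060708090a 0b0c0d0e0f101112, i.e. bytes 3..18 of X. */
    printf("%016" PRIx64 " %016" PRIx64 "\n", hi, lo);
    return 0;
}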
Signed-off-by: Stefan Brankovic Reviewed-by: Richard Henderson Message-Id: <1563200574-11098-2-git-send-email-stefan.brankovic@rt-rk.com> Signed-off-by: David Gibson --- target/ppc/helper.h | 2 - target/ppc/int_helper.c | 18 ------ target/ppc/translate/vmx-impl.inc.c | 121 ++++++++++++++++++++++++++---------- 3 files changed, 89 insertions(+), 52 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 380c9b1..121d786 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,8 +193,6 @@ DEF_HELPER_2(vprtybw, void, avr, avr) DEF_HELPER_2(vprtybd, void, avr, avr) DEF_HELPER_2(vprtybq, void, avr, avr) DEF_HELPER_3(vsubcuw, void, avr, avr, avr) -DEF_HELPER_2(lvsl, void, avr, tl) -DEF_HELPER_2(lvsr, void, avr, tl) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 8f037af..5dcca53 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -459,24 +459,6 @@ SATCVT(sd, uw, int64_t, uint32_t, 0, UINT32_MAX) #undef SATCVT #undef SATCVTU -void helper_lvsl(ppc_avr_t *r, target_ulong sh) -{ - int i, j = (sh & 0xf); - - for (i = 0; i < ARRAY_SIZE(r->u8); i++) { - r->VsrB(i) = j++; - } -} - -void helper_lvsr(ppc_avr_t *r, target_ulong sh) -{ - int i, j = 0x10 - (sh & 0xf); - - for (i = 0; i < ARRAY_SIZE(r->u8); i++) { - r->VsrB(i) = j++; - } -} - void helper_mtvscr(CPUPPCState *env, uint32_t vscr) { env->vscr = vscr & ~(1u << VSCR_SAT); diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index 663275b..a9fe3c7 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1); GEN_VR_STVE(hx, 0x07, 0x05, 2); GEN_VR_STVE(wx, 0x07, 0x06, 4); -static void gen_lvsl(DisasContext *ctx) -{ - TCGv_ptr rd; - TCGv EA; - if (unlikely(!ctx->altivec_enabled)) { - gen_exception(ctx, POWERPC_EXCP_VPU); - return; - } - EA = tcg_temp_new(); - gen_addr_reg_index(ctx, EA); - rd = gen_avr_ptr(rD(ctx->opcode)); - gen_helper_lvsl(rd, EA); - tcg_temp_free(EA); - tcg_temp_free_ptr(rd); -} - -static void gen_lvsr(DisasContext *ctx) -{ - TCGv_ptr rd; - TCGv EA; - if (unlikely(!ctx->altivec_enabled)) { - gen_exception(ctx, POWERPC_EXCP_VPU); - return; - } - EA = tcg_temp_new(); - gen_addr_reg_index(ctx, EA); - rd = gen_avr_ptr(rD(ctx->opcode)); - gen_helper_lvsr(rd, EA); - tcg_temp_free(EA); - tcg_temp_free_ptr(rd); -} - static void gen_mfvscr(DisasContext *ctx) { TCGv_i32 t; @@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx) \ tcg_temp_free_ptr(rd); \ } +#define GEN_VXFORM_TRANS(name, opc2, opc3) \ +static void glue(gen_, name)(DisasContext *ctx) \ +{ \ + if (unlikely(!ctx->altivec_enabled)) { \ + gen_exception(ctx, POWERPC_EXCP_VPU); \ + return; \ + } \ + trans_##name(ctx); \ +} + #define GEN_VXFORM_ENV(name, opc2, opc3) \ static void glue(gen_, name)(DisasContext *ctx) \ { \ @@ -515,6 +493,83 @@ static void gen_vmrgow(DisasContext *ctx) tcg_temp_free_i64(avr); } +/* + * lvsl VRT,RA,RB - Load Vector for Shift Left + * + * Let the EA be the sum (rA|0)+(rB). Let sh=EA[28–31]. + * Let X be the 32-byte value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F. + * Bytes sh:sh+15 of X are placed into vD. 
+ */ +static void trans_lvsl(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + TCGv_i64 result = tcg_temp_new_i64(); + TCGv_i64 sh = tcg_temp_new_i64(); + TCGv EA = tcg_temp_new(); + + /* Get sh(from description) by anding EA with 0xf. */ + gen_addr_reg_index(ctx, EA); + tcg_gen_extu_tl_i64(sh, EA); + tcg_gen_andi_i64(sh, sh, 0xfULL); + + /* + * Create bytes sh:sh+7 of X(from description) and place them in + * higher doubleword of vD. + */ + tcg_gen_muli_i64(sh, sh, 0x0101010101010101ULL); + tcg_gen_addi_i64(result, sh, 0x0001020304050607ull); + set_avr64(VT, result, true); + /* + * Create bytes sh+8:sh+15 of X(from description) and place them in + * lower doubleword of vD. + */ + tcg_gen_addi_i64(result, sh, 0x08090a0b0c0d0e0fULL); + set_avr64(VT, result, false); + + tcg_temp_free_i64(result); + tcg_temp_free_i64(sh); + tcg_temp_free(EA); +} + +/* + * lvsr VRT,RA,RB - Load Vector for Shift Right + * + * Let the EA be the sum (rA|0)+(rB). Let sh=EA[28–31]. + * Let X be the 32-byte value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F. + * Bytes (16-sh):(31-sh) of X are placed into vD. + */ +static void trans_lvsr(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + TCGv_i64 result = tcg_temp_new_i64(); + TCGv_i64 sh = tcg_temp_new_i64(); + TCGv EA = tcg_temp_new(); + + + /* Get sh(from description) by anding EA with 0xf. */ + gen_addr_reg_index(ctx, EA); + tcg_gen_extu_tl_i64(sh, EA); + tcg_gen_andi_i64(sh, sh, 0xfULL); + + /* + * Create bytes (16-sh):(23-sh) of X(from description) and place them in + * higher doubleword of vD. + */ + tcg_gen_muli_i64(sh, sh, 0x0101010101010101ULL); + tcg_gen_subfi_i64(result, 0x1011121314151617ULL, sh); + set_avr64(VT, result, true); + /* + * Create bytes (24-sh):(32-sh) of X(from description) and place them in + * lower doubleword of vD. + */ + tcg_gen_subfi_i64(result, 0x18191a1b1c1d1e1fULL, sh); + set_avr64(VT, result, false); + + tcg_temp_free_i64(result); + tcg_temp_free_i64(sh); + tcg_temp_free(EA); +} + GEN_VXFORM(vmuloub, 4, 0); GEN_VXFORM(vmulouh, 4, 1); GEN_VXFORM(vmulouw, 4, 2); @@ -662,6 +717,8 @@ GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207, GEN_VXFORM_HETRO(vextubrx, 6, 28) GEN_VXFORM_HETRO(vextuhrx, 6, 29) GEN_VXFORM_HETRO(vextuwrx, 6, 30) +GEN_VXFORM_TRANS(lvsl, 6, 31) +GEN_VXFORM_TRANS(lvsr, 6, 32) GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \ vextuwrx, PPC_NONE, PPC2_ISA300) -- cgit v1.1 From 4e6d0920e7547e6af4bbac5ffe9adfe6ea621822 Mon Sep 17 00:00:00 2001 From: Stefan Brankovic Date: Mon, 15 Jul 2019 16:22:48 +0200 Subject: target/ppc: Optimize emulation of vsl and vsr instructions Optimization of altivec instructions vsl and vsr(Vector Shift Left/Rigt). Perform shift operation (left and right respectively) on 128 bit value of register vA by value specified in bits 125-127 of register vB. Lowest 3 bits in each byte element of register vB must be identical or result is undefined. For vsl instruction, the first step is bits 125-127 of register vB have to be saved in variable sh. Then, the highest sh bits of the lower doubleword element of register vA are saved in variable shifted, in order not to lose those bits when shift operation is performed on the lower doubleword element of register vA, which is the next step. After shifting the lower doubleword element shift operation is performed on higher doubleword element of vA, with replacement of the lowest sh bits(that are now 0) with bits saved in shifted. For vsr instruction, firstly, the bits 125-127 of register vB have to be saved in variable sh. 
Then, the lowest sh bits of the higher doubleword element of register vA are saved in variable shifted, in odred not to lose those bits when the shift operation is performed on the higher doubleword element of register vA, which is the next step. After shifting higher doubleword element, shift operation is performed on lower doubleword element of vA, with replacement of highest sh bits(that are now 0) with bits saved in shifted. Signed-off-by: Stefan Brankovic Reviewed-by: Richard Henderson Message-Id: <1563200574-11098-3-git-send-email-stefan.brankovic@rt-rk.com> Signed-off-by: David Gibson --- target/ppc/helper.h | 2 - target/ppc/int_helper.c | 35 ------------- target/ppc/translate/vmx-impl.inc.c | 101 +++++++++++++++++++++++++++++++++++- 3 files changed, 99 insertions(+), 39 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 121d786..6fb823c 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -217,8 +217,6 @@ DEF_HELPER_3(vrlb, void, avr, avr, avr) DEF_HELPER_3(vrlh, void, avr, avr, avr) DEF_HELPER_3(vrlw, void, avr, avr, avr) DEF_HELPER_3(vrld, void, avr, avr, avr) -DEF_HELPER_3(vsl, void, avr, avr, avr) -DEF_HELPER_3(vsr, void, avr, avr, avr) DEF_HELPER_4(vsldoi, void, avr, avr, avr, i32) DEF_HELPER_3(vextractub, void, avr, avr, i32) DEF_HELPER_3(vextractuh, void, avr, avr, i32) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 5dcca53..d2cad78 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1740,41 +1740,6 @@ VEXTU_X_DO(vextuhrx, 16, 0) VEXTU_X_DO(vextuwrx, 32, 0) #undef VEXTU_X_DO -/* - * The specification says that the results are undefined if all of the - * shift counts are not identical. We check to make sure that they - * are to conform to what real hardware appears to do. - */ -#define VSHIFT(suffix, leftp) \ - void helper_vs##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ - { \ - int shift = b->VsrB(15) & 0x7; \ - int doit = 1; \ - int i; \ - \ - for (i = 0; i < ARRAY_SIZE(r->u8); i++) { \ - doit = doit && ((b->u8[i] & 0x7) == shift); \ - } \ - if (doit) { \ - if (shift == 0) { \ - *r = *a; \ - } else if (leftp) { \ - uint64_t carry = a->VsrD(1) >> (64 - shift); \ - \ - r->VsrD(0) = (a->VsrD(0) << shift) | carry; \ - r->VsrD(1) = a->VsrD(1) << shift; \ - } else { \ - uint64_t carry = a->VsrD(0) << (64 - shift); \ - \ - r->VsrD(1) = (a->VsrD(1) >> shift) | carry; \ - r->VsrD(0) = a->VsrD(0) >> shift; \ - } \ - } \ - } -VSHIFT(l, 1) -VSHIFT(r, 0) -#undef VSHIFT - void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int i; diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index a9fe3c7..e06e65a 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -570,6 +570,103 @@ static void trans_lvsr(DisasContext *ctx) tcg_temp_free(EA); } +/* + * vsl VRT,VRA,VRB - Vector Shift Left + * + * Shifting left 128 bit value of vA by value specified in bits 125-127 of vB. + * Lowest 3 bits in each byte element of register vB must be identical or + * result is undefined. + */ +static void trans_vsl(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + int VA = rA(ctx->opcode); + int VB = rB(ctx->opcode); + TCGv_i64 avrA = tcg_temp_new_i64(); + TCGv_i64 avrB = tcg_temp_new_i64(); + TCGv_i64 sh = tcg_temp_new_i64(); + TCGv_i64 shifted = tcg_temp_new_i64(); + TCGv_i64 tmp = tcg_temp_new_i64(); + + /* Place bits 125-127 of vB in sh. 
*/ + get_avr64(avrB, VB, false); + tcg_gen_andi_i64(sh, avrB, 0x07ULL); + + /* + * Save highest sh bits of lower doubleword element of vA in variable + * shifted and perform shift on lower doubleword. + */ + get_avr64(avrA, VA, false); + tcg_gen_subfi_i64(tmp, 64, sh); + tcg_gen_shr_i64(shifted, avrA, tmp); + tcg_gen_andi_i64(shifted, shifted, 0x7fULL); + tcg_gen_shl_i64(avrA, avrA, sh); + set_avr64(VT, avrA, false); + + /* + * Perform shift on higher doubleword element of vA and replace lowest + * sh bits with shifted. + */ + get_avr64(avrA, VA, true); + tcg_gen_shl_i64(avrA, avrA, sh); + tcg_gen_or_i64(avrA, avrA, shifted); + set_avr64(VT, avrA, true); + + tcg_temp_free_i64(avrA); + tcg_temp_free_i64(avrB); + tcg_temp_free_i64(sh); + tcg_temp_free_i64(shifted); + tcg_temp_free_i64(tmp); +} + +/* + * vsr VRT,VRA,VRB - Vector Shift Right + * + * Shifting right 128 bit value of vA by value specified in bits 125-127 of vB. + * Lowest 3 bits in each byte element of register vB must be identical or + * result is undefined. + */ +static void trans_vsr(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + int VA = rA(ctx->opcode); + int VB = rB(ctx->opcode); + TCGv_i64 avrA = tcg_temp_new_i64(); + TCGv_i64 avrB = tcg_temp_new_i64(); + TCGv_i64 sh = tcg_temp_new_i64(); + TCGv_i64 shifted = tcg_temp_new_i64(); + TCGv_i64 tmp = tcg_temp_new_i64(); + + /* Place bits 125-127 of vB in sh. */ + get_avr64(avrB, VB, false); + tcg_gen_andi_i64(sh, avrB, 0x07ULL); + + /* + * Save lowest sh bits of higher doubleword element of vA in variable + * shifted and perform shift on higher doubleword. + */ + get_avr64(avrA, VA, true); + tcg_gen_subfi_i64(tmp, 64, sh); + tcg_gen_shl_i64(shifted, avrA, tmp); + tcg_gen_andi_i64(shifted, shifted, 0xfe00000000000000ULL); + tcg_gen_shr_i64(avrA, avrA, sh); + set_avr64(VT, avrA, true); + /* + * Perform shift on lower doubleword element of vA and replace highest + * sh bits with shifted. + */ + get_avr64(avrA, VA, false); + tcg_gen_shr_i64(avrA, avrA, sh); + tcg_gen_or_i64(avrA, avrA, shifted); + set_avr64(VT, avrA, false); + + tcg_temp_free_i64(avrA); + tcg_temp_free_i64(avrB); + tcg_temp_free_i64(sh); + tcg_temp_free_i64(shifted); + tcg_temp_free_i64(tmp); +} + GEN_VXFORM(vmuloub, 4, 0); GEN_VXFORM(vmulouh, 4, 1); GEN_VXFORM(vmulouw, 4, 2); @@ -682,11 +779,11 @@ GEN_VXFORM(vrld, 2, 3); GEN_VXFORM(vrldmi, 2, 3); GEN_VXFORM_DUAL(vrld, PPC_NONE, PPC2_ALTIVEC_207, \ vrldmi, PPC_NONE, PPC2_ISA300) -GEN_VXFORM(vsl, 2, 7); +GEN_VXFORM_TRANS(vsl, 2, 7); GEN_VXFORM(vrldnm, 2, 7); GEN_VXFORM_DUAL(vsl, PPC_ALTIVEC, PPC_NONE, \ vrldnm, PPC_NONE, PPC2_ISA300) -GEN_VXFORM(vsr, 2, 11); +GEN_VXFORM_TRANS(vsr, 2, 11); GEN_VXFORM_ENV(vpkuhum, 7, 0); GEN_VXFORM_ENV(vpkuwum, 7, 1); GEN_VXFORM_ENV(vpkudum, 7, 17); -- cgit v1.1 From 28876bf27d2792e6b16cfb5283b9fb959fc0ad12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 16 Jul 2019 13:13:52 +0100 Subject: target/ppc: move opcode decode tables to PowerPCCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The opcode decode tables aren't really part of the CPUPPCState but an internal implementation detail for the translator. This can cause problems with memcpy in cpu_copy as any table created during ppc_cpu_realize get written over causing a memory leak. To avoid this move the tables into PowerPCCPU which is better suited to hold internal implementation details. 
Attempts to fix: https://bugs.launchpad.net/qemu/+bug/1836558 Cc: 1836558@bugs.launchpad.net Signed-off-by: Alex Bennée Message-Id: <20190716121352.302-1-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: David Gibson --- target/ppc/cpu.h | 8 ++++---- target/ppc/translate.c | 3 ++- target/ppc/translate_init.inc.c | 16 +++++++--------- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 4ea33cf..630a25c 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1104,10 +1104,6 @@ struct CPUPPCState { bool resume_as_sreset; #endif - /* Those resources are used only during code translation */ - /* opcode handlers */ - opc_handler_t *opcodes[PPC_CPU_OPCODES_LEN]; - /* Those resources are used only in QEMU core */ target_ulong hflags; /* hflags is a MSR & HFLAGS_MASK */ target_ulong hflags_nmsr; /* specific hflags, not coming from MSR */ @@ -1191,6 +1187,10 @@ struct PowerPCCPU { int32_t node_id; /* NUMA node this CPU belongs to */ PPCHash64Options *hash64_opts; + /* Those resources are used only during code translation */ + /* opcode handlers */ + opc_handler_t *opcodes[PPC_CPU_OPCODES_LEN]; + /* Fields related to migration compatibility hacks */ bool pre_2_8_migration; target_ulong mig_msr_mask; diff --git a/target/ppc/translate.c b/target/ppc/translate.c index 9f9553a..1afb31e 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -7858,6 +7858,7 @@ static bool ppc_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cs, static void ppc_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) { DisasContext *ctx = container_of(dcbase, DisasContext, base); + PowerPCCPU *cpu = POWERPC_CPU(cs); CPUPPCState *env = cs->env_ptr; opc_handler_t **table, *handler; @@ -7875,7 +7876,7 @@ static void ppc_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) opc3(ctx->opcode), opc4(ctx->opcode), ctx->le_mode ? 
"little" : "big"); ctx->base.pc_next += 4; - table = env->opcodes; + table = cpu->opcodes; handler = table[opc1(ctx->opcode)]; if (is_indirect_opcode(handler)) { table = ind_table(handler); diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c index 86fc8f2..9cd2033 100644 --- a/target/ppc/translate_init.inc.c +++ b/target/ppc/translate_init.inc.c @@ -9440,14 +9440,13 @@ static void fix_opcode_tables(opc_handler_t **ppc_opcodes) static void create_ppc_opcodes(PowerPCCPU *cpu, Error **errp) { PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); - CPUPPCState *env = &cpu->env; opcode_t *opc; - fill_new_table(env->opcodes, PPC_CPU_OPCODES_LEN); + fill_new_table(cpu->opcodes, PPC_CPU_OPCODES_LEN); for (opc = opcodes; opc < &opcodes[ARRAY_SIZE(opcodes)]; opc++) { if (((opc->handler.type & pcc->insns_flags) != 0) || ((opc->handler.type2 & pcc->insns_flags2) != 0)) { - if (register_insn(env->opcodes, opc) < 0) { + if (register_insn(cpu->opcodes, opc) < 0) { error_setg(errp, "ERROR initializing PowerPC instruction " "0x%02x 0x%02x 0x%02x", opc->opc1, opc->opc2, opc->opc3); @@ -9455,7 +9454,7 @@ static void create_ppc_opcodes(PowerPCCPU *cpu, Error **errp) } } } - fix_opcode_tables(env->opcodes); + fix_opcode_tables(cpu->opcodes); fflush(stdout); fflush(stderr); } @@ -10023,7 +10022,6 @@ static void ppc_cpu_unrealize(DeviceState *dev, Error **errp) { PowerPCCPU *cpu = POWERPC_CPU(dev); PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); - CPUPPCState *env = &cpu->env; Error *local_err = NULL; opc_handler_t **table, **table_2; int i, j, k; @@ -10035,11 +10033,11 @@ static void ppc_cpu_unrealize(DeviceState *dev, Error **errp) } for (i = 0; i < PPC_CPU_OPCODES_LEN; i++) { - if (env->opcodes[i] == &invalid_handler) { + if (cpu->opcodes[i] == &invalid_handler) { continue; } - if (is_indirect_opcode(env->opcodes[i])) { - table = ind_table(env->opcodes[i]); + if (is_indirect_opcode(cpu->opcodes[i])) { + table = ind_table(cpu->opcodes[i]); for (j = 0; j < PPC_CPU_INDIRECT_OPCODES_LEN; j++) { if (table[j] == &invalid_handler) { continue; @@ -10057,7 +10055,7 @@ static void ppc_cpu_unrealize(DeviceState *dev, Error **errp) ~PPC_INDIRECT)); } } - g_free((opc_handler_t *)((uintptr_t)env->opcodes[i] & + g_free((opc_handler_t *)((uintptr_t)cpu->opcodes[i] & ~PPC_INDIRECT)); } } -- cgit v1.1 From 083b3f012fc27536afc74d005d706b20eae200f8 Mon Sep 17 00:00:00 2001 From: Stefan Brankovic Date: Mon, 15 Jul 2019 16:22:50 +0200 Subject: target/ppc: Optimize emulation of vgbbd instruction Optimize altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword) All ith bits (i in range 1 to 8) of each byte of doubleword element in source register are concatenated and placed into ith byte of appropriate doubleword element in destination register. Following solution is done for both doubleword elements of source register in parallel, in order to reduce the number of instructions needed(that's why arrays are used): First, both doubleword elements of source register vB are placed in appropriate element of array avr. Bits are gathered in 2x8 iterations(2 for loops). In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of byte 8 are in their final spots so avr[i], i={0,1} can be and-ed with tcg_mask. For every following iteration, both avr[i] and tcg_mask variables have to be shifted right for 7 and 8 places, respectively, in order to get bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in their final spots so shifted avr values(saved in tmp) can be and-ed with new value of tcg_mask... 
After first 8 iteration(first loop), all the first bits are in their final places, all second bits but second bit from eight byte are in their places... only 1 eight bit from eight byte is in it's place). In second loop we do all operations symmetrically, in order to get other half of bits in their final spots. Results for first and second doubleword elements are saved in result[0] and result[1] respectively. In the end those results are saved in appropriate doubleword element of destination register vD. Signed-off-by: Stefan Brankovic Reviewed-by: Richard Henderson Message-Id: <1563200574-11098-5-git-send-email-stefan.brankovic@rt-rk.com> Signed-off-by: David Gibson --- target/ppc/helper.h | 1 - target/ppc/int_helper.c | 276 ------------------------------------ target/ppc/translate/vmx-impl.inc.c | 77 +++++++++- 3 files changed, 76 insertions(+), 278 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 6fb823c..9b486a0 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -324,7 +324,6 @@ DEF_HELPER_1(vclzlsbb, tl, avr) DEF_HELPER_1(vctzlsbb, tl, avr) DEF_HELPER_3(vbpermd, void, avr, avr, avr) DEF_HELPER_3(vbpermq, void, avr, avr, avr) -DEF_HELPER_2(vgbbd, void, avr, avr) DEF_HELPER_3(vpmsumb, void, avr, avr, avr) DEF_HELPER_3(vpmsumh, void, avr, avr, avr) DEF_HELPER_3(vpmsumw, void, avr, avr, avr) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index d2cad78..a265cb0 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1187,282 +1187,6 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #undef VBPERMQ_INDEX #undef VBPERMQ_DW -static const uint64_t VGBBD_MASKS[256] = { - 0x0000000000000000ull, /* 00 */ - 0x0000000000000080ull, /* 01 */ - 0x0000000000008000ull, /* 02 */ - 0x0000000000008080ull, /* 03 */ - 0x0000000000800000ull, /* 04 */ - 0x0000000000800080ull, /* 05 */ - 0x0000000000808000ull, /* 06 */ - 0x0000000000808080ull, /* 07 */ - 0x0000000080000000ull, /* 08 */ - 0x0000000080000080ull, /* 09 */ - 0x0000000080008000ull, /* 0A */ - 0x0000000080008080ull, /* 0B */ - 0x0000000080800000ull, /* 0C */ - 0x0000000080800080ull, /* 0D */ - 0x0000000080808000ull, /* 0E */ - 0x0000000080808080ull, /* 0F */ - 0x0000008000000000ull, /* 10 */ - 0x0000008000000080ull, /* 11 */ - 0x0000008000008000ull, /* 12 */ - 0x0000008000008080ull, /* 13 */ - 0x0000008000800000ull, /* 14 */ - 0x0000008000800080ull, /* 15 */ - 0x0000008000808000ull, /* 16 */ - 0x0000008000808080ull, /* 17 */ - 0x0000008080000000ull, /* 18 */ - 0x0000008080000080ull, /* 19 */ - 0x0000008080008000ull, /* 1A */ - 0x0000008080008080ull, /* 1B */ - 0x0000008080800000ull, /* 1C */ - 0x0000008080800080ull, /* 1D */ - 0x0000008080808000ull, /* 1E */ - 0x0000008080808080ull, /* 1F */ - 0x0000800000000000ull, /* 20 */ - 0x0000800000000080ull, /* 21 */ - 0x0000800000008000ull, /* 22 */ - 0x0000800000008080ull, /* 23 */ - 0x0000800000800000ull, /* 24 */ - 0x0000800000800080ull, /* 25 */ - 0x0000800000808000ull, /* 26 */ - 0x0000800000808080ull, /* 27 */ - 0x0000800080000000ull, /* 28 */ - 0x0000800080000080ull, /* 29 */ - 0x0000800080008000ull, /* 2A */ - 0x0000800080008080ull, /* 2B */ - 0x0000800080800000ull, /* 2C */ - 0x0000800080800080ull, /* 2D */ - 0x0000800080808000ull, /* 2E */ - 0x0000800080808080ull, /* 2F */ - 0x0000808000000000ull, /* 30 */ - 0x0000808000000080ull, /* 31 */ - 0x0000808000008000ull, /* 32 */ - 0x0000808000008080ull, /* 33 */ - 0x0000808000800000ull, /* 34 */ - 0x0000808000800080ull, /* 35 */ - 
0x0000808000808000ull, /* 36 */ - 0x0000808000808080ull, /* 37 */ - 0x0000808080000000ull, /* 38 */ - 0x0000808080000080ull, /* 39 */ - 0x0000808080008000ull, /* 3A */ - 0x0000808080008080ull, /* 3B */ - 0x0000808080800000ull, /* 3C */ - 0x0000808080800080ull, /* 3D */ - 0x0000808080808000ull, /* 3E */ - 0x0000808080808080ull, /* 3F */ - 0x0080000000000000ull, /* 40 */ - 0x0080000000000080ull, /* 41 */ - 0x0080000000008000ull, /* 42 */ - 0x0080000000008080ull, /* 43 */ - 0x0080000000800000ull, /* 44 */ - 0x0080000000800080ull, /* 45 */ - 0x0080000000808000ull, /* 46 */ - 0x0080000000808080ull, /* 47 */ - 0x0080000080000000ull, /* 48 */ - 0x0080000080000080ull, /* 49 */ - 0x0080000080008000ull, /* 4A */ - 0x0080000080008080ull, /* 4B */ - 0x0080000080800000ull, /* 4C */ - 0x0080000080800080ull, /* 4D */ - 0x0080000080808000ull, /* 4E */ - 0x0080000080808080ull, /* 4F */ - 0x0080008000000000ull, /* 50 */ - 0x0080008000000080ull, /* 51 */ - 0x0080008000008000ull, /* 52 */ - 0x0080008000008080ull, /* 53 */ - 0x0080008000800000ull, /* 54 */ - 0x0080008000800080ull, /* 55 */ - 0x0080008000808000ull, /* 56 */ - 0x0080008000808080ull, /* 57 */ - 0x0080008080000000ull, /* 58 */ - 0x0080008080000080ull, /* 59 */ - 0x0080008080008000ull, /* 5A */ - 0x0080008080008080ull, /* 5B */ - 0x0080008080800000ull, /* 5C */ - 0x0080008080800080ull, /* 5D */ - 0x0080008080808000ull, /* 5E */ - 0x0080008080808080ull, /* 5F */ - 0x0080800000000000ull, /* 60 */ - 0x0080800000000080ull, /* 61 */ - 0x0080800000008000ull, /* 62 */ - 0x0080800000008080ull, /* 63 */ - 0x0080800000800000ull, /* 64 */ - 0x0080800000800080ull, /* 65 */ - 0x0080800000808000ull, /* 66 */ - 0x0080800000808080ull, /* 67 */ - 0x0080800080000000ull, /* 68 */ - 0x0080800080000080ull, /* 69 */ - 0x0080800080008000ull, /* 6A */ - 0x0080800080008080ull, /* 6B */ - 0x0080800080800000ull, /* 6C */ - 0x0080800080800080ull, /* 6D */ - 0x0080800080808000ull, /* 6E */ - 0x0080800080808080ull, /* 6F */ - 0x0080808000000000ull, /* 70 */ - 0x0080808000000080ull, /* 71 */ - 0x0080808000008000ull, /* 72 */ - 0x0080808000008080ull, /* 73 */ - 0x0080808000800000ull, /* 74 */ - 0x0080808000800080ull, /* 75 */ - 0x0080808000808000ull, /* 76 */ - 0x0080808000808080ull, /* 77 */ - 0x0080808080000000ull, /* 78 */ - 0x0080808080000080ull, /* 79 */ - 0x0080808080008000ull, /* 7A */ - 0x0080808080008080ull, /* 7B */ - 0x0080808080800000ull, /* 7C */ - 0x0080808080800080ull, /* 7D */ - 0x0080808080808000ull, /* 7E */ - 0x0080808080808080ull, /* 7F */ - 0x8000000000000000ull, /* 80 */ - 0x8000000000000080ull, /* 81 */ - 0x8000000000008000ull, /* 82 */ - 0x8000000000008080ull, /* 83 */ - 0x8000000000800000ull, /* 84 */ - 0x8000000000800080ull, /* 85 */ - 0x8000000000808000ull, /* 86 */ - 0x8000000000808080ull, /* 87 */ - 0x8000000080000000ull, /* 88 */ - 0x8000000080000080ull, /* 89 */ - 0x8000000080008000ull, /* 8A */ - 0x8000000080008080ull, /* 8B */ - 0x8000000080800000ull, /* 8C */ - 0x8000000080800080ull, /* 8D */ - 0x8000000080808000ull, /* 8E */ - 0x8000000080808080ull, /* 8F */ - 0x8000008000000000ull, /* 90 */ - 0x8000008000000080ull, /* 91 */ - 0x8000008000008000ull, /* 92 */ - 0x8000008000008080ull, /* 93 */ - 0x8000008000800000ull, /* 94 */ - 0x8000008000800080ull, /* 95 */ - 0x8000008000808000ull, /* 96 */ - 0x8000008000808080ull, /* 97 */ - 0x8000008080000000ull, /* 98 */ - 0x8000008080000080ull, /* 99 */ - 0x8000008080008000ull, /* 9A */ - 0x8000008080008080ull, /* 9B */ - 0x8000008080800000ull, /* 9C */ - 0x8000008080800080ull, /* 9D */ - 
0x8000008080808000ull, /* 9E */ - 0x8000008080808080ull, /* 9F */ - 0x8000800000000000ull, /* A0 */ - 0x8000800000000080ull, /* A1 */ - 0x8000800000008000ull, /* A2 */ - 0x8000800000008080ull, /* A3 */ - 0x8000800000800000ull, /* A4 */ - 0x8000800000800080ull, /* A5 */ - 0x8000800000808000ull, /* A6 */ - 0x8000800000808080ull, /* A7 */ - 0x8000800080000000ull, /* A8 */ - 0x8000800080000080ull, /* A9 */ - 0x8000800080008000ull, /* AA */ - 0x8000800080008080ull, /* AB */ - 0x8000800080800000ull, /* AC */ - 0x8000800080800080ull, /* AD */ - 0x8000800080808000ull, /* AE */ - 0x8000800080808080ull, /* AF */ - 0x8000808000000000ull, /* B0 */ - 0x8000808000000080ull, /* B1 */ - 0x8000808000008000ull, /* B2 */ - 0x8000808000008080ull, /* B3 */ - 0x8000808000800000ull, /* B4 */ - 0x8000808000800080ull, /* B5 */ - 0x8000808000808000ull, /* B6 */ - 0x8000808000808080ull, /* B7 */ - 0x8000808080000000ull, /* B8 */ - 0x8000808080000080ull, /* B9 */ - 0x8000808080008000ull, /* BA */ - 0x8000808080008080ull, /* BB */ - 0x8000808080800000ull, /* BC */ - 0x8000808080800080ull, /* BD */ - 0x8000808080808000ull, /* BE */ - 0x8000808080808080ull, /* BF */ - 0x8080000000000000ull, /* C0 */ - 0x8080000000000080ull, /* C1 */ - 0x8080000000008000ull, /* C2 */ - 0x8080000000008080ull, /* C3 */ - 0x8080000000800000ull, /* C4 */ - 0x8080000000800080ull, /* C5 */ - 0x8080000000808000ull, /* C6 */ - 0x8080000000808080ull, /* C7 */ - 0x8080000080000000ull, /* C8 */ - 0x8080000080000080ull, /* C9 */ - 0x8080000080008000ull, /* CA */ - 0x8080000080008080ull, /* CB */ - 0x8080000080800000ull, /* CC */ - 0x8080000080800080ull, /* CD */ - 0x8080000080808000ull, /* CE */ - 0x8080000080808080ull, /* CF */ - 0x8080008000000000ull, /* D0 */ - 0x8080008000000080ull, /* D1 */ - 0x8080008000008000ull, /* D2 */ - 0x8080008000008080ull, /* D3 */ - 0x8080008000800000ull, /* D4 */ - 0x8080008000800080ull, /* D5 */ - 0x8080008000808000ull, /* D6 */ - 0x8080008000808080ull, /* D7 */ - 0x8080008080000000ull, /* D8 */ - 0x8080008080000080ull, /* D9 */ - 0x8080008080008000ull, /* DA */ - 0x8080008080008080ull, /* DB */ - 0x8080008080800000ull, /* DC */ - 0x8080008080800080ull, /* DD */ - 0x8080008080808000ull, /* DE */ - 0x8080008080808080ull, /* DF */ - 0x8080800000000000ull, /* E0 */ - 0x8080800000000080ull, /* E1 */ - 0x8080800000008000ull, /* E2 */ - 0x8080800000008080ull, /* E3 */ - 0x8080800000800000ull, /* E4 */ - 0x8080800000800080ull, /* E5 */ - 0x8080800000808000ull, /* E6 */ - 0x8080800000808080ull, /* E7 */ - 0x8080800080000000ull, /* E8 */ - 0x8080800080000080ull, /* E9 */ - 0x8080800080008000ull, /* EA */ - 0x8080800080008080ull, /* EB */ - 0x8080800080800000ull, /* EC */ - 0x8080800080800080ull, /* ED */ - 0x8080800080808000ull, /* EE */ - 0x8080800080808080ull, /* EF */ - 0x8080808000000000ull, /* F0 */ - 0x8080808000000080ull, /* F1 */ - 0x8080808000008000ull, /* F2 */ - 0x8080808000008080ull, /* F3 */ - 0x8080808000800000ull, /* F4 */ - 0x8080808000800080ull, /* F5 */ - 0x8080808000808000ull, /* F6 */ - 0x8080808000808080ull, /* F7 */ - 0x8080808080000000ull, /* F8 */ - 0x8080808080000080ull, /* F9 */ - 0x8080808080008000ull, /* FA */ - 0x8080808080008080ull, /* FB */ - 0x8080808080800000ull, /* FC */ - 0x8080808080800080ull, /* FD */ - 0x8080808080808000ull, /* FE */ - 0x8080808080808080ull, /* FF */ -}; - -void helper_vgbbd(ppc_avr_t *r, ppc_avr_t *b) -{ - int i; - uint64_t t[2] = { 0, 0 }; - - VECTOR_FOR_INORDER_I(i, u8) { -#if defined(HOST_WORDS_BIGENDIAN) - t[i >> 3] |= VGBBD_MASKS[b->u8[i]] >> (i & 7); -#else - t[i 
>> 3] |= VGBBD_MASKS[b->u8[i]] >> (7 - (i & 7)); -#endif - } - - r->u64[0] = t[0]; - r->u64[1] = t[1]; -} - #define PMSUM(name, srcfld, trgfld, trgtyp) \ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index e06e65a..1315335 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -667,6 +667,81 @@ static void trans_vsr(DisasContext *ctx) tcg_temp_free_i64(tmp); } +/* + * vgbbd VRT,VRB - Vector Gather Bits by Bytes by Doubleword + * + * All ith bits (i in range 1 to 8) of each byte of doubleword element in source + * register are concatenated and placed into ith byte of appropriate doubleword + * element in destination register. + * + * Following solution is done for both doubleword elements of source register + * in parallel, in order to reduce the number of instructions needed(that's why + * arrays are used): + * First, both doubleword elements of source register vB are placed in + * appropriate element of array avr. Bits are gathered in 2x8 iterations(2 for + * loops). In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of + * byte 8 are in their final spots so avr[i], i={0,1} can be and-ed with + * tcg_mask. For every following iteration, both avr[i] and tcg_mask variables + * have to be shifted right for 7 and 8 places, respectively, in order to get + * bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in their final spots so + * shifted avr values(saved in tmp) can be and-ed with new value of tcg_mask... + * After first 8 iteration(first loop), all the first bits are in their final + * places, all second bits but second bit from eight byte are in their places... + * only 1 eight bit from eight byte is in it's place). In second loop we do all + * operations symmetrically, in order to get other half of bits in their final + * spots. Results for first and second doubleword elements are saved in + * result[0] and result[1] respectively. In the end those results are saved in + * appropriate doubleword element of destination register vD. 
+ */ +static void trans_vgbbd(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + int VB = rB(ctx->opcode); + TCGv_i64 tmp = tcg_temp_new_i64(); + uint64_t mask = 0x8040201008040201ULL; + int i, j; + + TCGv_i64 result[2]; + result[0] = tcg_temp_new_i64(); + result[1] = tcg_temp_new_i64(); + TCGv_i64 avr[2]; + avr[0] = tcg_temp_new_i64(); + avr[1] = tcg_temp_new_i64(); + TCGv_i64 tcg_mask = tcg_temp_new_i64(); + + tcg_gen_movi_i64(tcg_mask, mask); + for (j = 0; j < 2; j++) { + get_avr64(avr[j], VB, j); + tcg_gen_and_i64(result[j], avr[j], tcg_mask); + } + for (i = 1; i < 8; i++) { + tcg_gen_movi_i64(tcg_mask, mask >> (i * 8)); + for (j = 0; j < 2; j++) { + tcg_gen_shri_i64(tmp, avr[j], i * 7); + tcg_gen_and_i64(tmp, tmp, tcg_mask); + tcg_gen_or_i64(result[j], result[j], tmp); + } + } + for (i = 1; i < 8; i++) { + tcg_gen_movi_i64(tcg_mask, mask << (i * 8)); + for (j = 0; j < 2; j++) { + tcg_gen_shli_i64(tmp, avr[j], i * 7); + tcg_gen_and_i64(tmp, tmp, tcg_mask); + tcg_gen_or_i64(result[j], result[j], tmp); + } + } + for (j = 0; j < 2; j++) { + set_avr64(VT, result[j], j); + } + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(tcg_mask); + tcg_temp_free_i64(result[0]); + tcg_temp_free_i64(result[1]); + tcg_temp_free_i64(avr[0]); + tcg_temp_free_i64(avr[1]); +} + GEN_VXFORM(vmuloub, 4, 0); GEN_VXFORM(vmulouh, 4, 1); GEN_VXFORM(vmulouw, 4, 2); @@ -1211,7 +1286,7 @@ GEN_VXFORM_DUAL(vclzd, PPC_NONE, PPC2_ALTIVEC_207, \ vpopcntd, PPC_NONE, PPC2_ALTIVEC_207) GEN_VXFORM(vbpermd, 6, 23); GEN_VXFORM(vbpermq, 6, 21); -GEN_VXFORM_NOA(vgbbd, 6, 20); +GEN_VXFORM_TRANS(vgbbd, 6, 20); GEN_VXFORM(vpmsumb, 4, 16) GEN_VXFORM(vpmsumh, 4, 17) GEN_VXFORM(vpmsumw, 4, 18) -- cgit v1.1 From b8313f0d91b192c9415b3c678b387acb77ad112b Mon Sep 17 00:00:00 2001 From: Stefan Brankovic Date: Mon, 15 Jul 2019 16:22:51 +0200 Subject: target/ppc: Optimize emulation of vclzd instruction Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword). This instruction counts the number of leading zeros of each doubleword element in source register and places result in the appropriate doubleword element of destination register. Using tcg-s count leading zeros instruction two times(once for each doubleword element of source register vB) and placing result in appropriate doubleword element of destination register vD. Signed-off-by: Stefan Brankovic Reviewed-by: Richard Henderson Message-Id: <1563200574-11098-6-git-send-email-stefan.brankovic@rt-rk.com> Signed-off-by: David Gibson --- target/ppc/helper.h | 1 - target/ppc/int_helper.c | 3 --- target/ppc/translate/vmx-impl.inc.c | 28 +++++++++++++++++++++++++++- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 9b486a0..e203f76 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -311,7 +311,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32) DEF_HELPER_2(vclzb, void, avr, avr) DEF_HELPER_2(vclzh, void, avr, avr) DEF_HELPER_2(vclzw, void, avr, avr) -DEF_HELPER_2(vclzd, void, avr, avr) DEF_HELPER_2(vctzb, void, avr, avr) DEF_HELPER_2(vctzh, void, avr, avr) DEF_HELPER_2(vctzw, void, avr, avr) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index a265cb0..b82765d 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1820,17 +1820,14 @@ VUPK(lsw, s64, s32, UPKLO) #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8) #define clzh(v) ((v) ? 
clz32((uint32_t)(v) << 16) : 16) #define clzw(v) clz32((v)) -#define clzd(v) clz64((v)) VGENERIC_DO(clzb, u8) VGENERIC_DO(clzh, u16) VGENERIC_DO(clzw, u32) -VGENERIC_DO(clzd, u64) #undef clzb #undef clzh #undef clzw -#undef clzd #define ctzb(v) ((v) ? ctz32(v) : 8) #define ctzh(v) ((v) ? ctz32(v) : 16) diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index 1315335..3372c2c 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -742,6 +742,32 @@ static void trans_vgbbd(DisasContext *ctx) tcg_temp_free_i64(avr[1]); } +/* + * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword + * + * Counting the number of leading zero bits of each doubleword element in source + * register and placing result in appropriate doubleword element of destination + * register. + */ +static void trans_vclzd(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + int VB = rB(ctx->opcode); + TCGv_i64 avr = tcg_temp_new_i64(); + + /* high doubleword */ + get_avr64(avr, VB, true); + tcg_gen_clzi_i64(avr, avr, 64); + set_avr64(VT, avr, true); + + /* low doubleword */ + get_avr64(avr, VB, false); + tcg_gen_clzi_i64(avr, avr, 64); + set_avr64(VT, avr, false); + + tcg_temp_free_i64(avr); +} + GEN_VXFORM(vmuloub, 4, 0); GEN_VXFORM(vmulouh, 4, 1); GEN_VXFORM(vmulouw, 4, 2); @@ -1258,7 +1284,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) GEN_VXFORM_NOA(vclzw, 1, 30) -GEN_VXFORM_NOA(vclzd, 1, 31) +GEN_VXFORM_TRANS(vclzd, 1, 31) GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) GEN_VXFORM_NOA_2(vnegd, 1, 24, 7) GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16) -- cgit v1.1 From 1872588ede5770751ebc0e1df9909c8a785cb549 Mon Sep 17 00:00:00 2001 From: Stefan Brankovic Date: Mon, 15 Jul 2019 16:22:52 +0200 Subject: target/ppc: Optimize emulation of vclzw instruction Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word). This instruction counts the number of leading zeros of each word element in source register and places result in the appropriate word element of destination register. Counting is to be performed in four iterations of for loop(one for each word elemnt of source register vB). Every iteration consists of loading appropriate word element from source register, counting leading zeros with tcg_gen_clzi_i32, and saving the result in appropriate word element of destination register. Signed-off-by: Stefan Brankovic Reviewed-by: Richard Henderson Message-Id: <1563200574-11098-7-git-send-email-stefan.brankovic@rt-rk.com> Signed-off-by: David Gibson --- target/ppc/helper.h | 1 - target/ppc/int_helper.c | 3 --- target/ppc/translate/vmx-impl.inc.c | 28 +++++++++++++++++++++++++++- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/helper.h b/target/ppc/helper.h index e203f76..54ea9b9 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -310,7 +310,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32) DEF_HELPER_2(vclzb, void, avr, avr) DEF_HELPER_2(vclzh, void, avr, avr) -DEF_HELPER_2(vclzw, void, avr, avr) DEF_HELPER_2(vctzb, void, avr, avr) DEF_HELPER_2(vctzh, void, avr, avr) DEF_HELPER_2(vctzw, void, avr, avr) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index b82765d..46deb57 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1819,15 +1819,12 @@ VUPK(lsw, s64, s32, UPKLO) #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8) #define clzh(v) ((v) ? 
clz32((uint32_t)(v) << 16) : 16) -#define clzw(v) clz32((v)) VGENERIC_DO(clzb, u8) VGENERIC_DO(clzh, u16) -VGENERIC_DO(clzw, u32) #undef clzb #undef clzh -#undef clzw #define ctzb(v) ((v) ? ctz32(v) : 8) #define ctzh(v) ((v) ? ctz32(v) : 16) diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index 3372c2c..0d71c10 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -743,6 +743,32 @@ static void trans_vgbbd(DisasContext *ctx) } /* + * vclzw VRT,VRB - Vector Count Leading Zeros Word + * + * Counting the number of leading zero bits of each word element in source + * register and placing result in appropriate word element of destination + * register. + */ +static void trans_vclzw(DisasContext *ctx) +{ + int VT = rD(ctx->opcode); + int VB = rB(ctx->opcode); + TCGv_i32 tmp = tcg_temp_new_i32(); + int i; + + /* Perform count for every word element using tcg_gen_clzi_i32. */ + for (i = 0; i < 4; i++) { + tcg_gen_ld_i32(tmp, cpu_env, + offsetof(CPUPPCState, vsr[32 + VB].u64[0]) + i * 4); + tcg_gen_clzi_i32(tmp, tmp, 32); + tcg_gen_st_i32(tmp, cpu_env, + offsetof(CPUPPCState, vsr[32 + VT].u64[0]) + i * 4); + } + + tcg_temp_free_i32(tmp); +} + +/* * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword * * Counting the number of leading zero bits of each doubleword element in source @@ -1283,7 +1309,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) -GEN_VXFORM_NOA(vclzw, 1, 30) +GEN_VXFORM_TRANS(vclzw, 1, 30) GEN_VXFORM_TRANS(vclzd, 1, 31) GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) GEN_VXFORM_NOA_2(vnegd, 1, 24, 7) -- cgit v1.1 From 03ef074c04a219188bbd0094ee599bd50a0a374e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 18 Jul 2019 13:42:11 +1000 Subject: spapr: Implement dispatch tracking for tcg Implement cpu_exec_enter/exit on ppc which calls into new methods of the same name in PPCVirtualHypervisorClass. These are used by spapr to implement the splpar VPA dispatch counter initially. 
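Purely illustrative (this is not the spapr code added elsewhere in this series): one way a PPCVirtualHypervisorClass implementation could wire up the two new hooks. The type and function names below are invented, and the class-cast macro is assumed to follow the usual QOM pattern.

static void my_vhyp_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
{
    /* e.g. mark the vCPU as dispatched / update a per-vCPU dispatch counter */
}

static void my_vhyp_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
{
    /* e.g. mark the vCPU as preempted */
}

static void my_vhyp_class_init(ObjectClass *oc, void *data)
{
    PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);

    vhc->cpu_exec_enter = my_vhyp_cpu_exec_enter;
    vhc->cpu_exec_exit = my_vhyp_cpu_exec_exit;
}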
Signed-off-by: Nicholas Piggin Message-Id: <20190718034214.14948-2-npiggin@gmail.com> [dwg: Removed unnecessary CONFIG_USER_ONLY checks as suggested by gkurz] Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- target/ppc/cpu.h | 4 ++++ target/ppc/translate_init.inc.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'target/ppc') diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 630a25c..50245a8 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1224,6 +1224,10 @@ struct PPCVirtualHypervisorClass { void (*hpte_set_r)(PPCVirtualHypervisor *vhyp, hwaddr ptex, uint64_t pte1); void (*get_pate)(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry); target_ulong (*encode_hpt_for_kvm_pr)(PPCVirtualHypervisor *vhyp); +#ifndef CONFIG_USER_ONLY + void (*cpu_exec_enter)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu); + void (*cpu_exec_exit)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu); +#endif }; #define TYPE_PPC_VIRTUAL_HYPERVISOR "ppc-virtual-hypervisor" diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c index 9cd2033..c9fcd87 100644 --- a/target/ppc/translate_init.inc.c +++ b/target/ppc/translate_init.inc.c @@ -10469,6 +10469,28 @@ static bool ppc_cpu_is_big_endian(CPUState *cs) return !msr_le; } + +static void ppc_cpu_exec_enter(CPUState *cs) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + + if (cpu->vhyp) { + PPCVirtualHypervisorClass *vhc = + PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp); + vhc->cpu_exec_enter(cpu->vhyp, cpu); + } +} + +static void ppc_cpu_exec_exit(CPUState *cs) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + + if (cpu->vhyp) { + PPCVirtualHypervisorClass *vhc = + PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp); + vhc->cpu_exec_exit(cpu->vhyp, cpu); + } +} #endif static void ppc_cpu_instance_init(Object *obj) @@ -10622,6 +10644,11 @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data) cc->tcg_initialize = ppc_translate_init; cc->tlb_fill = ppc_cpu_tlb_fill; #endif +#ifndef CONFIG_USER_ONLY + cc->cpu_exec_enter = ppc_cpu_exec_enter; + cc->cpu_exec_exit = ppc_cpu_exec_exit; +#endif + cc->disas_set_info = ppc_disas_set_info; dc->fw_name = "PowerPC,UNKNOWN"; -- cgit v1.1 From 1e8f51e856621d3615a21cfea6f0baf055cbcce8 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Thu, 25 Jul 2019 09:15:08 -0500 Subject: ppc: remove idle_timer logic The logic is broken for multiple vcpu guests, also causing memory leak. The logic is in place to handle kvm not having KVM_CAP_PPC_IRQ_LEVEL, which is part of the kernel now since 2.6.37. Instead of fixing the leak, drop the redundant logic which is not excercised on new kernels anymore. Exit with error on older kernels. 
Signed-off-by: Shivaprasad G Bhat Message-Id: <156406409479.19996.7606556689856621111.stgit@lep8c.aus.stglabs.ibm.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- target/ppc/kvm.c | 75 ++++---------------------------------------------------- 1 file changed, 5 insertions(+), 70 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 6162a90..8c5b1f2 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -58,7 +58,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = { }; static int cap_interrupt_unset; -static int cap_interrupt_level; static int cap_segstate; static int cap_booke_sregs; static int cap_ppc_smt; @@ -90,25 +89,6 @@ static int cap_large_decr; static uint32_t debug_inst_opcode; /* - * XXX We have a race condition where we actually have a level triggered - * interrupt, but the infrastructure can't expose that yet, so the guest - * takes but ignores it, goes to sleep and never gets notified that there's - * still an interrupt pending. - * - * As a quick workaround, let's just wake up again 20 ms after we injected - * an interrupt. That way we can assure that we're always reinjecting - * interrupts in case the guest swallowed them. - */ -static QEMUTimer *idle_timer; - -static void kvm_kick_cpu(void *opaque) -{ - PowerPCCPU *cpu = opaque; - - qemu_cpu_kick(CPU(cpu)); -} - -/* * Check whether we are running with KVM-PR (instead of KVM-HV). This * should only be used for fallback tests - generally we should use * explicit capabilities for the features we want, rather than @@ -127,7 +107,6 @@ static int kvmppc_get_dec_bits(void); int kvm_arch_init(MachineState *ms, KVMState *s) { cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ); - cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL); cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE); cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS); cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE); @@ -163,9 +142,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s) */ cap_ppc_pvr_compat = false; - if (!cap_interrupt_level) { - fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the " - "VM to stall at times!\n"); + if (!kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL)) { + error_report("KVM: Host kernel doesn't have level irq capability"); + exit(1); } kvm_ppc_register_host_cpu_type(ms); @@ -493,8 +472,6 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } - idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu); - switch (cenv->mmu_model) { case POWERPC_MMU_BOOKE206: /* This target supports access to KVM's guest TLB */ @@ -1334,7 +1311,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level) return 0; } - if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) { + if (!kvm_enabled() || !cap_interrupt_unset) { return 0; } @@ -1351,49 +1328,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level) void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) { - PowerPCCPU *cpu = POWERPC_CPU(cs); - CPUPPCState *env = &cpu->env; - int r; - unsigned irq; - - qemu_mutex_lock_iothread(); - - /* - * PowerPC QEMU tracks the various core input pins (interrupt, - * critical interrupt, reset, etc) in PPC-specific - * env->irq_input_state. - */ - if (!cap_interrupt_level && - run->ready_for_interrupt_injection && - (cs->interrupt_request & CPU_INTERRUPT_HARD) && - (env->irq_input_state & (1 << PPC_INPUT_INT))) - { - /* - * For now KVM disregards the 'irq' argument. 
However, in the - * future KVM could cache it in-kernel to avoid a heavyweight - * exit when reading the UIC. - */ - irq = KVM_INTERRUPT_SET; - - trace_kvm_injected_interrupt(irq); - r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq); - if (r < 0) { - printf("cpu %d fail inject %x\n", cs->cpu_index, irq); - } - - /* Always wake up soon in case the interrupt was level based */ - timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / 50)); - } - - /* - * We don't know if there are more interrupts pending after - * this. However, the guest will return to userspace in the course - * of handling this one anyways, so we will get a chance to - * deliver the rest. - */ - - qemu_mutex_unlock_iothread(); + return; } MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) -- cgit v1.1 From cfc61ba62f30753849fe2b78da8b6d0a0a639db2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 16 Aug 2019 16:17:33 +1000 Subject: target/ppc: Add Directed Privileged Door-bell Exception State (DPDES) SPR DPDES stores a status of a doorbell message and if it is lost in migration, the destination CPU won't receive it. This does not hit us much as IPIs complete too quick to catch a pending one and even if we missed one, broadcasts happen often enough to wake that CPU. This defines DPDES and registers with KVM for migration. Signed-off-by: Alexey Kardashevskiy Message-Id: <20190816061733.53572-1-aik@ozlabs.ru> Signed-off-by: David Gibson --- target/ppc/cpu.h | 1 + target/ppc/translate_init.inc.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'target/ppc') diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 50245a8..4b35c8e 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1466,6 +1466,7 @@ typedef PowerPCCPU ArchCPU; #define SPR_MPC_ICTRL (0x09E) #define SPR_MPC_BAR (0x09F) #define SPR_PSPB (0x09F) +#define SPR_DPDES (0x0B0) #define SPR_DAWR (0x0B4) #define SPR_RPR (0x0BA) #define SPR_CIABR (0x0BB) diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c index c9fcd87..7e41ae1 100644 --- a/target/ppc/translate_init.inc.c +++ b/target/ppc/translate_init.inc.c @@ -8198,6 +8198,18 @@ static void gen_spr_power8_pspb(CPUPPCState *env) KVM_REG_PPC_PSPB, 0); } +static void gen_spr_power8_dpdes(CPUPPCState *env) +{ +#if !defined(CONFIG_USER_ONLY) + /* Directed Privileged Door-bell Exception State, used for IPI */ + spr_register_kvm_hv(env, SPR_DPDES, "DPDES", + SPR_NOACCESS, SPR_NOACCESS, + &spr_read_generic, SPR_NOACCESS, + &spr_read_generic, &spr_write_generic, + KVM_REG_PPC_DPDES, 0x00000000); +#endif +} + static void gen_spr_power8_ic(CPUPPCState *env) { #if !defined(CONFIG_USER_ONLY) @@ -8629,6 +8641,7 @@ static void init_proc_POWER8(CPUPPCState *env) gen_spr_power8_pmu_user(env); gen_spr_power8_tm(env); gen_spr_power8_pspb(env); + gen_spr_power8_dpdes(env); gen_spr_vtb(env); gen_spr_power8_ic(env); gen_spr_power8_book4(env); @@ -8817,6 +8830,7 @@ static void init_proc_POWER9(CPUPPCState *env) gen_spr_power8_pmu_user(env); gen_spr_power8_tm(env); gen_spr_power8_pspb(env); + gen_spr_power8_dpdes(env); gen_spr_vtb(env); gen_spr_power8_ic(env); gen_spr_power8_book4(env); -- cgit v1.1 From 31eb7dddacc242af244798854444c3497b515e9f Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Fri, 16 Aug 2019 14:03:23 -0500 Subject: ppc: Add support for 'mffsl' instruction ISA 3.0B added a set of Floating-Point Status and Control Register (FPSCR) instructions: mffsce, mffscdrn, mffscdrni, mffscrn, mffscrni, mffsl. 
This patch adds support for 'mffsl'. 'mffsl' is identical to 'mffs', except it only returns mode, status, and enable bits from the FPSCR. On CPUs without support for 'mffsl' (below ISA 3.0), the 'mffsl' instruction will execute identically to 'mffs'. Note: I renamed FPSCR_RN to FPSCR_RN0 so I could create an FPSCR_RN mask which is both bits of the FPSCR rounding mode, as defined in the ISA. I also fixed a typo in the definition of FPSCR_FR. Signed-off-by: Paul A. Clarke v4: - nit: added some braces to resolve a checkpatch complaint. v3: - Changed tcg_gen_and_i64 to tcg_gen_andi_i64, eliminating the need for a temporary, per review from Richard Henderson. v2: - I found that I copied too much of the 'mffs' implementation. The 'Rc' condition code bits are not needed for 'mffsl'. Removed. - I now free the (renamed) 'tmask' temporary. - I now bail early for older ISA to the original 'mffs' implementation. Message-Id: <1565982203-11048-1-git-send-email-pc@us.ibm.com> Signed-off-by: David Gibson --- target/ppc/cpu.h | 15 ++++++++++----- target/ppc/fpu_helper.c | 4 ++-- target/ppc/translate/fp-impl.inc.c | 22 ++++++++++++++++++++++ target/ppc/translate/fp-ops.inc.c | 4 +++- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'target/ppc') diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 4b35c8e..eaee1a5 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -591,7 +591,7 @@ enum { #define FPSCR_XE 3 /* Floating-point inexact exception enable */ #define FPSCR_NI 2 /* Floating-point non-IEEE mode */ #define FPSCR_RN1 1 -#define FPSCR_RN 0 /* Floating-point rounding control */ +#define FPSCR_RN0 0 /* Floating-point rounding control */ #define fpscr_fex (((env->fpscr) >> FPSCR_FEX) & 0x1) #define fpscr_vx (((env->fpscr) >> FPSCR_VX) & 0x1) #define fpscr_ox (((env->fpscr) >> FPSCR_OX) & 0x1) @@ -614,7 +614,7 @@ enum { #define fpscr_ze (((env->fpscr) >> FPSCR_ZE) & 0x1) #define fpscr_xe (((env->fpscr) >> FPSCR_XE) & 0x1) #define fpscr_ni (((env->fpscr) >> FPSCR_NI) & 0x1) -#define fpscr_rn (((env->fpscr) >> FPSCR_RN) & 0x3) +#define fpscr_rn (((env->fpscr) >> FPSCR_RN0) & 0x3) /* Invalid operation exception summary */ #define fpscr_ix ((env->fpscr) & ((1 << FPSCR_VXSNAN) | (1 << FPSCR_VXISI) | \ (1 << FPSCR_VXIDI) | (1 << FPSCR_VXZDZ) | \ @@ -640,7 +640,7 @@ enum { #define FP_VXZDZ (1ull << FPSCR_VXZDZ) #define FP_VXIMZ (1ull << FPSCR_VXIMZ) #define FP_VXVC (1ull << FPSCR_VXVC) -#define FP_FR (1ull << FSPCR_FR) +#define FP_FR (1ull << FPSCR_FR) #define FP_FI (1ull << FPSCR_FI) #define FP_C (1ull << FPSCR_C) #define FP_FL (1ull << FPSCR_FL) @@ -648,7 +648,7 @@ enum { #define FP_FE (1ull << FPSCR_FE) #define FP_FU (1ull << FPSCR_FU) #define FP_FPCC (FP_FL | FP_FG | FP_FE | FP_FU) -#define FP_FPRF (FP_C | FP_FL | FP_FG | FP_FE | FP_FU) +#define FP_FPRF (FP_C | FP_FPCC) #define FP_VXSOFT (1ull << FPSCR_VXSOFT) #define FP_VXSQRT (1ull << FPSCR_VXSQRT) #define FP_VXCVI (1ull << FPSCR_VXCVI) @@ -659,7 +659,12 @@ enum { #define FP_XE (1ull << FPSCR_XE) #define FP_NI (1ull << FPSCR_NI) #define FP_RN1 (1ull << FPSCR_RN1) -#define FP_RN (1ull << FPSCR_RN) +#define FP_RN0 (1ull << FPSCR_RN0) +#define FP_RN (FP_RN1 | FP_RN0) + +#define FP_MODE FP_RN +#define FP_ENABLES (FP_VE | FP_OE | FP_UE | FP_ZE | FP_XE) +#define FP_STATUS (FP_FR | FP_FI | FP_FPRF) /* the exception bits which can be cleared by mcrfs - includes FX */ #define FP_EX_CLEAR_BITS (FP_FX | FP_OX | FP_UX | FP_ZX | \ diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index f437c88..5611cf0 100644 --- a/target/ppc/fpu_helper.c 
From e6f1bfb21121b5aaf6f5ff339157c0d55e8b9fe2 Mon Sep 17 00:00:00 2001
From: "Paul A. Clarke"
Date: Mon, 19 Aug 2019 12:43:21 -0500
Subject: ppc: conform to processor User's Manual for xscvdpspn

The POWER8 and POWER9 User's Manuals specify the implementation behavior
for what the ISA leaves "undefined" for the xscvdpspn and xscvdpsp
instructions. This patch corrects the QEMU implementation to match the
hardware implementation for that case.

ISA 3.0B has xscvdpspn leaving its result in word 0 of the target register,
with the other words of the target register left "undefined".

The User's Manuals specify:
  VSX scalar convert from double-precision to single-precision
  (xscvdpsp, xscvdpspn). VSR[32:63] is set to VSR[0:31].

So, words 0 and 1 both contain the result.

Note: this is important because GCC, as of version 8 or so, assumes and
takes advantage of this behavior to optimize the following sequence:
  xscvdpspn vs0,vs1
  mffprwz   r8,f0
ISA 3.0B has xscvdpspn leaving its result in word 0 of the target register,
and mffprwz expecting its input to come from word 1 of the source register.
This sequence fails with QEMU, as a shift is required between those two
instructions. However, since the hardware splats the result to both words
0 and 1 of its output register, the shift is not necessary.

Expect a future revision of the ISA to specify this behavior.

Signed-off-by: Paul A. Clarke

v2
- Splitting patch "ppc: Three floating point fixes"; this is just one part.
- Updated commit message to clarify behavior is documented in User's Manuals.
- Updated commit message to correct which words are in output and source of
  xscvdpspn and mffprwz.
- No source changes to this part of the original patch.

Message-Id: <1566236601-22954-1-git-send-email-pc@us.ibm.com>
Signed-off-by: David Gibson
---
 target/ppc/fpu_helper.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'target/ppc')

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 5611cf0..23b9c97 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -2871,10 +2871,14 @@ void helper_xscvqpdp(CPUPPCState *env, uint32_t opcode,

 uint64_t helper_xscvdpspn(CPUPPCState *env, uint64_t xb)
 {
+    uint64_t result;
+
     float_status tstat = env->fp_status;
     set_float_exception_flags(0, &tstat);

-    return (uint64_t)float64_to_float32(xb, &tstat) << 32;
+    result = (uint64_t)float64_to_float32(xb, &tstat);
+
+    /* hardware replicates result to both words of the doubleword result. */
+    return (result << 32) | result;
 }

 uint64_t helper_xscvspdpn(CPUPPCState *env, uint64_t xb)
--
cgit v1.1
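
The word replication described above is easy to model outside of QEMU. A
minimal sketch follows (illustrative only; it mirrors just the splat, not the
softfloat conversion or any exception handling in the helper):

    #include <stdint.h>

    /* Given the single-precision image 'sp' produced by the conversion,
     * build the doubleword the helper now returns: 'sp' in word 0 and
     * again in word 1, matching the documented POWER8/POWER9 behavior. */
    static uint64_t splat_sp_result(uint32_t sp)
    {
        return ((uint64_t)sp << 32) | sp;
    }

With both words populated, the GCC sequence quoted in the commit message
works unchanged, since mffprwz may read the copy in word 1.
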
From a7b7b98318f7d483e016f0f412df2b8a7dc1d636 Mon Sep 17 00:00:00 2001
From: "Paul A. Clarke"
Date: Mon, 19 Aug 2019 14:19:48 -0500
Subject: ppc: Fix emulated INFINITY and NAN conversions

helper_todouble() was not properly converting INFINITY from 32 bit float
to 64 bit double.

(Normalized operand conversion is unchanged, other than indentation.)

Signed-off-by: Paul A. Clarke
Message-Id: <1566242388-9244-1-git-send-email-pc@us.ibm.com>
Reviewed-by: Richard Henderson
Signed-off-by: David Gibson
---
 target/ppc/fpu_helper.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'target/ppc')

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 23b9c97..52bcda2 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -58,10 +58,17 @@ uint64_t helper_todouble(uint32_t arg)
     uint64_t ret;

     if (likely(abs_arg >= 0x00800000)) {
-        /* Normalized operand, or Inf, or NaN. */
-        ret = (uint64_t)extract32(arg, 30, 2) << 62;
-        ret |= ((extract32(arg, 30, 1) ^ 1) * (uint64_t)7) << 59;
-        ret |= (uint64_t)extract32(arg, 0, 30) << 29;
+        if (unlikely(extract32(arg, 23, 8) == 0xff)) {
+            /* Inf or NAN. */
+            ret = (uint64_t)extract32(arg, 31, 1) << 63;
+            ret |= (uint64_t)0x7ff << 52;
+            ret |= (uint64_t)extract32(arg, 0, 23) << 29;
+        } else {
+            /* Normalized operand. */
+            ret = (uint64_t)extract32(arg, 30, 2) << 62;
+            ret |= ((extract32(arg, 30, 1) ^ 1) * (uint64_t)7) << 59;
+            ret |= (uint64_t)extract32(arg, 0, 30) << 29;
+        }
     } else {
         /* Zero or Denormalized operand. */
         ret = (uint64_t)extract32(arg, 31, 1) << 63;
--
cgit v1.1
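
For reference, the Inf/NaN path can be checked with a small stand-alone
function that re-derives the same bit manipulation. This is a sketch, not the
QEMU source; it assumes only the sign, an all-ones exponent, and the top of
the fraction need to be carried over:

    #include <stdint.h>

    static uint64_t inf_nan_to_double(uint32_t arg)
    {
        uint64_t ret = (uint64_t)(arg >> 31) << 63;   /* sign bit           */
        ret |= (uint64_t)0x7ff << 52;                 /* all-ones exponent  */
        ret |= (uint64_t)(arg & 0x7fffff) << 29;      /* fraction, high end */
        return ret;
    }

    /* e.g. 0x7f800000 (+Inf) -> 0x7ff0000000000000
     *      0x7fc00000 (qNaN) -> 0x7ff8000000000000 */
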
From c0e6616b6685ffdb4c5e091bc152e46e14703dd1 Mon Sep 17 00:00:00 2001
From: "Paul A. Clarke"
Date: Mon, 19 Aug 2019 16:42:16 -0500
Subject: ppc: Fix emulated single to double denormalized conversions

helper_todouble() was not properly converting any denormalized 32 bit
float to 64 bit double.

Fix-suggested-by: Richard Henderson
Signed-off-by: Paul A. Clarke

v2:
- Splitting patch "ppc: Three floating point fixes"; this is just one part.
- Original suggested "fix" was likely flawed. v2 is rewritten by
  Richard Henderson (Thanks, Richard!); I reformatted the comments in a
  couple of places, compiled, and tested.

Message-Id: <1566250936-14538-1-git-send-email-pc@us.ibm.com>
Signed-off-by: David Gibson
---
 target/ppc/fpu_helper.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'target/ppc')

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 52bcda2..07bc905 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -73,11 +73,20 @@ uint64_t helper_todouble(uint32_t arg)
         /* Zero or Denormalized operand. */
         ret = (uint64_t)extract32(arg, 31, 1) << 63;
         if (unlikely(abs_arg != 0)) {
-            /* Denormalized operand. */
-            int shift = clz32(abs_arg) - 9;
-            int exp = -126 - shift + 1023;
+            /*
+             * Denormalized operand.
+             * Shift fraction so that the msb is in the implicit bit position.
+             * Thus, shift is in the range [1:23].
+             */
+            int shift = clz32(abs_arg) - 8;
+            /*
+             * The first 3 terms compute the float64 exponent.  We then bias
+             * this result by -1 so that we can swallow the implicit bit below.
+             */
+            int exp = -126 - shift + 1023 - 1;
+
             ret |= (uint64_t)exp << 52;
-            ret |= abs_arg << (shift + 29);
+            ret += (uint64_t)abs_arg << (52 - 23 + shift);
         }
     }
     return ret;
--
cgit v1.1
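
To see the arithmetic above in action, here is a worked example as a
stand-alone sketch (illustrative only; sign handling and the zero case are
omitted, and clz32() is replaced by the GCC builtin):

    #include <stdint.h>

    static uint64_t denorm_to_double(uint32_t abs_arg)   /* abs_arg != 0 */
    {
        int shift = __builtin_clz(abs_arg) - 8;  /* msb -> implicit-bit slot */
        int exp = -126 - shift + 1023 - 1;       /* biased exponent, minus 1 */
        uint64_t ret = (uint64_t)exp << 52;
        /* '+' rather than '|': the shifted msb lands on bit 52 and carries
         * into the exponent field, restoring the -1 above and discarding
         * the implicit bit. */
        return ret + ((uint64_t)abs_arg << (52 - 23 + shift));
    }

    /* e.g. 0x00400000 (2^-127 as float32) -> 0x3800000000000000 (2^-127) */
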