target/i386: tcg: use cout to commonize add/adc/sub/sbb cases

Use the carry-out vector as the basis to compute AF, CF and OF. The cost is pretty much the same, because the carry-out is just four boolean operations, and the code is much smaller because add/adc/sub/sbb now share most of it.

A similar algorithm to what is used in target/i386/emulate can also be used for APX, in order to build the result of CCMP/CTEST with a new CC_OP_*. CCMP needs to place into the flags the result of either a subtraction or a constant value; CTEST likewise places into the flags the result of either an AND or a constant value. The new CC_OP for CCMP and CTEST would store, for a successful predicate:

- in DST and SRC2, the result of the operation;
- in SRC, a carry-out vector for CCMP or zero for CTEST.

If the default flag value is used, DST/SRC/SRC2 can be filled with constants:

- in DST, the negated ZF;
- in SRC's top 2 bits, a value that results in the desired OF and CF;
- in SRC2, a suitable value (any of 0/1/~0/~1) that can be used instead of DST to compute the desired SF and PF.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
author: Paolo Bonzini <pbonzini@redhat.com> 2025-04-03 18:20:06 +0200
committer: Paolo Bonzini <pbonzini@redhat.com> 2025-04-17 18:23:26 +0200
commit: 5dcdbd071253e249a76c7771bcf78eca3763a131 (patch)
tree: 1311dedfd62b837e21254a977c2788b21e93fe7f
parent: 767149d3d078356073a32238b313cee9d02db5d8 (diff)
download: qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.zip
qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.tar.gz
qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.tar.bz2
3 files changed, 52 insertions, 78 deletions
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 76f2444..7a8d695 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2843,4 +2843,29 @@ static inline bool ctl_has_irq(CPUX86State *env)
 # define TARGET_VSYSCALL_PAGE  (UINT64_C(-10) << 20)
 #endif
 
+/* majority(NOT a, b, c) = (a ^ b) ? b : c */
+#define MAJ_INV1(a, b, c)  ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
+
+/*
+ * ADD_COUT_VEC(x, y) = majority((x + y) ^ x ^ y, x, y)
+ *
+ * If two corresponding bits in x and y are the same, that's the carry
+ * independent of the value (x+y)^x^y.  Hence x^y can be replaced with
+ * 1 in (x+y)^x^y, resulting in majority(NOT (x+y), x, y)
+ */
+#define ADD_COUT_VEC(op1, op2, result) \
+   MAJ_INV1(result, op1, op2)
+
+/*
+ * SUB_COUT_VEC(x, y) = NOT majority(x, NOT y, (x - y) ^ x ^ NOT y)
+ *                    = majority(NOT x, y, (x - y) ^ x ^ y)
+ *
+ * Note that the carry out is actually a borrow, i.e. it is inverted.
+ * If two corresponding bits in x and y are different, the value of the
+ * bit in (x-y)^x^y likewise does not matter.  Hence, x^y can be replaced
+ * with 0 in (x-y)^x^y, resulting in majority(NOT x, y, x-y)
+ */
+#define SUB_COUT_VEC(op1, op2, result) \
+   MAJ_INV1(op1, op2, result)
+
 #endif /* I386_CPU_H */
diff --git a/target/i386/hvf/x86_flags.c b/target/i386/hvf/x86_flags.c
index 60ab4f0..0c75e04 100644
--- a/target/i386/hvf/x86_flags.c
+++ b/target/i386/hvf/x86_flags.c
@@ -45,31 +45,6 @@
 #define LF_MASK_CF     (0x01 << LF_BIT_CF)
 #define LF_MASK_PO     (0x01 << LF_BIT_PO)
 
-/* majority(NOT a, b, c) = (a ^ b) ? b : c */
-#define MAJ_INV1(a, b, c)  ((((a) ^ (b)) & ((b) ^ (c))) ^ (c))
-
-/*
- * ADD_COUT_VEC(x, y) = majority((x + y) ^ x ^ y, x, y)
- *
- * If two corresponding bits in x and y are the same, that's the carry
- * independent of the value (x+y)^x^y.  Hence x^y can be replaced with
- * 1 in (x+y)^x^y, resulting in majority(NOT (x+y), x, y)
- */
-#define ADD_COUT_VEC(op1, op2, result) \
-   MAJ_INV1(result, op1, op2)
-
-/*
- * SUB_COUT_VEC(x, y) = NOT majority(x, NOT y, (x - y) ^ x ^ NOT y)
- *                    = majority(NOT x, y, (x - y) ^ x ^ y)
- *
- * Note that the carry out is actually a borrow, i.e. it is inverted.
- * If two corresponding bits in x and y are different, the value of the
- * bit in (x-y)^x^y likewise does not matter.  Hence, x^y can be replaced
- * with 0 in (x-y)^x^y, resulting in majority(NOT x, y, x-y)
- */
-#define SUB_COUT_VEC(op1, op2, result) \
-   MAJ_INV1(op1, op2, result)
-
 /* ******************* */
 /* OSZAPC */
 /* ******************* */
diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index b821e5b..d8fd976 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -44,18 +44,32 @@
 
 /* dynamic flags computation */
 
-static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_cout, SUFFIX)(DATA_TYPE dst, DATA_TYPE carries)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-    DATA_TYPE src2 = dst - src1;
+    uint32_t af_cf, pf, zf, sf, of;
 
-    cf = dst < src1;
+    /* PF, ZF, SF computed from result.  */
     pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+
+    /*
+     * AF, CF, OF computed from carry out vector.  To compute AF and CF, rotate it
+     * left by one so cout(DATA_BITS - 1) is in bit 0 and cout(3) in bit 4.
+     *
+     * To compute OF, place the highest two carry bits into OF and the bit
+     * immediately to the right of it; then, adding CC_O / 2 XORs them.
+     */
+    af_cf = ((carries << 1) | (carries >> (DATA_BITS - 1))) & (CC_A | CC_C);
+    of = (lshift(carries, 12 - DATA_BITS) + CC_O / 2) & CC_O;
+    return pf + zf + sf + af_cf + of;
+}
+
+static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+{
+    DATA_TYPE src2 = dst - src1;
+    DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -66,25 +80,9 @@ static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                          DATA_TYPE src3)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-#ifdef WIDER_TYPE
-    WIDER_TYPE src13 = (WIDER_TYPE) src1 + (WIDER_TYPE) src3;
-    DATA_TYPE src2 = dst - src13;
-
-    cf = dst < src13;
-#else
     DATA_TYPE src2 = dst - src1 - src3;
-
-    cf = (src3 ? dst <= src1 : dst < src1);
-#endif
-
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & 0x10;
-    zf = (dst == 0) << 6;
-    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
@@ -101,16 +99,9 @@ static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
 
 static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
-    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src1 = dst + src2;
-
-    cf = src1 < src2;
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & CC_A;
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -123,25 +114,9 @@ static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                          DATA_TYPE src3)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-#ifdef WIDER_TYPE
-    WIDER_TYPE src23 = (WIDER_TYPE) src2 + (WIDER_TYPE) src3;
-    DATA_TYPE src1 = dst + src23;
-
-    cf = src1 < src23;
-#else
     DATA_TYPE src1 = dst + src2 + src3;
-
-    cf = (src3 ? src1 <= src2 : src1 < src2);
-#endif
-
-    pf = compute_pf(dst);
-    af = (dst ^ src1 ^ src2) & 0x10;
-    zf = (dst == 0) << 6;
-    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
+    return glue(compute_all_cout, SUFFIX)(dst, carries);
 }
 
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
@@ -286,6 +261,5 @@ static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 #undef DATA_BITS
 #undef SIGN_MASK
 #undef DATA_TYPE
-#undef DATA_MASK
 #undef SUFFIX
 #undef WIDER_TYPE
author	Paolo Bonzini <pbonzini@redhat.com>	2025-04-03 18:20:06 +0200
committer	Paolo Bonzini <pbonzini@redhat.com>	2025-04-17 18:23:26 +0200
commit	5dcdbd071253e249a76c7771bcf78eca3763a131 (patch)
tree	1311dedfd62b837e21254a977c2788b21e93fe7f
parent	767149d3d078356073a32238b313cee9d02db5d8 (diff)
download	qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.zip qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.tar.gz qemu-5dcdbd071253e249a76c7771bcf78eca3763a131.tar.bz2