aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2023-05-30 13:25:18 -0700
committerRichard Henderson <richard.henderson@linaro.org>2023-05-30 13:25:18 -0700
commit51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c (patch)
tree50fbedc5a85acaa17460515926605111b62b8f3b
parent7f027ee0ce1f79302acd7330d796fb7a9e2529b1 (diff)
parent276d77de503e8f5f5cbd3f7d94302ca12d1d982e (diff)
downloadqemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.zip
qemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.tar.gz
qemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.tar.bz2
Merge tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu into staging
Improvements to 128-bit atomics: - Separate __int128_t type and arithmetic detection - Support 128-bit load/store in backend for i386, aarch64, ppc64, s390x - Accelerate atomics via host/include/ Decodetree: - Add named field syntax - Move tests to meson # -----BEGIN PGP SIGNATURE----- # # iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmR2R10dHHJpY2hhcmQu # aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/bsgf/XLi8q+ITyoEAKwG4 # 6ML7DktLAdIs9Euah9twqe16U0BM0YzpKfymBfVVBKKaIa0524N4ZKIT3h6EeJo+ # f+ultqrpsnH+aQh4wc3ZCkEvRdhzhFT8VcoRTunJuJrbL3Y8n2ZSgODUL2a0tahT # Nn+zEPm8rzQanSKQHq5kyNBLpgTUKjc5wKfvy/WwttnFmkTnqzcuEA6nPVOVwOHC # lZBQCByIQWsHfFHUVJFvsFzBQbm0mAiW6FNKzPBkoXon0h/UZUI1lV+xXzgutFs+ # zR2O8IZwLYRu2wOWiTF8Nn2qQafkB3Dhwoq3JTEXhOqosOPExbIiWlsZDlPiKRJk # bwmQlg== # =XQMb # -----END PGP SIGNATURE----- # gpg: Signature made Tue 30 May 2023 11:58:37 AM PDT # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate] * tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu: (27 commits) tests/decode: Add tests for various named-field cases scripts/decodetree: Implement named field support scripts/decodetree: Implement a topological sort scripts/decodetree: Pass lvalue-formatter function to str_extract() docs: Document decodetree named field syntax tests/decode: Convert tests to meson decodetree: Do not remove output_file from /dev decodetree: Diagnose empty pattern group decodetree: Fix recursion in prop_format and build_tree decodetree: Add --test-for-error tcg: Remove TCG_TARGET_TLB_DISPLACEMENT_BITS accel/tcg: Add aarch64 store_atom_insert_al16 accel/tcg: Add aarch64 lse2 load_atom_extract_al16_or_al8 accel/tcg: Add x86_64 load_atom_extract_al16_or_al8 accel/tcg: Extract store_atom_insert_al16 to host header accel/tcg: Extract load_atom_extract_al16_or_al8 to host header tcg/s390x: Support 128-bit load/store tcg/ppc: Support 128-bit 
load/store tcg/aarch64: Support 128-bit load/store tcg/aarch64: Simplify constraints on qemu_ld/st ... Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--accel/tcg/ldst_atomicity.c.inc80
-rw-r--r--docs/devel/decodetree.rst33
-rw-r--r--host/include/aarch64/host/load-extract-al16-al8.h40
-rw-r--r--host/include/aarch64/host/store-insert-al16.h47
-rw-r--r--host/include/generic/host/load-extract-al16-al8.h45
-rw-r--r--host/include/generic/host/store-insert-al16.h50
-rw-r--r--host/include/x86_64/host/atomic128-ldst.h68
-rw-r--r--host/include/x86_64/host/load-extract-al16-al8.h50
-rw-r--r--include/qemu/int128.h4
-rw-r--r--meson.build15
-rw-r--r--scripts/decodetree.py265
-rw-r--r--tcg/aarch64/tcg-target-con-set.h4
-rw-r--r--tcg/aarch64/tcg-target-con-str.h1
-rw-r--r--tcg/aarch64/tcg-target.c.inc243
-rw-r--r--tcg/aarch64/tcg-target.h12
-rw-r--r--tcg/arm/tcg-target.h1
-rw-r--r--tcg/i386/tcg-target.c.inc191
-rw-r--r--tcg/i386/tcg-target.h5
-rw-r--r--tcg/mips/tcg-target.h1
-rw-r--r--tcg/ppc/tcg-target-con-set.h2
-rw-r--r--tcg/ppc/tcg-target-con-str.h1
-rw-r--r--tcg/ppc/tcg-target.c.inc108
-rw-r--r--tcg/ppc/tcg-target.h4
-rw-r--r--tcg/riscv/tcg-target.h1
-rw-r--r--tcg/s390x/tcg-target-con-set.h2
-rw-r--r--tcg/s390x/tcg-target.c.inc107
-rw-r--r--tcg/s390x/tcg-target.h3
-rw-r--r--tcg/sparc64/tcg-target.h1
-rw-r--r--tcg/tcg.c4
-rw-r--r--tcg/tci/tcg-target.h1
-rwxr-xr-xtests/decode/check.sh24
-rw-r--r--tests/decode/err_field10.decode7
-rw-r--r--tests/decode/err_field7.decode7
-rw-r--r--tests/decode/err_field8.decode8
-rw-r--r--tests/decode/err_field9.decode14
-rw-r--r--tests/decode/meson.build64
-rw-r--r--tests/decode/succ_named_field.decode19
-rw-r--r--tests/meson.build5
38 files changed, 1312 insertions, 225 deletions
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 0f6b3f8..2514899 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -9,6 +9,9 @@
* See the COPYING file in the top-level directory.
*/
+#include "host/load-extract-al16-al8.h"
+#include "host/store-insert-al16.h"
+
#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
@@ -156,7 +159,7 @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
- if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
+ if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
uint64_t *p = __builtin_assume_aligned(pv, 8);
return *p;
}
@@ -191,7 +194,7 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
- if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
+ if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
return *p;
}
#endif
@@ -312,40 +315,6 @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
}
/**
- * load_atom_extract_al16_or_al8:
- * @p: host address
- * @s: object size in bytes, @s <= 8.
- *
- * Load @s bytes from @p, when p % s != 0. If [p, p+s-1] does not
- * cross an 16-byte boundary then the access must be 16-byte atomic,
- * otherwise the access must be 8-byte atomic.
- */
-static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
-load_atom_extract_al16_or_al8(void *pv, int s)
-{
- uintptr_t pi = (uintptr_t)pv;
- int o = pi & 7;
- int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
- Int128 r;
-
- pv = (void *)(pi & ~7);
- if (pi & 8) {
- uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
- uint64_t a = qatomic_read__nocheck(p8);
- uint64_t b = qatomic_read__nocheck(p8 + 1);
-
- if (HOST_BIG_ENDIAN) {
- r = int128_make128(b, a);
- } else {
- r = int128_make128(a, b);
- }
- } else {
- r = atomic16_read_ro(pv);
- }
- return int128_getlo(int128_urshift(r, shr));
-}
-
-/**
* load_atom_4_by_2:
* @pv: host address
*
@@ -714,45 +683,6 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
}
/**
- * store_atom_insert_al16:
- * @p: host address
- * @val: shifted value to store
- * @msk: mask for value to store
- *
- * Atomically store @val to @p masked by @msk.
- */
-static void ATTRIBUTE_ATOMIC128_OPT
-store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
-{
-#if defined(CONFIG_ATOMIC128)
- __uint128_t *pu, old, new;
-
- /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
- pu = __builtin_assume_aligned(ps, 16);
- old = *pu;
- do {
- new = (old & ~msk.u) | val.u;
- } while (!__atomic_compare_exchange_n(pu, &old, new, true,
- __ATOMIC_RELAXED, __ATOMIC_RELAXED));
-#elif defined(CONFIG_CMPXCHG128)
- __uint128_t *pu, old, new;
-
- /*
- * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
- * defer to libatomic, so we must use __sync_*_compare_and_swap_16
- * and accept the sequential consistency that comes with it.
- */
- pu = __builtin_assume_aligned(ps, 16);
- do {
- old = *pu;
- new = (old & ~msk.u) | val.u;
- } while (!__sync_bool_compare_and_swap_16(pu, old, new));
-#else
- qemu_build_not_reached();
-#endif
-}
-
-/**
* store_bytes_leN:
* @pv: host address
* @size: number of bytes to store
diff --git a/docs/devel/decodetree.rst b/docs/devel/decodetree.rst
index 49ea50c..e3392aa 100644
--- a/docs/devel/decodetree.rst
+++ b/docs/devel/decodetree.rst
@@ -23,22 +23,42 @@ Fields
Syntax::
- field_def := '%' identifier ( unnamed_field )* ( !function=identifier )?
+ field_def := '%' identifier ( field )* ( !function=identifier )?
+ field := unnamed_field | named_field
unnamed_field := number ':' ( 's' ) number
+ named_field := identifier ':' ( 's' ) number
For *unnamed_field*, the first number is the least-significant bit position
of the field and the second number is the length of the field. If the 's' is
-present, the field is considered signed. If multiple ``unnamed_fields`` are
-present, they are concatenated. In this way one can define disjoint fields.
+present, the field is considered signed.
+
+A *named_field* refers to some other field in the instruction pattern
+or format. Regardless of the length of the other field where it is
+defined, it will be inserted into this field with the specified
+signedness and bit width.
+
+Field definitions that involve loops (i.e. where a field is defined
+directly or indirectly in terms of itself) are errors.
+
+A format can include fields that refer to named fields that are
+defined in the instruction pattern(s) that use the format.
+Conversely, an instruction pattern can include fields that refer to
+named fields that are defined in the format it uses. However you
+cannot currently do both at once (i.e. pattern P uses format F; F has
+a field A that refers to a named field B that is defined in P, and P
+has a field C that refers to a named field D that is defined in F).
+
+If multiple ``fields`` are present, they are concatenated.
+In this way one can define disjoint fields.
If ``!function`` is specified, the concatenated result is passed through the
named function, taking and returning an integral value.
-One may use ``!function`` with zero ``unnamed_fields``. This case is called
+One may use ``!function`` with zero ``fields``. This case is called
a *parameter*, and the named function is only passed the ``DisasContext``
and returns an integral value extracted from there.
-A field with no ``unnamed_fields`` and no ``!function`` is in error.
+A field with no ``fields`` and no ``!function`` is in error.
Field examples:
@@ -56,6 +76,9 @@ Field examples:
| %shimm8 5:s8 13:1 | expand_shimm8(sextract(i, 5, 8) << 1 | |
| !function=expand_shimm8 | extract(i, 13, 1)) |
+---------------------------+---------------------------------------------+
+| %sz_imm 10:2 sz:3 | expand_sz_imm(extract(i, 10, 2) << 3 | |
+| !function=expand_sz_imm | extract(a->sz, 0, 3)) |
++---------------------------+---------------------------------------------+
Argument Sets
=============
diff --git a/host/include/aarch64/host/load-extract-al16-al8.h b/host/include/aarch64/host/load-extract-al16-al8.h
new file mode 100644
index 0000000..bd677c5
--- /dev/null
+++ b/host/include/aarch64/host/load-extract-al16-al8.h
@@ -0,0 +1,40 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, AArch64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H
+#define AARCH64_LOAD_EXTRACT_AL16_AL8_H
+
+#include "host/cpuinfo.h"
+#include "tcg/debug-assert.h"
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [pv, pv+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ __int128_t *ptr_align = (__int128_t *)(pi & ~7);
+ int shr = (pi & 7) * 8;
+ uint64_t l, h;
+
+ /*
+ * With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned
+ * and single-copy atomic on the parts if 8-byte aligned.
+ * All we need do is align the pointer mod 8.
+ */
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
+ asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align));
+ return (l >> shr) | (h << (-shr & 63));
+}
+
+#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */
diff --git a/host/include/aarch64/host/store-insert-al16.h b/host/include/aarch64/host/store-insert-al16.h
new file mode 100644
index 0000000..1943155
--- /dev/null
+++ b/host/include/aarch64/host/store-insert-al16.h
@@ -0,0 +1,47 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic store insert into 128-bit, AArch64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef AARCH64_STORE_INSERT_AL16_H
+#define AARCH64_STORE_INSERT_AL16_H
+
+/**
+ * store_atom_insert_al16:
+ * @ps: host address
+ * @val: shifted value to store
+ * @msk: mask for value to store
+ *
+ * Atomically store @val to @p masked by @msk.
+ */
+static inline void ATTRIBUTE_ATOMIC128_OPT
+store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
+{
+ /*
+ * GCC only implements __sync* primitives for int128 on aarch64.
+ * We can do better without the barriers, and integrating the
+ * arithmetic into the load-exclusive/store-conditional pair.
+ */
+ uint64_t tl, th, vl, vh, ml, mh;
+ uint32_t fail;
+
+ qemu_build_assert(!HOST_BIG_ENDIAN);
+ vl = int128_getlo(val);
+ vh = int128_gethi(val);
+ ml = int128_getlo(msk);
+ mh = int128_gethi(msk);
+
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
+ "bic %[l], %[l], %[ml]\n\t"
+ "bic %[h], %[h], %[mh]\n\t"
+ "orr %[l], %[l], %[vl]\n\t"
+ "orr %[h], %[h], %[vh]\n\t"
+ "stxp %w[f], %[l], %[h], %[mem]\n\t"
+ "cbnz %w[f], 0b\n"
+ : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
+ : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
+}
+
+#endif /* AARCH64_STORE_INSERT_AL16_H */
diff --git a/host/include/generic/host/load-extract-al16-al8.h b/host/include/generic/host/load-extract-al16-al8.h
new file mode 100644
index 0000000..d955561
--- /dev/null
+++ b/host/include/generic/host/load-extract-al16-al8.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, generic version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H
+#define HOST_LOAD_EXTRACT_AL16_AL8_H
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [pv, pv+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
+load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ int o = pi & 7;
+ int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
+ Int128 r;
+
+ pv = (void *)(pi & ~7);
+ if (pi & 8) {
+ uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
+ uint64_t a = qatomic_read__nocheck(p8);
+ uint64_t b = qatomic_read__nocheck(p8 + 1);
+
+ if (HOST_BIG_ENDIAN) {
+ r = int128_make128(b, a);
+ } else {
+ r = int128_make128(a, b);
+ }
+ } else {
+ r = atomic16_read_ro(pv);
+ }
+ return int128_getlo(int128_urshift(r, shr));
+}
+
+#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */
diff --git a/host/include/generic/host/store-insert-al16.h b/host/include/generic/host/store-insert-al16.h
new file mode 100644
index 0000000..4a16621
--- /dev/null
+++ b/host/include/generic/host/store-insert-al16.h
@@ -0,0 +1,50 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic store insert into 128-bit, generic version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef HOST_STORE_INSERT_AL16_H
+#define HOST_STORE_INSERT_AL16_H
+
+/**
+ * store_atom_insert_al16:
+ * @ps: host address
+ * @val: shifted value to store
+ * @msk: mask for value to store
+ *
+ * Atomically store @val to @p masked by @msk.
+ */
+static inline void ATTRIBUTE_ATOMIC128_OPT
+store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
+{
+#if defined(CONFIG_ATOMIC128)
+ __uint128_t *pu;
+ Int128Alias old, new;
+
+ /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
+ pu = __builtin_assume_aligned(ps, 16);
+ old.u = *pu;
+ msk = int128_not(msk);
+ do {
+ new.s = int128_and(old.s, msk);
+ new.s = int128_or(new.s, val);
+ } while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+#else
+ Int128 old, new, cmp;
+
+ ps = __builtin_assume_aligned(ps, 16);
+ old = *ps;
+ msk = int128_not(msk);
+ do {
+ cmp = old;
+ new = int128_and(old, msk);
+ new = int128_or(new, val);
+ old = atomic16_cmpxchg(ps, cmp, new);
+ } while (int128_ne(cmp, old));
+#endif
+}
+
+#endif /* HOST_STORE_INSERT_AL16_H */
diff --git a/host/include/x86_64/host/atomic128-ldst.h b/host/include/x86_64/host/atomic128-ldst.h
new file mode 100644
index 0000000..adc9332
--- /dev/null
+++ b/host/include/x86_64/host/atomic128-ldst.h
@@ -0,0 +1,68 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Load/store for 128-bit atomic operations, x86_64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ *
+ * See docs/devel/atomics.rst for discussion about the guarantees each
+ * atomic primitive is meant to provide.
+ */
+
+#ifndef X86_64_ATOMIC128_LDST_H
+#define X86_64_ATOMIC128_LDST_H
+
+#ifdef CONFIG_INT128_TYPE
+#include "host/cpuinfo.h"
+#include "tcg/debug-assert.h"
+
+/*
+ * Through clang 16, with -mcx16, __atomic_load_n is incorrectly
+ * expanded to a read-write operation: lock cmpxchg16b.
+ */
+
+#define HAVE_ATOMIC128_RO likely(cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
+#define HAVE_ATOMIC128_RW 1
+
+static inline Int128 atomic16_read_ro(const Int128 *ptr)
+{
+ Int128Alias r;
+
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
+ asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr));
+
+ return r.s;
+}
+
+static inline Int128 atomic16_read_rw(Int128 *ptr)
+{
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
+ Int128Alias r;
+
+ if (HAVE_ATOMIC128_RO) {
+ asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align));
+ } else {
+ r.i = __sync_val_compare_and_swap_16(ptr_align, 0, 0);
+ }
+ return r.s;
+}
+
+static inline void atomic16_set(Int128 *ptr, Int128 val)
+{
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
+ Int128Alias new = { .s = val };
+
+ if (HAVE_ATOMIC128_RO) {
+ asm("vmovdqa %1, %0" : "=m"(*ptr_align) : "x" (new.i));
+ } else {
+ __int128_t old;
+ do {
+ old = *ptr_align;
+ } while (!__sync_bool_compare_and_swap_16(ptr_align, old, new.i));
+ }
+}
+#else
+/* Provide QEMU_ERROR stubs. */
+#include "host/include/generic/host/atomic128-ldst.h"
+#endif
+
+#endif /* X86_64_ATOMIC128_LDST_H */
diff --git a/host/include/x86_64/host/load-extract-al16-al8.h b/host/include/x86_64/host/load-extract-al16-al8.h
new file mode 100644
index 0000000..31b6fe8
--- /dev/null
+++ b/host/include/x86_64/host/load-extract-al16-al8.h
@@ -0,0 +1,50 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, x86_64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef X86_64_LOAD_EXTRACT_AL16_AL8_H
+#define X86_64_LOAD_EXTRACT_AL16_AL8_H
+
+#ifdef CONFIG_INT128_TYPE
+#include "host/cpuinfo.h"
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [pv, pv+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
+load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ __int128_t *ptr_align = (__int128_t *)(pi & ~7);
+ int shr = (pi & 7) * 8;
+ Int128Alias r;
+
+ /*
+ * ptr_align % 16 is now only 0 or 8.
+ * If the host supports atomic loads with VMOVDQU, then always use that,
+ * making the branch highly predictable. Otherwise we must use VMOVDQA
+ * when ptr_align % 16 == 0 for 16-byte atomicity.
+ */
+ if ((cpuinfo & CPUINFO_ATOMIC_VMOVDQU) || (pi & 8)) {
+ asm("vmovdqu %1, %0" : "=x" (r.i) : "m" (*ptr_align));
+ } else {
+ asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align));
+ }
+ return int128_getlo(int128_urshift(r.s, shr));
+}
+#else
+/* Fallback definition that must be optimized away, or error. */
+uint64_t QEMU_ERROR("unsupported atomic")
+ load_atom_extract_al16_or_al8(void *pv, int s);
+#endif
+
+#endif /* X86_64_LOAD_EXTRACT_AL16_AL8_H */
diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index 9e46cfae..73624e8 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -481,7 +481,7 @@ static inline void bswap128s(Int128 *s)
* a possible structure and the native types. Ease parameter passing
* via use of the transparent union extension.
*/
-#ifdef CONFIG_INT128
+#ifdef CONFIG_INT128_TYPE
typedef union {
__uint128_t u;
__int128_t i;
@@ -489,6 +489,6 @@ typedef union {
} Int128Alias __attribute__((transparent_union));
#else
typedef Int128 Int128Alias;
-#endif /* CONFIG_INT128 */
+#endif /* CONFIG_INT128_TYPE */
#endif /* INT128_H */
diff --git a/meson.build b/meson.build
index 2d48aa1..bc76ea9 100644
--- a/meson.build
+++ b/meson.build
@@ -2543,7 +2543,13 @@ config_host_data.set('CONFIG_ATOMIC64', cc.links('''
return 0;
}'''))
-has_int128 = cc.links('''
+has_int128_type = cc.compiles('''
+ __int128_t a;
+ __uint128_t b;
+ int main(void) { b = a; }''')
+config_host_data.set('CONFIG_INT128_TYPE', has_int128_type)
+
+has_int128 = has_int128_type and cc.links('''
__int128_t a;
__uint128_t b;
int main (void) {
@@ -2552,10 +2558,9 @@ has_int128 = cc.links('''
a = a * a;
return 0;
}''')
-
config_host_data.set('CONFIG_INT128', has_int128)
-if has_int128
+if has_int128_type
# "do we have 128-bit atomics which are handled inline and specifically not
# via libatomic". The reason we can't use libatomic is documented in the
# comment starting "GCC is a house divided" in include/qemu/atomic128.h.
@@ -2564,7 +2569,7 @@ if has_int128
# __alignof(unsigned __int128) for the host.
atomic_test_128 = '''
int main(int ac, char **av) {
- unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16);
+ __uint128_t *p = __builtin_assume_aligned(av[ac - 1], 16);
p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED);
__atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED);
__atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
@@ -2586,7 +2591,7 @@ if has_int128
config_host_data.set('CONFIG_CMPXCHG128', cc.links('''
int main(void)
{
- unsigned __int128 x = 0, y = 0;
+ __uint128_t x = 0, y = 0;
__sync_val_compare_and_swap_16(&x, y, x);
return 0;
}
diff --git a/scripts/decodetree.py b/scripts/decodetree.py
index a03dc6b..13db585 100644
--- a/scripts/decodetree.py
+++ b/scripts/decodetree.py
@@ -35,6 +35,7 @@ arguments = {}
formats = {}
allpatterns = []
anyextern = False
+testforerror = False
translate_prefix = 'trans'
translate_scope = 'static '
@@ -53,6 +54,80 @@ re_fld_ident = '%[a-zA-Z0-9_]*'
re_fmt_ident = '@[a-zA-Z0-9_]*'
re_pat_ident = '[a-zA-Z0-9_]*'
+# Local implementation of a topological sort. We use the same API that
+# the Python graphlib does, so that when QEMU moves forward to a
+# baseline of Python 3.9 or newer this code can all be dropped and
+# replaced with:
+# from graphlib import TopologicalSorter, CycleError
+#
+# https://docs.python.org/3.9/library/graphlib.html#graphlib.TopologicalSorter
+#
+# We only implement the parts of TopologicalSorter we care about:
+# ts = TopologicalSorter(graph=None)
+# create the sorter. graph is a dictionary whose keys are
+# nodes and whose values are lists of the predecessors of that node.
+# (That is, if graph contains "A" -> ["B", "C"] then we must output
+# B and C before A.)
+# ts.static_order()
+# returns a list of all the nodes in sorted order, or raises CycleError
+# CycleError
+# exception raised if there are cycles in the graph. The second
+# element in the args attribute is a list of nodes which form a
+# cycle; the first and last element are the same, eg [a, b, c, a]
+# (Our implementation doesn't give the order correctly.)
+#
+# For our purposes we can assume that the data set is always small
+# (typically 10 nodes or less, actual links in the graph very rare),
+# so we don't need to worry about efficiency of implementation.
+#
+# The core of this implementation is from
+# https://code.activestate.com/recipes/578272-topological-sort/
+# (but updated to Python 3), and is under the MIT license.
+
+class CycleError(ValueError):
+ """Subclass of ValueError raised if cycles exist in the graph"""
+ pass
+
+class TopologicalSorter:
+ """Topologically sort a graph"""
+ def __init__(self, graph=None):
+ self.graph = graph
+
+ def static_order(self):
+ # We do the sort right here, unlike the stdlib version
+ from functools import reduce
+ data = {}
+ r = []
+
+ if not self.graph:
+ return []
+
+ # This code wants the values in the dict to be specifically sets
+ for k, v in self.graph.items():
+ data[k] = set(v)
+
+ # Find all items that don't depend on anything.
+ extra_items_in_deps = (reduce(set.union, data.values())
+ - set(data.keys()))
+ # Add empty dependencies where needed
+ data.update({item:{} for item in extra_items_in_deps})
+ while True:
+ ordered = set(item for item, dep in data.items() if not dep)
+ if not ordered:
+ break
+ r.extend(ordered)
+ data = {item: (dep - ordered)
+ for item, dep in data.items()
+ if item not in ordered}
+ if data:
+ # This doesn't give as nice results as the stdlib, which
+ # gives you the cycle by listing the nodes in order. Here
+ # we only know the nodes in the cycle but not their order.
+ raise CycleError(f'nodes are in a cycle', list(data.keys()))
+
+ return r
+# end TopologicalSorter
+
def error_with_file(file, lineno, *args):
"""Print an error message from file:line and args and exit."""
global output_file
@@ -70,8 +145,13 @@ def error_with_file(file, lineno, *args):
if output_file and output_fd:
output_fd.close()
- os.remove(output_file)
- exit(1)
+ # Do not try to remove e.g. -o /dev/null
+ if not output_file.startswith("/dev"):
+ try:
+ os.remove(output_file)
+ except PermissionError:
+ pass
+ exit(0 if testforerror else 1)
# end error_with_file
@@ -205,11 +285,14 @@ class Field:
s = ''
return str(self.pos) + ':' + s + str(self.len)
- def str_extract(self):
+ def str_extract(self, lvalue_formatter):
global bitop_width
s = 's' if self.sign else ''
return f'{s}extract{bitop_width}(insn, {self.pos}, {self.len})'
+ def referenced_fields(self):
+ return []
+
def __eq__(self, other):
return self.sign == other.sign and self.mask == other.mask
@@ -228,12 +311,12 @@ class MultiField:
def __str__(self):
return str(self.subs)
- def str_extract(self):
+ def str_extract(self, lvalue_formatter):
global bitop_width
ret = '0'
pos = 0
for f in reversed(self.subs):
- ext = f.str_extract()
+ ext = f.str_extract(lvalue_formatter)
if pos == 0:
ret = ext
else:
@@ -241,6 +324,12 @@ class MultiField:
pos += f.len
return ret
+ def referenced_fields(self):
+ l = []
+ for f in self.subs:
+ l.extend(f.referenced_fields())
+ return l
+
def __ne__(self, other):
if len(self.subs) != len(other.subs):
return True
@@ -264,9 +353,12 @@ class ConstField:
def __str__(self):
return str(self.value)
- def str_extract(self):
+ def str_extract(self, lvalue_formatter):
return str(self.value)
+ def referenced_fields(self):
+ return []
+
def __cmp__(self, other):
return self.value - other.value
# end ConstField
@@ -283,8 +375,12 @@ class FunctionField:
def __str__(self):
return self.func + '(' + str(self.base) + ')'
- def str_extract(self):
- return self.func + '(ctx, ' + self.base.str_extract() + ')'
+ def str_extract(self, lvalue_formatter):
+ return (self.func + '(ctx, '
+ + self.base.str_extract(lvalue_formatter) + ')')
+
+ def referenced_fields(self):
+ return self.base.referenced_fields()
def __eq__(self, other):
return self.func == other.func and self.base == other.base
@@ -304,9 +400,12 @@ class ParameterField:
def __str__(self):
return self.func
- def str_extract(self):
+ def str_extract(self, lvalue_formatter):
return self.func + '(ctx)'
+ def referenced_fields(self):
+ return []
+
def __eq__(self, other):
return self.func == other.func
@@ -314,6 +413,32 @@ class ParameterField:
return not self.__eq__(other)
# end ParameterField
+class NamedField:
+ """Class representing a field already named in the pattern"""
+ def __init__(self, name, sign, len):
+ self.mask = 0
+ self.sign = sign
+ self.len = len
+ self.name = name
+
+ def __str__(self):
+ return self.name
+
+ def str_extract(self, lvalue_formatter):
+ global bitop_width
+ s = 's' if self.sign else ''
+ lvalue = lvalue_formatter(self.name)
+ return f'{s}extract{bitop_width}({lvalue}, 0, {self.len})'
+
+ def referenced_fields(self):
+ return [self.name]
+
+ def __eq__(self, other):
+ return self.name == other.name
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+# end NamedField
class Arguments:
"""Class representing the extracted fields of a format"""
@@ -337,7 +462,6 @@ class Arguments:
output('} ', self.struct_name(), ';\n\n')
# end Arguments
-
class General:
"""Common code between instruction formats and instruction patterns"""
def __init__(self, name, lineno, base, fixb, fixm, udfm, fldm, flds, w):
@@ -351,12 +475,59 @@ class General:
self.fieldmask = fldm
self.fields = flds
self.width = w
+ self.dangling = None
def __str__(self):
return self.name + ' ' + str_match_bits(self.fixedbits, self.fixedmask)
def str1(self, i):
return str_indent(i) + self.__str__()
+
+ def dangling_references(self):
+ # Return a list of all named references which aren't satisfied
+ # directly by this format/pattern. This will be either:
+ # * a format referring to a field which is specified by the
+ # pattern(s) using it
+ # * a pattern referring to a field which is specified by the
+ # format it uses
+ # * a user error (referring to a field that doesn't exist at all)
+ if self.dangling is None:
+ # Compute this once and cache the answer
+ dangling = []
+ for n, f in self.fields.items():
+ for r in f.referenced_fields():
+ if r not in self.fields:
+ dangling.append(r)
+ self.dangling = dangling
+ return self.dangling
+
+ def output_fields(self, indent, lvalue_formatter):
+ # We use a topological sort to ensure that any use of NamedField
+ # comes after the initialization of the field it is referencing.
+ graph = {}
+ for n, f in self.fields.items():
+ refs = f.referenced_fields()
+ graph[n] = refs
+
+ try:
+ ts = TopologicalSorter(graph)
+ for n in ts.static_order():
+ # We only want to emit assignments for the keys
+ # in our fields list, not for anything that ends up
+ # in the tsort graph only because it was referenced as
+ # a NamedField.
+ try:
+ f = self.fields[n]
+ output(indent, lvalue_formatter(n), ' = ',
+ f.str_extract(lvalue_formatter), ';\n')
+ except KeyError:
+ pass
+ except CycleError as e:
+ # The second element of args is a list of nodes which form
+ # a cycle (there might be others too, but only one is reported).
+ # Pretty-print it to tell the user.
+ cycle = ' => '.join(e.args[1])
+ error(self.lineno, 'field definitions form a cycle: ' + cycle)
# end General
@@ -370,8 +541,7 @@ class Format(General):
def output_extract(self):
output('static void ', self.extract_name(), '(DisasContext *ctx, ',
self.base.struct_name(), ' *a, ', insntype, ' insn)\n{\n')
- for n, f in self.fields.items():
- output(' a->', n, ' = ', f.str_extract(), ';\n')
+ self.output_fields(str_indent(4), lambda n: 'a->' + n)
output('}\n\n')
# end Format
@@ -392,11 +562,36 @@ class Pattern(General):
ind = str_indent(i)
arg = self.base.base.name
output(ind, '/* ', self.file, ':', str(self.lineno), ' */\n')
+ # We might have named references in the format that refer to fields
+ # in the pattern, or named references in the pattern that refer
+ # to fields in the format. This affects whether we extract the fields
+ # for the format before or after the ones for the pattern.
+ # For simplicity we don't allow cross references in both directions.
+ # This is also where we catch the syntax error of referring to
+ # a nonexistent field.
+ fmt_refs = self.base.dangling_references()
+ for r in fmt_refs:
+ if r not in self.fields:
+ error(self.lineno, f'format refers to undefined field {r}')
+ pat_refs = self.dangling_references()
+ for r in pat_refs:
+ if r not in self.base.fields:
+ error(self.lineno, f'pattern refers to undefined field {r}')
+ if pat_refs and fmt_refs:
+ error(self.lineno, ('pattern that uses fields defined in format '
+ 'cannot use format that uses fields defined '
+ 'in pattern'))
+ if fmt_refs:
+ # pattern fields first
+ self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n)
+ assert not extracted, "dangling fmt refs but it was already extracted"
if not extracted:
output(ind, self.base.extract_name(),
'(ctx, &u.f_', arg, ', insn);\n')
- for n, f in self.fields.items():
- output(ind, 'u.f_', arg, '.', n, ' = ', f.str_extract(), ';\n')
+ if not fmt_refs:
+ # pattern fields last
+ self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n)
+
output(ind, 'if (', translate_prefix, '_', self.name,
'(ctx, &u.f_', arg, ')) return true;\n')
@@ -473,7 +668,7 @@ class MultiPattern(General):
def prop_format(self):
for p in self.pats:
- p.build_tree()
+ p.prop_format()
def prop_width(self):
width = None
@@ -505,6 +700,12 @@ class IncMultiPattern(MultiPattern):
output(ind, '}\n')
else:
p.output_code(i, extracted, p.fixedbits, p.fixedmask)
+
+ def build_tree(self):
+ if not self.pats:
+ error_with_file(self.file, self.lineno, 'empty pattern group')
+ super().build_tree()
+
#end IncMultiPattern
@@ -536,8 +737,10 @@ class Tree:
ind = str_indent(i)
# If we identified all nodes below have the same format,
- # extract the fields now.
- if not extracted and self.base:
+ # extract the fields now. But don't do it if the format relies
+ # on named fields from the insn pattern, as those won't have
+ # been initialised at this point.
+ if not extracted and self.base and not self.base.dangling_references():
output(ind, self.base.extract_name(),
'(ctx, &u.f_', self.base.base.name, ', insn);\n')
extracted = True
@@ -623,7 +826,7 @@ class ExcMultiPattern(MultiPattern):
return t
def build_tree(self):
- super().prop_format()
+ super().build_tree()
self.tree = self.__build_tree(self.pats, self.fixedbits,
self.fixedmask)
@@ -659,6 +862,7 @@ def parse_field(lineno, name, toks):
"""Parse one instruction field from TOKS at LINENO"""
global fields
global insnwidth
+ global re_C_ident
# A "simple" field will have only one entry;
# a "multifield" will have several.
@@ -673,6 +877,25 @@ def parse_field(lineno, name, toks):
func = func[1]
continue
+ if re.fullmatch(re_C_ident + ':s[0-9]+', t):
+ # Signed named field: split on ':s' so the length parses as an int
+ subtoks = t.split(':s')
+ n = subtoks[0]
+ le = int(subtoks[1])
+ f = NamedField(n, True, le)
+ subs.append(f)
+ width += le
+ continue
+ if re.fullmatch(re_C_ident + ':[0-9]+', t):
+ # Unsigned named field
+ subtoks = t.split(':')
+ n = subtoks[0]
+ le = int(subtoks[1])
+ f = NamedField(n, False, le)
+ subs.append(f)
+ width += le
+ continue
+
if re.fullmatch('[0-9]+:s[0-9]+', t):
# Signed field extract
subtoks = t.split(':s')
@@ -1286,11 +1509,12 @@ def main():
global bitop_width
global variablewidth
global anyextern
+ global testforerror
decode_scope = 'static '
long_opts = ['decode=', 'translate=', 'output=', 'insnwidth=',
- 'static-decode=', 'varinsnwidth=']
+ 'static-decode=', 'varinsnwidth=', 'test-for-error']
try:
(opts, args) = getopt.gnu_getopt(sys.argv[1:], 'o:vw:', long_opts)
except getopt.GetoptError as err:
@@ -1319,6 +1543,8 @@ def main():
bitop_width = 64
elif insnwidth != 32:
error(0, 'cannot handle insns of width', insnwidth)
+ elif o == '--test-for-error':
+ testforerror = True
else:
assert False, 'unhandled option'
@@ -1417,6 +1643,7 @@ def main():
if output_file:
output_fd.close()
+ exit(1 if testforerror else 0)
# end main
diff --git a/tcg/aarch64/tcg-target-con-set.h b/tcg/aarch64/tcg-target-con-set.h
index d6c6866..3fdee26 100644
--- a/tcg/aarch64/tcg-target-con-set.h
+++ b/tcg/aarch64/tcg-target-con-set.h
@@ -10,11 +10,10 @@
* tcg-target-con-str.h; the constraint combination is inclusive or.
*/
C_O0_I1(r)
-C_O0_I2(lZ, l)
C_O0_I2(r, rA)
C_O0_I2(rZ, r)
C_O0_I2(w, r)
-C_O1_I1(r, l)
+C_O0_I3(rZ, rZ, r)
C_O1_I1(r, r)
C_O1_I1(w, r)
C_O1_I1(w, w)
@@ -33,4 +32,5 @@ C_O1_I2(w, w, wO)
C_O1_I2(w, w, wZ)
C_O1_I3(w, w, w, w)
C_O1_I4(r, r, rA, rZ, rZ)
+C_O2_I1(r, r, r)
C_O2_I4(r, r, rZ, rZ, rA, rMZ)
diff --git a/tcg/aarch64/tcg-target-con-str.h b/tcg/aarch64/tcg-target-con-str.h
index 00adb64..fb1a845 100644
--- a/tcg/aarch64/tcg-target-con-str.h
+++ b/tcg/aarch64/tcg-target-con-str.h
@@ -9,7 +9,6 @@
* REGS(letter, register_mask)
*/
REGS('r', ALL_GENERAL_REGS)
-REGS('l', ALL_QLDST_REGS)
REGS('w', ALL_VECTOR_REGS)
/*
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 8428366..261ad25 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -40,11 +40,12 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,
- TCG_REG_X16, TCG_REG_X17,
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7,
+ /* X16 reserved as temporary */
+ /* X17 reserved as temporary */
/* X18 reserved by system */
/* X19 reserved for AREG0 */
/* X29 reserved as fp */
@@ -71,8 +72,10 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
return TCG_REG_X0 + slot;
}
-#define TCG_REG_TMP TCG_REG_X30
-#define TCG_VEC_TMP TCG_REG_V31
+#define TCG_REG_TMP0 TCG_REG_X16
+#define TCG_REG_TMP1 TCG_REG_X17
+#define TCG_REG_TMP2 TCG_REG_X30
+#define TCG_VEC_TMP0 TCG_REG_V31
#ifndef CONFIG_SOFTMMU
#define TCG_REG_GUEST_BASE TCG_REG_X28
@@ -129,14 +132,6 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
#define ALL_GENERAL_REGS 0xffffffffu
#define ALL_VECTOR_REGS 0xffffffff00000000ull
-#ifdef CONFIG_SOFTMMU
-#define ALL_QLDST_REGS \
- (ALL_GENERAL_REGS & ~((1 << TCG_REG_X0) | (1 << TCG_REG_X1) | \
- (1 << TCG_REG_X2) | (1 << TCG_REG_X3)))
-#else
-#define ALL_QLDST_REGS ALL_GENERAL_REGS
-#endif
-
/* Match a constant valid for addition (12-bit, optionally shifted). */
static inline bool is_aimm(uint64_t val)
{
@@ -390,6 +385,10 @@ typedef enum {
I3305_LDR_v64 = 0x5c000000,
I3305_LDR_v128 = 0x9c000000,
+ /* Load/store exclusive. */
+ I3306_LDXP = 0xc8600000,
+ I3306_STXP = 0xc8200000,
+
/* Load/store register. Described here as 3.3.12, but the helper
that emits them can transform to 3.3.10 or 3.3.13. */
I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@@ -454,6 +453,9 @@ typedef enum {
I3406_ADR = 0x10000000,
I3406_ADRP = 0x90000000,
+ /* Add/subtract extended register instructions. */
+ I3501_ADD = 0x0b200000,
+
/* Add/subtract shifted register instructions (without a shift). */
I3502_ADD = 0x0b000000,
I3502_ADDS = 0x2b000000,
@@ -624,6 +626,12 @@ static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
}
+static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
+ TCGReg rt, TCGReg rt2, TCGReg rn)
+{
+ tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
+}
+
static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
TCGReg rt, int imm19)
{
@@ -706,6 +714,14 @@ static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
}
+static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
+ TCGType sf, TCGReg rd, TCGReg rn,
+ TCGReg rm, int opt, int imm3)
+{
+ tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
+ imm3 << 10 | rn << 5 | rd);
+}
+
/* This function is for both 3.5.2 (Add/Subtract shifted register), for
the rare occasion when we actually want to supply a shift amount. */
static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
@@ -984,7 +1000,7 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg r, TCGReg base, intptr_t offset)
{
- TCGReg temp = TCG_REG_TMP;
+ TCGReg temp = TCG_REG_TMP0;
if (offset < -0xffffff || offset > 0xffffff) {
tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
@@ -1136,8 +1152,8 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
}
/* Worst-case scenario, move offset to temp register, use reg offset. */
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
- tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
+ tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
}
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
@@ -1353,8 +1369,8 @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
if (offset == sextract64(offset, 0, 26)) {
tcg_out_insn(s, 3206, BL, offset);
} else {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
- tcg_out_insn(s, 3207, BLR, TCG_REG_TMP);
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
+ tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
}
}
@@ -1491,7 +1507,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
AArch64Insn insn;
if (rl == ah || (!const_bh && rl == bh)) {
- rl = TCG_REG_TMP;
+ rl = TCG_REG_TMP0;
}
if (const_bl) {
@@ -1508,7 +1524,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
possibility of adding 0+const in the low part, and the
immediate add instructions encode XSP not XZR. Don't try
anything more elaborate here than loading another zero. */
- al = TCG_REG_TMP;
+ al = TCG_REG_TMP0;
tcg_out_movi(s, ext, al, 0);
}
tcg_out_insn_3401(s, insn, ext, rl, al, bl);
@@ -1549,7 +1565,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
{
TCGReg a1 = a0;
if (is_ctz) {
- a1 = TCG_REG_TMP;
+ a1 = TCG_REG_TMP0;
tcg_out_insn(s, 3507, RBIT, ext, a1, a0);
}
if (const_b && b == (ext ? 64 : 32)) {
@@ -1558,7 +1574,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
AArch64Insn sel = I3506_CSEL;
tcg_out_cmp(s, ext, a0, 0, 1);
- tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP, a1);
+ tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);
if (const_b) {
if (b == -1) {
@@ -1571,7 +1587,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
b = d;
}
}
- tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP, b, TCG_COND_NE);
+ tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
}
}
@@ -1588,7 +1604,7 @@ bool tcg_target_has_memory_bswap(MemOp memop)
}
static const TCGLdstHelperParam ldst_helper_param = {
- .ntmp = 1, .tmp = { TCG_REG_TMP }
+ .ntmp = 1, .tmp = { TCG_REG_TMP0 }
};
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1633,19 +1649,19 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGType addr_type = s->addr_type;
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
+ MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
h->aa = atom_and_align_for_opc(s, opc,
have_lse2 ? MO_ATOM_WITHIN16
: MO_ATOM_IFALIGN,
- false);
+ s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
- unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1u << s_bits) - 1;
unsigned mem_index = get_mmuidx(oi);
- TCGReg x3;
+ TCGReg addr_adj;
TCGType mask_type;
uint64_t compare_mask;
@@ -1657,27 +1673,27 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32
? TCG_TYPE_I64 : TCG_TYPE_I32);
- /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {x0,x1}. */
+ /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
- tcg_out_insn(s, 3314, LDP, TCG_REG_X0, TCG_REG_X1, TCG_AREG0,
+ tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
TLB_MASK_TABLE_OFS(mem_index), 1, 0);
/* Extract the TLB index from the address into X0. */
tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
- TCG_REG_X0, TCG_REG_X0, addr_reg,
+ TCG_REG_TMP0, TCG_REG_TMP0, addr_reg,
s->page_bits - CPU_TLB_ENTRY_BITS);
- /* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */
- tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0);
+ /* Add the tlb_table pointer, forming the CPUTLBEntry address in TMP1. */
+ tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);
- /* Load the tlb comparator into X0, and the fast path addend into X1. */
- tcg_out_ld(s, addr_type, TCG_REG_X0, TCG_REG_X1,
+ /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */
+ tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
is_ld ? offsetof(CPUTLBEntry, addr_read)
: offsetof(CPUTLBEntry, addr_write));
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1,
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
offsetof(CPUTLBEntry, addend));
/*
@@ -1686,25 +1702,26 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
* cross pages using the address of the last byte of the access.
*/
if (a_mask >= s_mask) {
- x3 = addr_reg;
+ addr_adj = addr_reg;
} else {
+ addr_adj = TCG_REG_TMP2;
tcg_out_insn(s, 3401, ADDI, addr_type,
- TCG_REG_X3, addr_reg, s_mask - a_mask);
- x3 = TCG_REG_X3;
+ addr_adj, addr_reg, s_mask - a_mask);
}
compare_mask = (uint64_t)s->page_mask | a_mask;
- /* Store the page mask part of the address into X3. */
- tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_X3, x3, compare_mask);
+ /* Store the page mask part of the address into TMP2. */
+ tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2,
+ addr_adj, compare_mask);
/* Perform the address comparison. */
- tcg_out_cmp(s, addr_type, TCG_REG_X0, TCG_REG_X3, 0);
+ tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0);
/* If not equal, we jump to the slow path. */
ldst->label_ptr[0] = s->code_ptr;
tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
- h->base = TCG_REG_X1,
+ h->base = TCG_REG_TMP1;
h->index = addr_reg;
h->index_ext = addr_type;
#else
@@ -1822,6 +1839,108 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
}
}
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
+ TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{
+ TCGLabelQemuLdst *ldst;
+ HostAddress h;
+ TCGReg base;
+ bool use_pair;
+
+ ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
+
+ /* Compose the final address, as LDP/STP have no indexing. */
+ if (h.index == TCG_REG_XZR) {
+ base = h.base;
+ } else {
+ base = TCG_REG_TMP2;
+ if (h.index_ext == TCG_TYPE_I32) {
+ /* add base, base, index, uxtw */
+ tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base,
+ h.base, h.index, MO_32, 0);
+ } else {
+ /* add base, base, index */
+ tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index);
+ }
+ }
+
+ use_pair = h.aa.atom < MO_128 || have_lse2;
+
+ if (!use_pair) {
+ tcg_insn_unit *branch = NULL;
+ TCGReg ll, lh, sl, sh;
+
+ /*
+ * If we have already checked for 16-byte alignment, that's all
+ * we need. Otherwise we have determined that misaligned atomicity
+ * may be handled with two 8-byte loads.
+ */
+ if (h.aa.align < MO_128) {
+ /*
+ * TODO: align should be MO_64, so we only need test bit 3,
+ * which means we could use TBNZ instead of ANDS+B_C.
+ */
+ tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15);
+ branch = s->code_ptr;
+ tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
+ use_pair = true;
+ }
+
+ if (is_ld) {
+ /*
+ * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
+ * ldxp lo, hi, [base]
+ * stxp t0, lo, hi, [base]
+ * cbnz t0, .-8
+ * Require no overlap between data{lo,hi} and base.
+ */
+ if (datalo == base || datahi == base) {
+ tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base);
+ base = TCG_REG_TMP2;
+ }
+ ll = sl = datalo;
+ lh = sh = datahi;
+ } else {
+ /*
+ * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
+ * 1: ldxp t0, t1, [base]
+ * stxp t0, lo, hi, [base]
+ * cbnz t0, 1b
+ */
+ tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1);
+ ll = TCG_REG_TMP0;
+ lh = TCG_REG_TMP1;
+ sl = datalo;
+ sh = datahi;
+ }
+
+ tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base);
+ tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base);
+ tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2);
+
+ if (use_pair) {
+ /* "b .+8", branching across the one insn of use_pair. */
+ tcg_out_insn(s, 3206, B, 2);
+ reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr));
+ }
+ }
+
+ if (use_pair) {
+ if (is_ld) {
+ tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0);
+ } else {
+ tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0);
+ }
+ }
+
+ if (ldst) {
+ ldst->type = TCG_TYPE_I128;
+ ldst->datalo_reg = datalo;
+ ldst->datahi_reg = datahi;
+ ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
+ }
+}
+
static const tcg_insn_unit *tb_ret_addr;
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
@@ -1847,7 +1966,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
set_jmp_insn_offset(s, which);
tcg_out32(s, I3206_B);
- tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
+ tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
set_jmp_reset_offset(s, which);
}
@@ -1866,7 +1985,7 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
ptrdiff_t i_offset = i_addr - jmp_rx;
/* Note that we asserted this in range in tcg_out_goto_tb. */
- insn = deposit32(I3305_LDR | TCG_REG_TMP, 5, 19, i_offset >> 2);
+ insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
}
qatomic_set((uint32_t *)jmp_rw, insn);
flush_idcache_range(jmp_rx, jmp_rw, 4);
@@ -2060,13 +2179,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_rem_i64:
case INDEX_op_rem_i32:
- tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP, a1, a2);
- tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1);
+ tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2);
+ tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
break;
case INDEX_op_remu_i64:
case INDEX_op_remu_i32:
- tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP, a1, a2);
- tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1);
+ tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2);
+ tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
break;
case INDEX_op_shl_i64:
@@ -2110,8 +2229,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
if (c2) {
tcg_out_rotl(s, ext, a0, a1, a2);
} else {
- tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP, TCG_REG_XZR, a2);
- tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP);
+ tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2);
+ tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0);
}
break;
@@ -2161,6 +2280,14 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_st_a64_i64:
tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
break;
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true);
+ break;
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false);
+ break;
case INDEX_op_bswap64_i64:
tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1);
@@ -2517,8 +2644,8 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
break;
}
}
- tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
- a2 = TCG_VEC_TMP;
+ tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0);
+ a2 = TCG_VEC_TMP0;
}
if (is_scalar) {
insn = cmp_scalar_insn[cond];
@@ -2799,12 +2926,18 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_ld_a64_i32:
case INDEX_op_qemu_ld_a32_i64:
case INDEX_op_qemu_ld_a64_i64:
- return C_O1_I1(r, l);
+ return C_O1_I1(r, r);
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ return C_O2_I1(r, r, r);
case INDEX_op_qemu_st_a32_i32:
case INDEX_op_qemu_st_a64_i32:
case INDEX_op_qemu_st_a32_i64:
case INDEX_op_qemu_st_a64_i64:
- return C_O0_I2(lZ, l);
+ return C_O0_I2(rZ, r);
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ return C_O0_I3(rZ, rZ, r);
case INDEX_op_deposit_i32:
case INDEX_op_deposit_i64:
@@ -2900,9 +3033,11 @@ static void tcg_target_init(TCGContext *s)
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
- tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
- tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1);
+ tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2);
+ tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
}
/* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index d5f7614..ce64de0 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -16,7 +16,6 @@
#include "host/cpuinfo.h"
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
typedef enum {
@@ -131,7 +130,16 @@ typedef enum {
#define TCG_TARGET_HAS_muluh_i64 1
#define TCG_TARGET_HAS_mulsh_i64 1
-#define TCG_TARGET_HAS_qemu_ldst_i128 0
+/*
+ * Without FEAT_LSE2, we must use LDXP+STXP to implement atomic 128-bit load,
+ * which requires writable pages. We must defer to the helper for user-only,
+ * but in system mode all ram is writable for the host.
+ */
+#ifdef CONFIG_USER_ONLY
+#define TCG_TARGET_HAS_qemu_ldst_i128 have_lse2
+#else
+#define TCG_TARGET_HAS_qemu_ldst_i128 1
+#endif
#define TCG_TARGET_HAS_v64 1
#define TCG_TARGET_HAS_v128 1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 65efc53..c649db7 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -31,7 +31,6 @@ extern int arm_arch;
#define use_armv7_instructions (__ARM_ARCH >= 7 || arm_arch >= 7)
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
#define MAX_CODE_GEN_BUFFER_SIZE UINT32_MAX
typedef enum {
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index bfe9d98..ae54e5f 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -91,6 +91,8 @@ static const int tcg_target_reg_alloc_order[] = {
#endif
};
+#define TCG_TMP_VEC TCG_REG_XMM5
+
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
@@ -319,6 +321,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
+#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
+#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
@@ -1753,7 +1757,21 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
- return have_movbe;
+ TCGAtomAlign aa;
+
+ if (!have_movbe) {
+ return false;
+ }
+ if ((memop & MO_SIZE) < MO_128) {
+ return true;
+ }
+
+ /*
+ * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
+ * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
+ */
+ aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
+ return aa.atom < MO_128;
}
/*
@@ -1781,6 +1799,30 @@ static const TCGLdstHelperParam ldst_helper_param = {
static const TCGLdstHelperParam ldst_helper_param = { };
#endif
+static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
+ TCGReg l, TCGReg h, TCGReg v)
+{
+ int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
+
+ /* vmov{d,q} %v, %l */
+ tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
+ /* vpextr{d,q} $1, %v, %h */
+ tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
+ tcg_out8(s, 1);
+}
+
+static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
+ TCGReg v, TCGReg l, TCGReg h)
+{
+ int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
+
+ /* vmov{d,q} %l, %v */
+ tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
+ /* vpinsr{d,q} $1, %h, %v, %v */
+ tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
+ tcg_out8(s, 1);
+}
+
/*
* Generate code for the slow path for a load at the end of block
*/
@@ -1870,6 +1912,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
+ MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
#ifdef CONFIG_SOFTMMU
@@ -1880,7 +1923,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
*h = x86_guest_base;
#endif
h->base = addrlo;
- h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
+ h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
@@ -1890,7 +1933,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGType tlbtype = TCG_TYPE_I32;
int trexw = 0, hrexw = 0, tlbrexw = 0;
unsigned mem_index = get_mmuidx(oi);
- unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1 << s_bits) - 1;
int tlb_mask;
@@ -2070,6 +2112,72 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;
+
+ case MO_128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+
+ /*
+ * Without 16-byte atomicity, use integer regs.
+ * That is where we want the data, and it allows bswaps.
+ */
+ if (h.aa.atom < MO_128) {
+ if (use_movbe) {
+ TCGReg t = datalo;
+ datalo = datahi;
+ datahi = t;
+ }
+ if (h.base == datalo || h.index == datalo) {
+ tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
+ datalo, datahi, 0);
+ tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
+ datahi, datahi, 8);
+ } else {
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
+ h.base, h.index, 0, h.ofs + 8);
+ }
+ break;
+ }
+
+ /*
+ * With 16-byte atomicity, a vector load is required.
+ * If we already have 16-byte alignment, then VMOVDQA always works.
+ * Else if VMOVDQU has atomicity with dynamic alignment, use that.
+ * Else we require a runtime test for alignment for VMOVDQA;
+ * use VMOVDQU on the unaligned nonatomic path for simplicity.
+ */
+ if (h.aa.align >= MO_128) {
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ } else {
+ TCGLabel *l1 = gen_new_label();
+ TCGLabel *l2 = gen_new_label();
+
+ tcg_out_testi(s, h.base, 15);
+ tcg_out_jxx(s, JCC_JNE, l1, true);
+
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_jxx(s, JCC_JMP, l2, true);
+
+ tcg_out_label(s, l1);
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_label(s, l2);
+ }
+ tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
+ break;
+
default:
g_assert_not_reached();
}
@@ -2140,6 +2248,63 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;
+
+ case MO_128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+
+ /*
+ * Without 16-byte atomicity, use integer regs.
+ * That is where we have the data, and it allows bswaps.
+ */
+ if (h.aa.atom < MO_128) {
+ if (use_movbe) {
+ TCGReg t = datalo;
+ datalo = datahi;
+ datahi = t;
+ }
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
+ h.base, h.index, 0, h.ofs + 8);
+ break;
+ }
+
+ /*
+ * With 16-byte atomicity, a vector store is required.
+ * If we already have 16-byte alignment, then VMOVDQA always works.
+ * Else if VMOVDQU has atomicity with dynamic alignment, use that.
+ * Else we require a runtime test for alignment for VMOVDQA;
+ * use VMOVDQU on the unaligned nonatomic path for simplicity.
+ */
+ tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
+ if (h.aa.align >= MO_128) {
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ } else {
+ TCGLabel *l1 = gen_new_label();
+ TCGLabel *l2 = gen_new_label();
+
+ tcg_out_testi(s, h.base, 15);
+ tcg_out_jxx(s, JCC_JNE, l1, true);
+
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_jxx(s, JCC_JMP, l2, true);
+
+ tcg_out_label(s, l1);
+ tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
+ TCG_TMP_VEC, 0,
+ h.base, h.index, 0, h.ofs);
+ tcg_out_label(s, l2);
+ }
+ break;
+
default:
g_assert_not_reached();
}
@@ -2470,6 +2635,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
+ break;
case INDEX_op_qemu_st_a64_i32:
case INDEX_op_qemu_st8_a64_i32:
@@ -2496,6 +2666,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
+ break;
OP_32_64(mulu2):
tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
@@ -3193,6 +3368,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a64_i64:
return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ return C_O2_I1(r, r, L);
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ return C_O0_I3(L, L, L);
+
case INDEX_op_brcond2_i32:
return C_O0_I4(r, r, ri, ri);
@@ -3962,6 +4146,7 @@ static void tcg_target_init(TCGContext *s)
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
+ tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
/* These are call saved, and we don't save them, so don't use them. */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 0106946..1468f8e 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -28,7 +28,6 @@
#include "host/cpuinfo.h"
#define TCG_TARGET_INSN_UNIT_SIZE 1
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
#ifdef __x86_64__
# define TCG_TARGET_REG_BITS 64
@@ -118,7 +117,6 @@ typedef enum {
#define have_avx1 (cpuinfo & CPUINFO_AVX1)
#define have_avx2 (cpuinfo & CPUINFO_AVX2)
#define have_movbe (cpuinfo & CPUINFO_MOVBE)
-#define have_atomic16 (cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
/*
* There are interesting instructions in AVX512, so long as we have AVX512VL,
@@ -202,7 +200,8 @@ typedef enum {
#define TCG_TARGET_HAS_qemu_st8_i32 1
#endif
-#define TCG_TARGET_HAS_qemu_ldst_i128 0
+#define TCG_TARGET_HAS_qemu_ldst_i128 \
+ (TCG_TARGET_REG_BITS == 64 && (cpuinfo & CPUINFO_ATOMIC_VMOVDQA))
/* We do not support older SSE systems, only beginning with AVX1. */
#define TCG_TARGET_HAS_v64 have_avx1
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 8fbb6c6..e4806f6 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -36,7 +36,6 @@
#endif
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
diff --git a/tcg/ppc/tcg-target-con-set.h b/tcg/ppc/tcg-target-con-set.h
index f206b29..bbd7b21 100644
--- a/tcg/ppc/tcg-target-con-set.h
+++ b/tcg/ppc/tcg-target-con-set.h
@@ -14,6 +14,7 @@ C_O0_I2(r, r)
C_O0_I2(r, ri)
C_O0_I2(v, r)
C_O0_I3(r, r, r)
+C_O0_I3(o, m, r)
C_O0_I4(r, r, ri, ri)
C_O0_I4(r, r, r, r)
C_O1_I1(r, r)
@@ -34,6 +35,7 @@ C_O1_I3(v, v, v, v)
C_O1_I4(r, r, ri, rZ, rZ)
C_O1_I4(r, r, r, ri, ri)
C_O2_I1(r, r, r)
+C_O2_I1(o, m, r)
C_O2_I2(r, r, r, r)
C_O2_I4(r, r, rI, rZM, r, r)
C_O2_I4(r, r, r, r, rI, rZM)
diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h
index 094613c..2084690 100644
--- a/tcg/ppc/tcg-target-con-str.h
+++ b/tcg/ppc/tcg-target-con-str.h
@@ -9,6 +9,7 @@
* REGS(letter, register_mask)
*/
REGS('r', ALL_GENERAL_REGS)
+REGS('o', ALL_GENERAL_REGS & 0xAAAAAAAAu) /* odd registers */
REGS('v', ALL_VECTOR_REGS)
/*
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index d4269df..d47a9e3 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -295,25 +295,27 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
#define B OPCD( 18)
#define BC OPCD( 16)
+
#define LBZ OPCD( 34)
#define LHZ OPCD( 40)
#define LHA OPCD( 42)
#define LWZ OPCD( 32)
#define LWZUX XO31( 55)
-#define STB OPCD( 38)
-#define STH OPCD( 44)
-#define STW OPCD( 36)
-
-#define STD XO62( 0)
-#define STDU XO62( 1)
-#define STDX XO31(149)
-
#define LD XO58( 0)
#define LDX XO31( 21)
#define LDU XO58( 1)
#define LDUX XO31( 53)
#define LWA XO58( 2)
#define LWAX XO31(341)
+#define LQ OPCD( 56)
+
+#define STB OPCD( 38)
+#define STH OPCD( 44)
+#define STW OPCD( 36)
+#define STD XO62( 0)
+#define STDU XO62( 1)
+#define STDX XO31(149)
+#define STQ XO62( 2)
#define ADDIC OPCD( 12)
#define ADDI OPCD( 14)
@@ -2020,7 +2022,18 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
- return true;
+ TCGAtomAlign aa;
+
+ if ((memop & MO_SIZE) <= MO_64) {
+ return true;
+ }
+
+ /*
+ * Reject 16-byte memop with 16-byte atomicity,
+ * but do allow a pair of 64-bit operations.
+ */
+ aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
+ return aa.atom <= MO_64;
}
/*
@@ -2035,7 +2048,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
- MemOp a_bits;
+ MemOp a_bits, s_bits;
/*
* Book II, Section 1.4, Single-Copy Atomicity, specifies:
@@ -2047,10 +2060,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
* As of 3.0, "the non-atomic access is performed as described in
* the corresponding list", which matches MO_ATOM_SUBALIGN.
*/
+ s_bits = opc & MO_SIZE;
h->aa = atom_and_align_for_opc(s, opc,
have_isa_3_00 ? MO_ATOM_SUBALIGN
: MO_ATOM_IFALIGN,
- false);
+ s_bits == MO_128);
a_bits = h->aa.align;
#ifdef CONFIG_SOFTMMU
@@ -2060,7 +2074,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
int table_off = fast_off + offsetof(CPUTLBDescFast, table);
- unsigned s_bits = opc & MO_SIZE;
ldst = new_ldst_label(s);
ldst->is_ld = is_ld;
@@ -2303,6 +2316,60 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
}
}
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
+ TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{ /* Emit a 128-bit qemu_ld (is_ld) or qemu_st using the datalo/datahi register pair. */
+ TCGLabelQemuLdst *ldst;
+ HostAddress h;
+ bool need_bswap;
+ uint32_t insn;
+ TCGReg index;
+
+ ldst = prepare_host_addr(s, &h, addr_reg, -1, oi, is_ld); /* fills h; result may be NULL, checked below */
+
+ /* Compose the final address, as LQ/STQ have no indexing. */
+ index = h.index;
+ if (h.base != 0) { /* presumably base 0 means "no base register"; otherwise sum both parts */
+ index = TCG_REG_TMP1;
+ tcg_out32(s, ADD | TAB(index, h.base, h.index)); /* TMP1 = base + index */
+ }
+ need_bswap = get_memop(oi) & MO_BSWAP;
+
+ if (h.aa.atom == MO_128) { /* 16-byte atomicity required: one LQ/STQ */
+ tcg_debug_assert(!need_bswap); /* no byte-swapped form is emitted on this path */
+ tcg_debug_assert(datalo & 1); /* low half must be in an odd register ... */
+ tcg_debug_assert(datahi == datalo - 1); /* ... and high half in the even register just below it */
+ insn = is_ld ? LQ : STQ;
+ tcg_out32(s, insn | TAI(datahi, index, 0)); /* only datahi is encoded in the insn */
+ } else { /* lesser atomicity: two 8-byte accesses */
+ TCGReg d1, d2;
+
+ if (HOST_BIG_ENDIAN ^ need_bswap) { /* d1 is accessed at offset 0, d2 at offset 8 */
+ d1 = datahi, d2 = datalo;
+ } else {
+ d1 = datalo, d2 = datahi;
+ }
+
+ if (need_bswap) {
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 8); /* R0 = 8, index operand of the second access */
+ insn = is_ld ? LDBRX : STDBRX;
+ tcg_out32(s, insn | TAB(d1, 0, index)); /* first doubleword, at index */
+ tcg_out32(s, insn | TAB(d2, index, TCG_REG_R0)); /* second doubleword, at index + R0 */
+ } else {
+ insn = is_ld ? LD : STD;
+ tcg_out32(s, insn | TAI(d1, index, 0));
+ tcg_out32(s, insn | TAI(d2, index, 8));
+ }
+ }
+
+ if (ldst) { /* label was allocated: record type, data registers and resume address */
+ ldst->type = TCG_TYPE_I128;
+ ldst->datalo_reg = datalo;
+ ldst->datahi_reg = datahi;
+ ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); /* address just past the code emitted above */
+ }
+}
+
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
int i;
@@ -2860,6 +2927,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
args[4], TCG_TYPE_I64);
}
break;
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true);
+ break;
case INDEX_op_qemu_st_a64_i32:
if (TCG_TARGET_REG_BITS == 32) {
@@ -2889,6 +2961,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
args[4], TCG_TYPE_I64);
}
break;
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+ tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false);
+ break;
case INDEX_op_setcond_i32:
tcg_out_setcond(s, TCG_TYPE_I32, args[3], args[0], args[1], args[2],
@@ -3722,6 +3799,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a64_i64:
return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I4(r, r, r, r);
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ return C_O2_I1(o, m, r);
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ return C_O0_I3(o, m, r);
+
case INDEX_op_add_vec:
case INDEX_op_sub_vec:
case INDEX_op_mul_vec:
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 0914380..40f20b0 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -34,7 +34,6 @@
#define TCG_TARGET_NB_REGS 64
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
typedef enum {
TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3,
@@ -149,7 +148,8 @@ extern bool have_vsx;
#define TCG_TARGET_HAS_mulsh_i64 1
#endif
-#define TCG_TARGET_HAS_qemu_ldst_i128 0
+#define TCG_TARGET_HAS_qemu_ldst_i128 \
+ (TCG_TARGET_REG_BITS == 64 && have_isa_2_07)
/*
* While technically Altivec could support V64, it has no 64-bit store
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index 62fe61a..54fdff0 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -35,7 +35,6 @@
#define TCG_TARGET_REG_BITS 64
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 20
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index ecc079b..cbad91b 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -14,6 +14,7 @@ C_O0_I2(r, r)
C_O0_I2(r, ri)
C_O0_I2(r, rA)
C_O0_I2(v, r)
+C_O0_I3(o, m, r)
C_O1_I1(r, r)
C_O1_I1(v, r)
C_O1_I1(v, v)
@@ -36,6 +37,7 @@ C_O1_I2(v, v, v)
C_O1_I3(v, v, v, v)
C_O1_I4(r, r, ri, rI, r)
C_O1_I4(r, r, rA, rI, r)
+C_O2_I1(o, m, r)
C_O2_I2(o, m, 0, r)
C_O2_I2(o, m, r, r)
C_O2_I3(o, m, 0, 1, r)
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index dfaa34c..503126c 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -243,6 +243,7 @@ typedef enum S390Opcode {
RXY_LLGF = 0xe316,
RXY_LLGH = 0xe391,
RXY_LMG = 0xeb04,
+ RXY_LPQ = 0xe38f,
RXY_LRV = 0xe31e,
RXY_LRVG = 0xe30f,
RXY_LRVH = 0xe31f,
@@ -253,6 +254,7 @@ typedef enum S390Opcode {
RXY_STG = 0xe324,
RXY_STHY = 0xe370,
RXY_STMG = 0xeb24,
+ RXY_STPQ = 0xe38e,
RXY_STRV = 0xe33e,
RXY_STRVG = 0xe32f,
RXY_STRVH = 0xe33f,
@@ -1577,7 +1579,18 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
- return true;
+ TCGAtomAlign aa;
+
+ if ((memop & MO_SIZE) <= MO_64) {
+ return true;
+ }
+
+ /*
+ * Reject 16-byte memop with 16-byte atomicity,
+ * but do allow a pair of 64-bit operations.
+ */
+ aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
+ return aa.atom <= MO_64;
}
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data,
@@ -1734,13 +1747,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
+ MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
- h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
+ h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
- unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1 << s_bits) - 1;
int mem_index = get_mmuidx(oi);
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
@@ -1865,6 +1878,80 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
}
}
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
+ TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{ /* Emit a 128-bit qemu_ld (is_ld) or qemu_st using the datalo/datahi register pair. */
+ TCGLabel *l1 = NULL, *l2 = NULL; /* only allocated when a runtime alignment test is emitted */
+ TCGLabelQemuLdst *ldst;
+ HostAddress h;
+ bool need_bswap;
+ bool use_pair;
+ S390Opcode insn;
+
+ ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld); /* fills h; result may be NULL, checked below */
+
+ use_pair = h.aa.atom < MO_128; /* no 16-byte atomicity needed: two 8-byte insns suffice */
+ need_bswap = get_memop(oi) & MO_BSWAP;
+
+ if (!use_pair) {
+ /*
+ * Atomicity requires we use LPQ. If we've already checked for
+ * 16-byte alignment, that's all we need. If we arrive with
+ * lesser alignment, we have determined that less than 16-byte
+ * alignment can be satisfied with two 8-byte loads.
+ */
+ if (h.aa.align < MO_128) { /* alignment not guaranteed: test it at run time */
+ use_pair = true; /* also emit the pair sequence below as the unaligned path */
+ l1 = gen_new_label();
+ l2 = gen_new_label();
+
+ tcg_out_insn(s, RI, TMLL, addr_reg, 15); /* mask 15 selects the low 4 bits of addr_reg */
+ tgen_branch(s, 7, l1); /* CC in {1,2,3} */
+ }
+
+ tcg_debug_assert(!need_bswap); /* no byte-swapped form is emitted on this path */
+ tcg_debug_assert(datalo & 1); /* low half must be in an odd register ... */
+ tcg_debug_assert(datahi == datalo - 1); /* ... and high half in the even register just below it */
+ insn = is_ld ? RXY_LPQ : RXY_STPQ;
+ tcg_out_insn_RXY(s, insn, datahi, h.base, h.index, h.disp); /* only datahi is passed as the register operand */
+
+ if (use_pair) { /* runtime test was emitted: jump over the pair sequence */
+ tgen_branch(s, S390_CC_ALWAYS, l2);
+ tcg_out_label(s, l1); /* target of the branch taken after TMLL */
+ }
+ }
+ if (use_pair) {
+ TCGReg d1, d2; /* d1 is accessed at disp, d2 at disp + 8 */
+
+ if (need_bswap) {
+ d1 = datalo, d2 = datahi;
+ insn = is_ld ? RXY_LRVG : RXY_STRVG;
+ } else {
+ d1 = datahi, d2 = datalo;
+ insn = is_ld ? RXY_LG : RXY_STG;
+ }
+
+ if (h.base == d1 || h.index == d1) { /* d1 overlaps an address register: form the address in TCG_TMP0 first */
+ tcg_out_insn(s, RXY, LAY, TCG_TMP0, h.base, h.index, h.disp);
+ h.base = TCG_TMP0;
+ h.index = TCG_REG_NONE;
+ h.disp = 0;
+ }
+ tcg_out_insn_RXY(s, insn, d1, h.base, h.index, h.disp);
+ tcg_out_insn_RXY(s, insn, d2, h.base, h.index, h.disp + 8);
+ }
+ if (l2) { /* join point for the LPQ/STPQ path */
+ tcg_out_label(s, l2);
+ }
+
+ if (ldst) { /* label was allocated: record type, data registers and resume address */
+ ldst->type = TCG_TYPE_I128;
+ ldst->datalo_reg = datalo;
+ ldst->datahi_reg = datahi;
+ ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); /* address just past the code emitted above */
+ }
+}
+
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
/* Reuse the zeroing that exists for goto_ptr. */
@@ -2226,6 +2313,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_st_a64_i64:
tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I64);
break;
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true);
+ break;
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false);
+ break;
case INDEX_op_ld16s_i64:
tcg_out_mem(s, 0, RXY_LGH, args[0], args[1], TCG_REG_NONE, args[2]);
@@ -3107,6 +3202,12 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a32_i32:
case INDEX_op_qemu_st_a64_i32:
return C_O0_I2(r, r);
+ case INDEX_op_qemu_ld_a32_i128:
+ case INDEX_op_qemu_ld_a64_i128:
+ return C_O2_I1(o, m, r);
+ case INDEX_op_qemu_st_a32_i128:
+ case INDEX_op_qemu_st_a64_i128:
+ return C_O0_I3(o, m, r);
case INDEX_op_deposit_i32:
case INDEX_op_deposit_i64:
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index 170007b..9a40500 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -26,7 +26,6 @@
#define S390_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 2
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 19
/* We have a +- 4GB range on the branches; leave some slop. */
#define MAX_CODE_GEN_BUFFER_SIZE (3 * GiB)
@@ -140,7 +139,7 @@ extern uint64_t s390_facilities[3];
#define TCG_TARGET_HAS_muluh_i64 0
#define TCG_TARGET_HAS_mulsh_i64 0
-#define TCG_TARGET_HAS_qemu_ldst_i128 0
+#define TCG_TARGET_HAS_qemu_ldst_i128 1
#define TCG_TARGET_HAS_v64 HAVE_FACILITY(VECTOR)
#define TCG_TARGET_HAS_v128 HAVE_FACILITY(VECTOR)
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
index 31c5537..d454278 100644
--- a/tcg/sparc64/tcg-target.h
+++ b/tcg/sparc64/tcg-target.h
@@ -26,7 +26,6 @@
#define SPARC_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index ac30d48..2352ca4 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -5736,8 +5736,8 @@ static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *ldst,
mov[0].dst = ldst->datalo_reg;
mov[0].src =
tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, HOST_BIG_ENDIAN);
- mov[0].dst_type = TCG_TYPE_I32;
- mov[0].src_type = TCG_TYPE_I32;
+ mov[0].dst_type = TCG_TYPE_REG;
+ mov[0].src_type = TCG_TYPE_REG;
mov[0].src_ext = TCG_TARGET_REG_BITS == 32 ? MO_32 : MO_64;
mov[1].dst = ldst->datahi_reg;
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 28dc6d5..60a6ed6 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -42,7 +42,6 @@
#define TCG_TARGET_INTERPRETER 1
#define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
#if UINTPTR_MAX == UINT32_MAX
diff --git a/tests/decode/check.sh b/tests/decode/check.sh
deleted file mode 100755
index 95445a0..0000000
--- a/tests/decode/check.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-# This work is licensed under the terms of the GNU LGPL, version 2 or later.
-# See the COPYING.LIB file in the top-level directory.
-
-PYTHON=$1
-DECODETREE=$2
-E=0
-
-# All of these tests should produce errors
-for i in err_*.decode; do
- if $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then
- # Pass, aka failed to fail.
- echo FAIL: $i 1>&2
- E=1
- fi
-done
-
-for i in succ_*.decode; do
- if ! $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then
- echo FAIL:$i 1>&2
- fi
-done
-
-exit $E
diff --git a/tests/decode/err_field10.decode b/tests/decode/err_field10.decode
new file mode 100644
index 0000000..3e672b7
--- /dev/null
+++ b/tests/decode/err_field10.decode
@@ -0,0 +1,7 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose formats which refer to undefined fields
+%field1 field2:3
+@fmt ........ ........ ........ ........ %field1
+insn 00000000 00000000 00000000 00000000 @fmt
diff --git a/tests/decode/err_field7.decode b/tests/decode/err_field7.decode
new file mode 100644
index 0000000..51fad7c
--- /dev/null
+++ b/tests/decode/err_field7.decode
@@ -0,0 +1,7 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose fields whose definitions form a loop
+%field1 field2:3
+%field2 field1:4
+insn 00000000 00000000 00000000 00000000 %field1 %field2
diff --git a/tests/decode/err_field8.decode b/tests/decode/err_field8.decode
new file mode 100644
index 0000000..cc47c08
--- /dev/null
+++ b/tests/decode/err_field8.decode
@@ -0,0 +1,8 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose patterns which refer to undefined fields
+&f1 f1 a
+%field1 field2:3
+@fmt ........ ........ ........ .... a:4 &f1
+insn 00000000 00000000 00000000 0000 .... @fmt f1=%field1
diff --git a/tests/decode/err_field9.decode b/tests/decode/err_field9.decode
new file mode 100644
index 0000000..e7361d5
--- /dev/null
+++ b/tests/decode/err_field9.decode
@@ -0,0 +1,14 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose fields where the format refers to a field defined in the
+# pattern and the pattern refers to a field defined in the format.
+# This is theoretically not impossible to implement, but is not
+# supported by the script at this time.
+&abcd a b c d
+%refa a:3
+%refc c:4
+# Format defines 'c' and sets 'b' to an indirect ref to 'a'
+@fmt ........ ........ ........ c:8 &abcd b=%refa
+# Pattern defines 'a' and sets 'd' to an indirect ref to 'c'
+insn 00000000 00000000 00000000 ........ @fmt d=%refc a=6
diff --git a/tests/decode/meson.build b/tests/decode/meson.build
new file mode 100644
index 0000000..38a0629
--- /dev/null
+++ b/tests/decode/meson.build
@@ -0,0 +1,64 @@
+err_tests = [ # inputs run with --test-for-error below, i.e. decodetree is expected to reject them
+ 'err_argset1.decode',
+ 'err_argset2.decode',
+ 'err_field1.decode',
+ 'err_field2.decode',
+ 'err_field3.decode',
+ 'err_field4.decode',
+ 'err_field5.decode',
+ 'err_field6.decode',
+ 'err_field7.decode',
+ 'err_field8.decode',
+ 'err_field9.decode',
+ 'err_field10.decode',
+ 'err_init1.decode',
+ 'err_init2.decode',
+ 'err_init3.decode',
+ 'err_init4.decode',
+ 'err_overlap1.decode',
+ 'err_overlap2.decode',
+ 'err_overlap3.decode',
+ 'err_overlap4.decode',
+ 'err_overlap5.decode',
+ 'err_overlap6.decode',
+ 'err_overlap7.decode',
+ 'err_overlap8.decode',
+ 'err_overlap9.decode',
+ 'err_pattern_group_empty.decode',
+ 'err_pattern_group_ident1.decode',
+ 'err_pattern_group_ident2.decode',
+ 'err_pattern_group_nest1.decode',
+ 'err_pattern_group_nest2.decode',
+ 'err_pattern_group_nest3.decode',
+ 'err_pattern_group_overlap1.decode',
+ 'err_width1.decode',
+ 'err_width2.decode',
+ 'err_width3.decode',
+ 'err_width4.decode',
+]
+
+succ_tests = [ # inputs run without --test-for-error, i.e. decodetree is expected to accept them
+ 'succ_argset_type1.decode',
+ 'succ_function.decode',
+ 'succ_ident1.decode',
+ 'succ_named_field.decode',
+ 'succ_pattern_group_nest1.decode',
+ 'succ_pattern_group_nest2.decode',
+ 'succ_pattern_group_nest3.decode',
+ 'succ_pattern_group_nest4.decode',
+]
+
+suite = 'decodetree' # every test below is registered under this suite name
+decodetree = find_program(meson.project_source_root() / 'scripts/decodetree.py')
+
+foreach t: err_tests # one test per file, so each input is reported individually
+ test(fs.replace_suffix(t, ''), # test name: file name with its suffix removed; 'fs' is not defined in this file, presumably set by a parent meson.build
+ decodetree, args: ['-o', '/dev/null', '--test-for-error', files(t)],
+ suite: suite)
+endforeach
+
+foreach t: succ_tests # same invocation as above, minus --test-for-error
+ test(fs.replace_suffix(t, ''),
+ decodetree, args: ['-o', '/dev/null', files(t)],
+ suite: suite)
+endforeach
diff --git a/tests/decode/succ_named_field.decode b/tests/decode/succ_named_field.decode
new file mode 100644
index 0000000..e64b3f9
--- /dev/null
+++ b/tests/decode/succ_named_field.decode
@@ -0,0 +1,19 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# field using a named_field
+%imm_sz 8:8 sz:3
+insn 00000000 00000000 ........ 00000000 imm_sz=%imm_sz sz=1
+
+# Ditto, via a format. Here a field in the format
+# references a named field defined in the insn pattern:
+&imm_a imm alpha
+%foo 0:16 alpha:4
+@foo 00000001 ........ ........ ........ &imm_a imm=%foo
+i1 ........ 00000000 ........ ........ @foo alpha=1
+i2 ........ 00000001 ........ ........ @foo alpha=2
+
+# Here the named field is defined in the format and referenced
+# from the insn pattern:
+@bar 00000010 ........ ........ ........ &imm_a alpha=4
+i3 ........ 00000000 ........ ........ @bar imm=%foo
diff --git a/tests/meson.build b/tests/meson.build
index 8e318ec..083f299 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -74,10 +74,7 @@ if have_tools and have_vhost_user and 'CONFIG_LINUX' in config_host
dependencies: [qemuutil, vhost_user])
endif
-test('decodetree', sh,
- args: [ files('decode/check.sh'), config_host['PYTHON'], files('../scripts/decodetree.py') ],
- workdir: meson.current_source_dir() / 'decode',
- suite: 'decodetree')
+subdir('decode')
if 'CONFIG_TCG' in config_all
subdir('fp')