diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2023-05-30 13:25:18 -0700 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2023-05-30 13:25:18 -0700 |
commit | 51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c (patch) | |
tree | 50fbedc5a85acaa17460515926605111b62b8f3b | |
parent | 7f027ee0ce1f79302acd7330d796fb7a9e2529b1 (diff) | |
parent | 276d77de503e8f5f5cbd3f7d94302ca12d1d982e (diff) | |
download | qemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.zip qemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.tar.gz qemu-51bdb0b57a2d9e84d6915fbae7b5d76c8820cf3c.tar.bz2 |
Merge tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu into staging
Improvements to 128-bit atomics:
- Separate __int128_t type and arithmetic detection
- Support 128-bit load/store in backend for i386, aarch64, ppc64, s390x
- Accelerate atomics via host/include/
Decodetree:
- Add named field syntax
- Move tests to meson
# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmR2R10dHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/bsgf/XLi8q+ITyoEAKwG4
# 6ML7DktLAdIs9Euah9twqe16U0BM0YzpKfymBfVVBKKaIa0524N4ZKIT3h6EeJo+
# f+ultqrpsnH+aQh4wc3ZCkEvRdhzhFT8VcoRTunJuJrbL3Y8n2ZSgODUL2a0tahT
# Nn+zEPm8rzQanSKQHq5kyNBLpgTUKjc5wKfvy/WwttnFmkTnqzcuEA6nPVOVwOHC
# lZBQCByIQWsHfFHUVJFvsFzBQbm0mAiW6FNKzPBkoXon0h/UZUI1lV+xXzgutFs+
# zR2O8IZwLYRu2wOWiTF8Nn2qQafkB3Dhwoq3JTEXhOqosOPExbIiWlsZDlPiKRJk
# bwmQlg==
# =XQMb
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 30 May 2023 11:58:37 AM PDT
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]
* tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu: (27 commits)
tests/decode: Add tests for various named-field cases
scripts/decodetree: Implement named field support
scripts/decodetree: Implement a topological sort
scripts/decodetree: Pass lvalue-formatter function to str_extract()
docs: Document decodetree named field syntax
tests/decode: Convert tests to meson
decodetree: Do not remove output_file from /dev
decodetree: Diagnose empty pattern group
decodetree: Fix recursion in prop_format and build_tree
decodetree: Add --test-for-error
tcg: Remove TCG_TARGET_TLB_DISPLACEMENT_BITS
accel/tcg: Add aarch64 store_atom_insert_al16
accel/tcg: Add aarch64 lse2 load_atom_extract_al16_or_al8
accel/tcg: Add x86_64 load_atom_extract_al16_or_al8
accel/tcg: Extract store_atom_insert_al16 to host header
accel/tcg: Extract load_atom_extract_al16_or_al8 to host header
tcg/s390x: Support 128-bit load/store
tcg/ppc: Support 128-bit load/store
tcg/aarch64: Support 128-bit load/store
tcg/aarch64: Simplify constraints on qemu_ld/st
...
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
38 files changed, 1312 insertions, 225 deletions
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc index 0f6b3f8..2514899 100644 --- a/accel/tcg/ldst_atomicity.c.inc +++ b/accel/tcg/ldst_atomicity.c.inc @@ -9,6 +9,9 @@ * See the COPYING file in the top-level directory. */ +#include "host/load-extract-al16-al8.h" +#include "host/store-insert-al16.h" + #ifdef CONFIG_ATOMIC64 # define HAVE_al8 true #else @@ -156,7 +159,7 @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv) * another process, because the fallback start_exclusive solution * provides no protection across processes. */ - if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) { + if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) { uint64_t *p = __builtin_assume_aligned(pv, 8); return *p; } @@ -191,7 +194,7 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv) * another process, because the fallback start_exclusive solution * provides no protection across processes. */ - if (!page_check_range(h2g(p), 16, PAGE_WRITE)) { + if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) { return *p; } #endif @@ -312,40 +315,6 @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra, } /** - * load_atom_extract_al16_or_al8: - * @p: host address - * @s: object size in bytes, @s <= 8. - * - * Load @s bytes from @p, when p % s != 0. If [p, p+s-1] does not - * cross an 16-byte boundary then the access must be 16-byte atomic, - * otherwise the access must be 8-byte atomic. - */ -static inline uint64_t ATTRIBUTE_ATOMIC128_OPT -load_atom_extract_al16_or_al8(void *pv, int s) -{ - uintptr_t pi = (uintptr_t)pv; - int o = pi & 7; - int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; - Int128 r; - - pv = (void *)(pi & ~7); - if (pi & 8) { - uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8); - uint64_t a = qatomic_read__nocheck(p8); - uint64_t b = qatomic_read__nocheck(p8 + 1); - - if (HOST_BIG_ENDIAN) { - r = int128_make128(b, a); - } else { - r = int128_make128(a, b); - } - } else { - r = atomic16_read_ro(pv); - } - return int128_getlo(int128_urshift(r, shr)); -} - -/** * load_atom_4_by_2: * @pv: host address * @@ -714,45 +683,6 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk) } /** - * store_atom_insert_al16: - * @p: host address - * @val: shifted value to store - * @msk: mask for value to store - * - * Atomically store @val to @p masked by @msk. - */ -static void ATTRIBUTE_ATOMIC128_OPT -store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk) -{ -#if defined(CONFIG_ATOMIC128) - __uint128_t *pu, old, new; - - /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */ - pu = __builtin_assume_aligned(ps, 16); - old = *pu; - do { - new = (old & ~msk.u) | val.u; - } while (!__atomic_compare_exchange_n(pu, &old, new, true, - __ATOMIC_RELAXED, __ATOMIC_RELAXED)); -#elif defined(CONFIG_CMPXCHG128) - __uint128_t *pu, old, new; - - /* - * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always - * defer to libatomic, so we must use __sync_*_compare_and_swap_16 - * and accept the sequential consistency that comes with it. - */ - pu = __builtin_assume_aligned(ps, 16); - do { - old = *pu; - new = (old & ~msk.u) | val.u; - } while (!__sync_bool_compare_and_swap_16(pu, old, new)); -#else - qemu_build_not_reached(); -#endif -} - -/** * store_bytes_leN: * @pv: host address * @size: number of bytes to store diff --git a/docs/devel/decodetree.rst b/docs/devel/decodetree.rst index 49ea50c..e3392aa 100644 --- a/docs/devel/decodetree.rst +++ b/docs/devel/decodetree.rst @@ -23,22 +23,42 @@ Fields Syntax:: - field_def := '%' identifier ( unnamed_field )* ( !function=identifier )? + field_def := '%' identifier ( field )* ( !function=identifier )? + field := unnamed_field | named_field unnamed_field := number ':' ( 's' ) number + named_field := identifier ':' ( 's' ) number For *unnamed_field*, the first number is the least-significant bit position of the field and the second number is the length of the field. If the 's' is -present, the field is considered signed. If multiple ``unnamed_fields`` are -present, they are concatenated. In this way one can define disjoint fields. +present, the field is considered signed. + +A *named_field* refers to some other field in the instruction pattern +or format. Regardless of the length of the other field where it is +defined, it will be inserted into this field with the specified +signedness and bit width. + +Field definitions that involve loops (i.e. where a field is defined +directly or indirectly in terms of itself) are errors. + +A format can include fields that refer to named fields that are +defined in the instruction pattern(s) that use the format. +Conversely, an instruction pattern can include fields that refer to +named fields that are defined in the format it uses. However you +cannot currently do both at once (i.e. pattern P uses format F; F has +a field A that refers to a named field B that is defined in P, and P +has a field C that refers to a named field D that is defined in F). + +If multiple ``fields`` are present, they are concatenated. +In this way one can define disjoint fields. If ``!function`` is specified, the concatenated result is passed through the named function, taking and returning an integral value. -One may use ``!function`` with zero ``unnamed_fields``. This case is called +One may use ``!function`` with zero ``fields``. This case is called a *parameter*, and the named function is only passed the ``DisasContext`` and returns an integral value extracted from there. -A field with no ``unnamed_fields`` and no ``!function`` is in error. +A field with no ``fields`` and no ``!function`` is in error. Field examples: @@ -56,6 +76,9 @@ Field examples: | %shimm8 5:s8 13:1 | expand_shimm8(sextract(i, 5, 8) << 1 | | | !function=expand_shimm8 | extract(i, 13, 1)) | +---------------------------+---------------------------------------------+ +| %sz_imm 10:2 sz:3 | expand_sz_imm(extract(i, 10, 2) << 3 | | +| !function=expand_sz_imm | extract(a->sz, 0, 3)) | ++---------------------------+---------------------------------------------+ Argument Sets ============= diff --git a/host/include/aarch64/host/load-extract-al16-al8.h b/host/include/aarch64/host/load-extract-al16-al8.h new file mode 100644 index 0000000..bd677c5 --- /dev/null +++ b/host/include/aarch64/host/load-extract-al16-al8.h @@ -0,0 +1,40 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Atomic extract 64 from 128-bit, AArch64 version. + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H +#define AARCH64_LOAD_EXTRACT_AL16_AL8_H + +#include "host/cpuinfo.h" +#include "tcg/debug-assert.h" + +/** + * load_atom_extract_al16_or_al8: + * @pv: host address + * @s: object size in bytes, @s <= 8. + * + * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not + * cross an 16-byte boundary then the access must be 16-byte atomic, + * otherwise the access must be 8-byte atomic. + */ +static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s) +{ + uintptr_t pi = (uintptr_t)pv; + __int128_t *ptr_align = (__int128_t *)(pi & ~7); + int shr = (pi & 7) * 8; + uint64_t l, h; + + /* + * With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned + * and single-copy atomic on the parts if 8-byte aligned. + * All we need do is align the pointer mod 8. + */ + tcg_debug_assert(HAVE_ATOMIC128_RO); + asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align)); + return (l >> shr) | (h << (-shr & 63)); +} + +#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */ diff --git a/host/include/aarch64/host/store-insert-al16.h b/host/include/aarch64/host/store-insert-al16.h new file mode 100644 index 0000000..1943155 --- /dev/null +++ b/host/include/aarch64/host/store-insert-al16.h @@ -0,0 +1,47 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Atomic store insert into 128-bit, AArch64 version. + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef AARCH64_STORE_INSERT_AL16_H +#define AARCH64_STORE_INSERT_AL16_H + +/** + * store_atom_insert_al16: + * @p: host address + * @val: shifted value to store + * @msk: mask for value to store + * + * Atomically store @val to @p masked by @msk. + */ +static inline void ATTRIBUTE_ATOMIC128_OPT +store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk) +{ + /* + * GCC only implements __sync* primitives for int128 on aarch64. + * We can do better without the barriers, and integrating the + * arithmetic into the load-exclusive/store-conditional pair. + */ + uint64_t tl, th, vl, vh, ml, mh; + uint32_t fail; + + qemu_build_assert(!HOST_BIG_ENDIAN); + vl = int128_getlo(val); + vh = int128_gethi(val); + ml = int128_getlo(msk); + mh = int128_gethi(msk); + + asm("0: ldxp %[l], %[h], %[mem]\n\t" + "bic %[l], %[l], %[ml]\n\t" + "bic %[h], %[h], %[mh]\n\t" + "orr %[l], %[l], %[vl]\n\t" + "orr %[h], %[h], %[vh]\n\t" + "stxp %w[f], %[l], %[h], %[mem]\n\t" + "cbnz %w[f], 0b\n" + : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th) + : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh)); +} + +#endif /* AARCH64_STORE_INSERT_AL16_H */ diff --git a/host/include/generic/host/load-extract-al16-al8.h b/host/include/generic/host/load-extract-al16-al8.h new file mode 100644 index 0000000..d955561 --- /dev/null +++ b/host/include/generic/host/load-extract-al16-al8.h @@ -0,0 +1,45 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Atomic extract 64 from 128-bit, generic version. + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H +#define HOST_LOAD_EXTRACT_AL16_AL8_H + +/** + * load_atom_extract_al16_or_al8: + * @pv: host address + * @s: object size in bytes, @s <= 8. + * + * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not + * cross an 16-byte boundary then the access must be 16-byte atomic, + * otherwise the access must be 8-byte atomic. + */ +static inline uint64_t ATTRIBUTE_ATOMIC128_OPT +load_atom_extract_al16_or_al8(void *pv, int s) +{ + uintptr_t pi = (uintptr_t)pv; + int o = pi & 7; + int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; + Int128 r; + + pv = (void *)(pi & ~7); + if (pi & 8) { + uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8); + uint64_t a = qatomic_read__nocheck(p8); + uint64_t b = qatomic_read__nocheck(p8 + 1); + + if (HOST_BIG_ENDIAN) { + r = int128_make128(b, a); + } else { + r = int128_make128(a, b); + } + } else { + r = atomic16_read_ro(pv); + } + return int128_getlo(int128_urshift(r, shr)); +} + +#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */ diff --git a/host/include/generic/host/store-insert-al16.h b/host/include/generic/host/store-insert-al16.h new file mode 100644 index 0000000..4a16621 --- /dev/null +++ b/host/include/generic/host/store-insert-al16.h @@ -0,0 +1,50 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Atomic store insert into 128-bit, generic version. + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef HOST_STORE_INSERT_AL16_H +#define HOST_STORE_INSERT_AL16_H + +/** + * store_atom_insert_al16: + * @p: host address + * @val: shifted value to store + * @msk: mask for value to store + * + * Atomically store @val to @p masked by @msk. + */ +static inline void ATTRIBUTE_ATOMIC128_OPT +store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk) +{ +#if defined(CONFIG_ATOMIC128) + __uint128_t *pu; + Int128Alias old, new; + + /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */ + pu = __builtin_assume_aligned(ps, 16); + old.u = *pu; + msk = int128_not(msk); + do { + new.s = int128_and(old.s, msk); + new.s = int128_or(new.s, val); + } while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +#else + Int128 old, new, cmp; + + ps = __builtin_assume_aligned(ps, 16); + old = *ps; + msk = int128_not(msk); + do { + cmp = old; + new = int128_and(old, msk); + new = int128_or(new, val); + old = atomic16_cmpxchg(ps, cmp, new); + } while (int128_ne(cmp, old)); +#endif +} + +#endif /* HOST_STORE_INSERT_AL16_H */ diff --git a/host/include/x86_64/host/atomic128-ldst.h b/host/include/x86_64/host/atomic128-ldst.h new file mode 100644 index 0000000..adc9332 --- /dev/null +++ b/host/include/x86_64/host/atomic128-ldst.h @@ -0,0 +1,68 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Load/store for 128-bit atomic operations, x86_64 version. + * + * Copyright (C) 2023 Linaro, Ltd. + * + * See docs/devel/atomics.rst for discussion about the guarantees each + * atomic primitive is meant to provide. + */ + +#ifndef AARCH64_ATOMIC128_LDST_H +#define AARCH64_ATOMIC128_LDST_H + +#ifdef CONFIG_INT128_TYPE +#include "host/cpuinfo.h" +#include "tcg/debug-assert.h" + +/* + * Through clang 16, with -mcx16, __atomic_load_n is incorrectly + * expanded to a read-write operation: lock cmpxchg16b. + */ + +#define HAVE_ATOMIC128_RO likely(cpuinfo & CPUINFO_ATOMIC_VMOVDQA) +#define HAVE_ATOMIC128_RW 1 + +static inline Int128 atomic16_read_ro(const Int128 *ptr) +{ + Int128Alias r; + + tcg_debug_assert(HAVE_ATOMIC128_RO); + asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr)); + + return r.s; +} + +static inline Int128 atomic16_read_rw(Int128 *ptr) +{ + __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16); + Int128Alias r; + + if (HAVE_ATOMIC128_RO) { + asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align)); + } else { + r.i = __sync_val_compare_and_swap_16(ptr_align, 0, 0); + } + return r.s; +} + +static inline void atomic16_set(Int128 *ptr, Int128 val) +{ + __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16); + Int128Alias new = { .s = val }; + + if (HAVE_ATOMIC128_RO) { + asm("vmovdqa %1, %0" : "=m"(*ptr_align) : "x" (new.i)); + } else { + __int128_t old; + do { + old = *ptr_align; + } while (!__sync_bool_compare_and_swap_16(ptr_align, old, new.i)); + } +} +#else +/* Provide QEMU_ERROR stubs. */ +#include "host/include/generic/host/atomic128-ldst.h" +#endif + +#endif /* AARCH64_ATOMIC128_LDST_H */ diff --git a/host/include/x86_64/host/load-extract-al16-al8.h b/host/include/x86_64/host/load-extract-al16-al8.h new file mode 100644 index 0000000..31b6fe8 --- /dev/null +++ b/host/include/x86_64/host/load-extract-al16-al8.h @@ -0,0 +1,50 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * Atomic extract 64 from 128-bit, x86_64 version. + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef X86_64_LOAD_EXTRACT_AL16_AL8_H +#define X86_64_LOAD_EXTRACT_AL16_AL8_H + +#ifdef CONFIG_INT128_TYPE +#include "host/cpuinfo.h" + +/** + * load_atom_extract_al16_or_al8: + * @pv: host address + * @s: object size in bytes, @s <= 8. + * + * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not + * cross an 16-byte boundary then the access must be 16-byte atomic, + * otherwise the access must be 8-byte atomic. + */ +static inline uint64_t ATTRIBUTE_ATOMIC128_OPT +load_atom_extract_al16_or_al8(void *pv, int s) +{ + uintptr_t pi = (uintptr_t)pv; + __int128_t *ptr_align = (__int128_t *)(pi & ~7); + int shr = (pi & 7) * 8; + Int128Alias r; + + /* + * ptr_align % 16 is now only 0 or 8. + * If the host supports atomic loads with VMOVDQU, then always use that, + * making the branch highly predictable. Otherwise we must use VMOVDQA + * when ptr_align % 16 == 0 for 16-byte atomicity. + */ + if ((cpuinfo & CPUINFO_ATOMIC_VMOVDQU) || (pi & 8)) { + asm("vmovdqu %1, %0" : "=x" (r.i) : "m" (*ptr_align)); + } else { + asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align)); + } + return int128_getlo(int128_urshift(r.s, shr)); +} +#else +/* Fallback definition that must be optimized away, or error. */ +uint64_t QEMU_ERROR("unsupported atomic") + load_atom_extract_al16_or_al8(void *pv, int s); +#endif + +#endif /* X86_64_LOAD_EXTRACT_AL16_AL8_H */ diff --git a/include/qemu/int128.h b/include/qemu/int128.h index 9e46cfae..73624e8 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -481,7 +481,7 @@ static inline void bswap128s(Int128 *s) * a possible structure and the native types. Ease parameter passing * via use of the transparent union extension. */ -#ifdef CONFIG_INT128 +#ifdef CONFIG_INT128_TYPE typedef union { __uint128_t u; __int128_t i; @@ -489,6 +489,6 @@ typedef union { } Int128Alias __attribute__((transparent_union)); #else typedef Int128 Int128Alias; -#endif /* CONFIG_INT128 */ +#endif /* CONFIG_INT128_TYPE */ #endif /* INT128_H */ diff --git a/meson.build b/meson.build index 2d48aa1..bc76ea9 100644 --- a/meson.build +++ b/meson.build @@ -2543,7 +2543,13 @@ config_host_data.set('CONFIG_ATOMIC64', cc.links(''' return 0; }''')) -has_int128 = cc.links(''' +has_int128_type = cc.compiles(''' + __int128_t a; + __uint128_t b; + int main(void) { b = a; }''') +config_host_data.set('CONFIG_INT128_TYPE', has_int128_type) + +has_int128 = has_int128_type and cc.links(''' __int128_t a; __uint128_t b; int main (void) { @@ -2552,10 +2558,9 @@ has_int128 = cc.links(''' a = a * a; return 0; }''') - config_host_data.set('CONFIG_INT128', has_int128) -if has_int128 +if has_int128_type # "do we have 128-bit atomics which are handled inline and specifically not # via libatomic". The reason we can't use libatomic is documented in the # comment starting "GCC is a house divided" in include/qemu/atomic128.h. @@ -2564,7 +2569,7 @@ if has_int128 # __alignof(unsigned __int128) for the host. atomic_test_128 = ''' int main(int ac, char **av) { - unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16); + __uint128_t *p = __builtin_assume_aligned(av[ac - 1], 16); p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED); __atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED); __atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); @@ -2586,7 +2591,7 @@ if has_int128 config_host_data.set('CONFIG_CMPXCHG128', cc.links(''' int main(void) { - unsigned __int128 x = 0, y = 0; + __uint128_t x = 0, y = 0; __sync_val_compare_and_swap_16(&x, y, x); return 0; } diff --git a/scripts/decodetree.py b/scripts/decodetree.py index a03dc6b..13db585 100644 --- a/scripts/decodetree.py +++ b/scripts/decodetree.py @@ -35,6 +35,7 @@ arguments = {} formats = {} allpatterns = [] anyextern = False +testforerror = False translate_prefix = 'trans' translate_scope = 'static ' @@ -53,6 +54,80 @@ re_fld_ident = '%[a-zA-Z0-9_]*' re_fmt_ident = '@[a-zA-Z0-9_]*' re_pat_ident = '[a-zA-Z0-9_]*' +# Local implementation of a topological sort. We use the same API that +# the Python graphlib does, so that when QEMU moves forward to a +# baseline of Python 3.9 or newer this code can all be dropped and +# replaced with: +# from graphlib import TopologicalSorter, CycleError +# +# https://docs.python.org/3.9/library/graphlib.html#graphlib.TopologicalSorter +# +# We only implement the parts of TopologicalSorter we care about: +# ts = TopologicalSorter(graph=None) +# create the sorter. graph is a dictionary whose keys are +# nodes and whose values are lists of the predecessors of that node. +# (That is, if graph contains "A" -> ["B", "C"] then we must output +# B and C before A.) +# ts.static_order() +# returns a list of all the nodes in sorted order, or raises CycleError +# CycleError +# exception raised if there are cycles in the graph. The second +# element in the args attribute is a list of nodes which form a +# cycle; the first and last element are the same, eg [a, b, c, a] +# (Our implementation doesn't give the order correctly.) +# +# For our purposes we can assume that the data set is always small +# (typically 10 nodes or less, actual links in the graph very rare), +# so we don't need to worry about efficiency of implementation. +# +# The core of this implementation is from +# https://code.activestate.com/recipes/578272-topological-sort/ +# (but updated to Python 3), and is under the MIT license. + +class CycleError(ValueError): + """Subclass of ValueError raised if cycles exist in the graph""" + pass + +class TopologicalSorter: + """Topologically sort a graph""" + def __init__(self, graph=None): + self.graph = graph + + def static_order(self): + # We do the sort right here, unlike the stdlib version + from functools import reduce + data = {} + r = [] + + if not self.graph: + return [] + + # This code wants the values in the dict to be specifically sets + for k, v in self.graph.items(): + data[k] = set(v) + + # Find all items that don't depend on anything. + extra_items_in_deps = (reduce(set.union, data.values()) + - set(data.keys())) + # Add empty dependencies where needed + data.update({item:{} for item in extra_items_in_deps}) + while True: + ordered = set(item for item, dep in data.items() if not dep) + if not ordered: + break + r.extend(ordered) + data = {item: (dep - ordered) + for item, dep in data.items() + if item not in ordered} + if data: + # This doesn't give as nice results as the stdlib, which + # gives you the cycle by listing the nodes in order. Here + # we only know the nodes in the cycle but not their order. + raise CycleError(f'nodes are in a cycle', list(data.keys())) + + return r +# end TopologicalSorter + def error_with_file(file, lineno, *args): """Print an error message from file:line and args and exit.""" global output_file @@ -70,8 +145,13 @@ def error_with_file(file, lineno, *args): if output_file and output_fd: output_fd.close() - os.remove(output_file) - exit(1) + # Do not try to remove e.g. -o /dev/null + if not output_file.startswith("/dev"): + try: + os.remove(output_file) + except PermissionError: + pass + exit(0 if testforerror else 1) # end error_with_file @@ -205,11 +285,14 @@ class Field: s = '' return str(self.pos) + ':' + s + str(self.len) - def str_extract(self): + def str_extract(self, lvalue_formatter): global bitop_width s = 's' if self.sign else '' return f'{s}extract{bitop_width}(insn, {self.pos}, {self.len})' + def referenced_fields(self): + return [] + def __eq__(self, other): return self.sign == other.sign and self.mask == other.mask @@ -228,12 +311,12 @@ class MultiField: def __str__(self): return str(self.subs) - def str_extract(self): + def str_extract(self, lvalue_formatter): global bitop_width ret = '0' pos = 0 for f in reversed(self.subs): - ext = f.str_extract() + ext = f.str_extract(lvalue_formatter) if pos == 0: ret = ext else: @@ -241,6 +324,12 @@ class MultiField: pos += f.len return ret + def referenced_fields(self): + l = [] + for f in self.subs: + l.extend(f.referenced_fields()) + return l + def __ne__(self, other): if len(self.subs) != len(other.subs): return True @@ -264,9 +353,12 @@ class ConstField: def __str__(self): return str(self.value) - def str_extract(self): + def str_extract(self, lvalue_formatter): return str(self.value) + def referenced_fields(self): + return [] + def __cmp__(self, other): return self.value - other.value # end ConstField @@ -283,8 +375,12 @@ class FunctionField: def __str__(self): return self.func + '(' + str(self.base) + ')' - def str_extract(self): - return self.func + '(ctx, ' + self.base.str_extract() + ')' + def str_extract(self, lvalue_formatter): + return (self.func + '(ctx, ' + + self.base.str_extract(lvalue_formatter) + ')') + + def referenced_fields(self): + return self.base.referenced_fields() def __eq__(self, other): return self.func == other.func and self.base == other.base @@ -304,9 +400,12 @@ class ParameterField: def __str__(self): return self.func - def str_extract(self): + def str_extract(self, lvalue_formatter): return self.func + '(ctx)' + def referenced_fields(self): + return [] + def __eq__(self, other): return self.func == other.func @@ -314,6 +413,32 @@ class ParameterField: return not self.__eq__(other) # end ParameterField +class NamedField: + """Class representing a field already named in the pattern""" + def __init__(self, name, sign, len): + self.mask = 0 + self.sign = sign + self.len = len + self.name = name + + def __str__(self): + return self.name + + def str_extract(self, lvalue_formatter): + global bitop_width + s = 's' if self.sign else '' + lvalue = lvalue_formatter(self.name) + return f'{s}extract{bitop_width}({lvalue}, 0, {self.len})' + + def referenced_fields(self): + return [self.name] + + def __eq__(self, other): + return self.name == other.name + + def __ne__(self, other): + return not self.__eq__(other) +# end NamedField class Arguments: """Class representing the extracted fields of a format""" @@ -337,7 +462,6 @@ class Arguments: output('} ', self.struct_name(), ';\n\n') # end Arguments - class General: """Common code between instruction formats and instruction patterns""" def __init__(self, name, lineno, base, fixb, fixm, udfm, fldm, flds, w): @@ -351,12 +475,59 @@ class General: self.fieldmask = fldm self.fields = flds self.width = w + self.dangling = None def __str__(self): return self.name + ' ' + str_match_bits(self.fixedbits, self.fixedmask) def str1(self, i): return str_indent(i) + self.__str__() + + def dangling_references(self): + # Return a list of all named references which aren't satisfied + # directly by this format/pattern. This will be either: + # * a format referring to a field which is specified by the + # pattern(s) using it + # * a pattern referring to a field which is specified by the + # format it uses + # * a user error (referring to a field that doesn't exist at all) + if self.dangling is None: + # Compute this once and cache the answer + dangling = [] + for n, f in self.fields.items(): + for r in f.referenced_fields(): + if r not in self.fields: + dangling.append(r) + self.dangling = dangling + return self.dangling + + def output_fields(self, indent, lvalue_formatter): + # We use a topological sort to ensure that any use of NamedField + # comes after the initialization of the field it is referencing. + graph = {} + for n, f in self.fields.items(): + refs = f.referenced_fields() + graph[n] = refs + + try: + ts = TopologicalSorter(graph) + for n in ts.static_order(): + # We only want to emit assignments for the keys + # in our fields list, not for anything that ends up + # in the tsort graph only because it was referenced as + # a NamedField. + try: + f = self.fields[n] + output(indent, lvalue_formatter(n), ' = ', + f.str_extract(lvalue_formatter), ';\n') + except KeyError: + pass + except CycleError as e: + # The second element of args is a list of nodes which form + # a cycle (there might be others too, but only one is reported). + # Pretty-print it to tell the user. + cycle = ' => '.join(e.args[1]) + error(self.lineno, 'field definitions form a cycle: ' + cycle) # end General @@ -370,8 +541,7 @@ class Format(General): def output_extract(self): output('static void ', self.extract_name(), '(DisasContext *ctx, ', self.base.struct_name(), ' *a, ', insntype, ' insn)\n{\n') - for n, f in self.fields.items(): - output(' a->', n, ' = ', f.str_extract(), ';\n') + self.output_fields(str_indent(4), lambda n: 'a->' + n) output('}\n\n') # end Format @@ -392,11 +562,36 @@ class Pattern(General): ind = str_indent(i) arg = self.base.base.name output(ind, '/* ', self.file, ':', str(self.lineno), ' */\n') + # We might have named references in the format that refer to fields + # in the pattern, or named references in the pattern that refer + # to fields in the format. This affects whether we extract the fields + # for the format before or after the ones for the pattern. + # For simplicity we don't allow cross references in both directions. + # This is also where we catch the syntax error of referring to + # a nonexistent field. + fmt_refs = self.base.dangling_references() + for r in fmt_refs: + if r not in self.fields: + error(self.lineno, f'format refers to undefined field {r}') + pat_refs = self.dangling_references() + for r in pat_refs: + if r not in self.base.fields: + error(self.lineno, f'pattern refers to undefined field {r}') + if pat_refs and fmt_refs: + error(self.lineno, ('pattern that uses fields defined in format ' + 'cannot use format that uses fields defined ' + 'in pattern')) + if fmt_refs: + # pattern fields first + self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n) + assert not extracted, "dangling fmt refs but it was already extracted" if not extracted: output(ind, self.base.extract_name(), '(ctx, &u.f_', arg, ', insn);\n') - for n, f in self.fields.items(): - output(ind, 'u.f_', arg, '.', n, ' = ', f.str_extract(), ';\n') + if not fmt_refs: + # pattern fields last + self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n) + output(ind, 'if (', translate_prefix, '_', self.name, '(ctx, &u.f_', arg, ')) return true;\n') @@ -473,7 +668,7 @@ class MultiPattern(General): def prop_format(self): for p in self.pats: - p.build_tree() + p.prop_format() def prop_width(self): width = None @@ -505,6 +700,12 @@ class IncMultiPattern(MultiPattern): output(ind, '}\n') else: p.output_code(i, extracted, p.fixedbits, p.fixedmask) + + def build_tree(self): + if not self.pats: + error_with_file(self.file, self.lineno, 'empty pattern group') + super().build_tree() + #end IncMultiPattern @@ -536,8 +737,10 @@ class Tree: ind = str_indent(i) # If we identified all nodes below have the same format, - # extract the fields now. - if not extracted and self.base: + # extract the fields now. But don't do it if the format relies + # on named fields from the insn pattern, as those won't have + # been initialised at this point. + if not extracted and self.base and not self.base.dangling_references(): output(ind, self.base.extract_name(), '(ctx, &u.f_', self.base.base.name, ', insn);\n') extracted = True @@ -623,7 +826,7 @@ class ExcMultiPattern(MultiPattern): return t def build_tree(self): - super().prop_format() + super().build_tree() self.tree = self.__build_tree(self.pats, self.fixedbits, self.fixedmask) @@ -659,6 +862,7 @@ def parse_field(lineno, name, toks): """Parse one instruction field from TOKS at LINENO""" global fields global insnwidth + global re_C_ident # A "simple" field will have only one entry; # a "multifield" will have several. @@ -673,6 +877,25 @@ def parse_field(lineno, name, toks): func = func[1] continue + if re.fullmatch(re_C_ident + ':s[0-9]+', t): + # Signed named field + subtoks = t.split(':') + n = subtoks[0] + le = int(subtoks[1]) + f = NamedField(n, True, le) + subs.append(f) + width += le + continue + if re.fullmatch(re_C_ident + ':[0-9]+', t): + # Unsigned named field + subtoks = t.split(':') + n = subtoks[0] + le = int(subtoks[1]) + f = NamedField(n, False, le) + subs.append(f) + width += le + continue + if re.fullmatch('[0-9]+:s[0-9]+', t): # Signed field extract subtoks = t.split(':s') @@ -1286,11 +1509,12 @@ def main(): global bitop_width global variablewidth global anyextern + global testforerror decode_scope = 'static ' long_opts = ['decode=', 'translate=', 'output=', 'insnwidth=', - 'static-decode=', 'varinsnwidth='] + 'static-decode=', 'varinsnwidth=', 'test-for-error'] try: (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'o:vw:', long_opts) except getopt.GetoptError as err: @@ -1319,6 +1543,8 @@ def main(): bitop_width = 64 elif insnwidth != 32: error(0, 'cannot handle insns of width', insnwidth) + elif o == '--test-for-error': + testforerror = True else: assert False, 'unhandled option' @@ -1417,6 +1643,7 @@ def main(): if output_file: output_fd.close() + exit(1 if testforerror else 0) # end main diff --git a/tcg/aarch64/tcg-target-con-set.h b/tcg/aarch64/tcg-target-con-set.h index d6c6866..3fdee26 100644 --- a/tcg/aarch64/tcg-target-con-set.h +++ b/tcg/aarch64/tcg-target-con-set.h @@ -10,11 +10,10 @@ * tcg-target-con-str.h; the constraint combination is inclusive or. */ C_O0_I1(r) -C_O0_I2(lZ, l) C_O0_I2(r, rA) C_O0_I2(rZ, r) C_O0_I2(w, r) -C_O1_I1(r, l) +C_O0_I3(rZ, rZ, r) C_O1_I1(r, r) C_O1_I1(w, r) C_O1_I1(w, w) @@ -33,4 +32,5 @@ C_O1_I2(w, w, wO) C_O1_I2(w, w, wZ) C_O1_I3(w, w, w, w) C_O1_I4(r, r, rA, rZ, rZ) +C_O2_I1(r, r, r) C_O2_I4(r, r, rZ, rZ, rA, rMZ) diff --git a/tcg/aarch64/tcg-target-con-str.h b/tcg/aarch64/tcg-target-con-str.h index 00adb64..fb1a845 100644 --- a/tcg/aarch64/tcg-target-con-str.h +++ b/tcg/aarch64/tcg-target-con-str.h @@ -9,7 +9,6 @@ * REGS(letter, register_mask) */ REGS('r', ALL_GENERAL_REGS) -REGS('l', ALL_QLDST_REGS) REGS('w', ALL_VECTOR_REGS) /* diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index 8428366..261ad25 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -40,11 +40,12 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11, TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15, - TCG_REG_X16, TCG_REG_X17, TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3, TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7, + /* X16 reserved as temporary */ + /* X17 reserved as temporary */ /* X18 reserved by system */ /* X19 reserved for AREG0 */ /* X29 reserved as fp */ @@ -71,8 +72,10 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) return TCG_REG_X0 + slot; } -#define TCG_REG_TMP TCG_REG_X30 -#define TCG_VEC_TMP TCG_REG_V31 +#define TCG_REG_TMP0 TCG_REG_X16 +#define TCG_REG_TMP1 TCG_REG_X17 +#define TCG_REG_TMP2 TCG_REG_X30 +#define TCG_VEC_TMP0 TCG_REG_V31 #ifndef CONFIG_SOFTMMU #define TCG_REG_GUEST_BASE TCG_REG_X28 @@ -129,14 +132,6 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, #define ALL_GENERAL_REGS 0xffffffffu #define ALL_VECTOR_REGS 0xffffffff00000000ull -#ifdef CONFIG_SOFTMMU -#define ALL_QLDST_REGS \ - (ALL_GENERAL_REGS & ~((1 << TCG_REG_X0) | (1 << TCG_REG_X1) | \ - (1 << TCG_REG_X2) | (1 << TCG_REG_X3))) -#else -#define ALL_QLDST_REGS ALL_GENERAL_REGS -#endif - /* Match a constant valid for addition (12-bit, optionally shifted). */ static inline bool is_aimm(uint64_t val) { @@ -390,6 +385,10 @@ typedef enum { I3305_LDR_v64 = 0x5c000000, I3305_LDR_v128 = 0x9c000000, + /* Load/store exclusive. */ + I3306_LDXP = 0xc8600000, + I3306_STXP = 0xc8200000, + /* Load/store register. Described here as 3.3.12, but the helper that emits them can transform to 3.3.10 or 3.3.13. */ I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30, @@ -454,6 +453,9 @@ typedef enum { I3406_ADR = 0x10000000, I3406_ADRP = 0x90000000, + /* Add/subtract extended register instructions. */ + I3501_ADD = 0x0b200000, + /* Add/subtract shifted register instructions (without a shift). */ I3502_ADD = 0x0b000000, I3502_ADDS = 0x2b000000, @@ -624,6 +626,12 @@ static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn, tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt); } +static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs, + TCGReg rt, TCGReg rt2, TCGReg rn) +{ + tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt); +} + static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext, TCGReg rt, int imm19) { @@ -706,6 +714,14 @@ static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn, tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd); } +static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn, + TCGType sf, TCGReg rd, TCGReg rn, + TCGReg rm, int opt, int imm3) +{ + tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 | + imm3 << 10 | rn << 5 | rd); +} + /* This function is for both 3.5.2 (Add/Subtract shifted register), for the rare occasion when we actually want to supply a shift amount. */ static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn, @@ -984,7 +1000,7 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg r, TCGReg base, intptr_t offset) { - TCGReg temp = TCG_REG_TMP; + TCGReg temp = TCG_REG_TMP0; if (offset < -0xffffff || offset > 0xffffff) { tcg_out_movi(s, TCG_TYPE_PTR, temp, offset); @@ -1136,8 +1152,8 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd, } /* Worst-case scenario, move offset to temp register, use reg offset. */ - tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset); - tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP); + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset); + tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0); } static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) @@ -1353,8 +1369,8 @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target) if (offset == sextract64(offset, 0, 26)) { tcg_out_insn(s, 3206, BL, offset); } else { - tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target); - tcg_out_insn(s, 3207, BLR, TCG_REG_TMP); + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target); + tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0); } } @@ -1491,7 +1507,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, AArch64Insn insn; if (rl == ah || (!const_bh && rl == bh)) { - rl = TCG_REG_TMP; + rl = TCG_REG_TMP0; } if (const_bl) { @@ -1508,7 +1524,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, possibility of adding 0+const in the low part, and the immediate add instructions encode XSP not XZR. Don't try anything more elaborate here than loading another zero. */ - al = TCG_REG_TMP; + al = TCG_REG_TMP0; tcg_out_movi(s, ext, al, 0); } tcg_out_insn_3401(s, insn, ext, rl, al, bl); @@ -1549,7 +1565,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d, { TCGReg a1 = a0; if (is_ctz) { - a1 = TCG_REG_TMP; + a1 = TCG_REG_TMP0; tcg_out_insn(s, 3507, RBIT, ext, a1, a0); } if (const_b && b == (ext ? 64 : 32)) { @@ -1558,7 +1574,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d, AArch64Insn sel = I3506_CSEL; tcg_out_cmp(s, ext, a0, 0, 1); - tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP, a1); + tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1); if (const_b) { if (b == -1) { @@ -1571,7 +1587,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d, b = d; } } - tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP, b, TCG_COND_NE); + tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE); } } @@ -1588,7 +1604,7 @@ bool tcg_target_has_memory_bswap(MemOp memop) } static const TCGLdstHelperParam ldst_helper_param = { - .ntmp = 1, .tmp = { TCG_REG_TMP } + .ntmp = 1, .tmp = { TCG_REG_TMP0 } }; static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) @@ -1633,19 +1649,19 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGType addr_type = s->addr_type; TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); + MemOp s_bits = opc & MO_SIZE; unsigned a_mask; h->aa = atom_and_align_for_opc(s, opc, have_lse2 ? MO_ATOM_WITHIN16 : MO_ATOM_IFALIGN, - false); + s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU - unsigned s_bits = opc & MO_SIZE; unsigned s_mask = (1u << s_bits) - 1; unsigned mem_index = get_mmuidx(oi); - TCGReg x3; + TCGReg addr_adj; TCGType mask_type; uint64_t compare_mask; @@ -1657,27 +1673,27 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32 ? TCG_TYPE_I64 : TCG_TYPE_I32); - /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {x0,x1}. */ + /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {tmp0,tmp1}. */ QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512); QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8); - tcg_out_insn(s, 3314, LDP, TCG_REG_X0, TCG_REG_X1, TCG_AREG0, + tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0, TLB_MASK_TABLE_OFS(mem_index), 1, 0); /* Extract the TLB index from the address into X0. */ tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64, - TCG_REG_X0, TCG_REG_X0, addr_reg, + TCG_REG_TMP0, TCG_REG_TMP0, addr_reg, s->page_bits - CPU_TLB_ENTRY_BITS); - /* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */ - tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0); + /* Add the tlb_table pointer, forming the CPUTLBEntry address in TMP1. */ + tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0); - /* Load the tlb comparator into X0, and the fast path addend into X1. */ - tcg_out_ld(s, addr_type, TCG_REG_X0, TCG_REG_X1, + /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */ + tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1, is_ld ? offsetof(CPUTLBEntry, addr_read) : offsetof(CPUTLBEntry, addr_write)); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1, + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, offsetof(CPUTLBEntry, addend)); /* @@ -1686,25 +1702,26 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, * cross pages using the address of the last byte of the access. */ if (a_mask >= s_mask) { - x3 = addr_reg; + addr_adj = addr_reg; } else { + addr_adj = TCG_REG_TMP2; tcg_out_insn(s, 3401, ADDI, addr_type, - TCG_REG_X3, addr_reg, s_mask - a_mask); - x3 = TCG_REG_X3; + addr_adj, addr_reg, s_mask - a_mask); } compare_mask = (uint64_t)s->page_mask | a_mask; - /* Store the page mask part of the address into X3. */ - tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_X3, x3, compare_mask); + /* Store the page mask part of the address into TMP2. */ + tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2, + addr_adj, compare_mask); /* Perform the address comparison. */ - tcg_out_cmp(s, addr_type, TCG_REG_X0, TCG_REG_X3, 0); + tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0); /* If not equal, we jump to the slow path. */ ldst->label_ptr[0] = s->code_ptr; tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); - h->base = TCG_REG_X1, + h->base = TCG_REG_TMP1; h->index = addr_reg; h->index_ext = addr_type; #else @@ -1822,6 +1839,108 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg, } } +static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi, + TCGReg addr_reg, MemOpIdx oi, bool is_ld) +{ + TCGLabelQemuLdst *ldst; + HostAddress h; + TCGReg base; + bool use_pair; + + ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld); + + /* Compose the final address, as LDP/STP have no indexing. */ + if (h.index == TCG_REG_XZR) { + base = h.base; + } else { + base = TCG_REG_TMP2; + if (h.index_ext == TCG_TYPE_I32) { + /* add base, base, index, uxtw */ + tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base, + h.base, h.index, MO_32, 0); + } else { + /* add base, base, index */ + tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index); + } + } + + use_pair = h.aa.atom < MO_128 || have_lse2; + + if (!use_pair) { + tcg_insn_unit *branch = NULL; + TCGReg ll, lh, sl, sh; + + /* + * If we have already checked for 16-byte alignment, that's all + * we need. Otherwise we have determined that misaligned atomicity + * may be handled with two 8-byte loads. + */ + if (h.aa.align < MO_128) { + /* + * TODO: align should be MO_64, so we only need test bit 3, + * which means we could use TBNZ instead of ANDS+B_C. + */ + tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15); + branch = s->code_ptr; + tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); + use_pair = true; + } + + if (is_ld) { + /* + * 16-byte atomicity without LSE2 requires LDXP+STXP loop: + * ldxp lo, hi, [base] + * stxp t0, lo, hi, [base] + * cbnz t0, .-8 + * Require no overlap between data{lo,hi} and base. + */ + if (datalo == base || datahi == base) { + tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base); + base = TCG_REG_TMP2; + } + ll = sl = datalo; + lh = sh = datahi; + } else { + /* + * 16-byte atomicity without LSE2 requires LDXP+STXP loop: + * 1: ldxp t0, t1, [base] + * stxp t0, lo, hi, [base] + * cbnz t0, 1b + */ + tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1); + ll = TCG_REG_TMP0; + lh = TCG_REG_TMP1; + sl = datalo; + sh = datahi; + } + + tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base); + tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base); + tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2); + + if (use_pair) { + /* "b .+8", branching across the one insn of use_pair. */ + tcg_out_insn(s, 3206, B, 2); + reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr)); + } + } + + if (use_pair) { + if (is_ld) { + tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0); + } else { + tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0); + } + } + + if (ldst) { + ldst->type = TCG_TYPE_I128; + ldst->datalo_reg = datalo; + ldst->datahi_reg = datahi; + ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); + } +} + static const tcg_insn_unit *tb_ret_addr; static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) @@ -1847,7 +1966,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which) set_jmp_insn_offset(s, which); tcg_out32(s, I3206_B); - tcg_out_insn(s, 3207, BR, TCG_REG_TMP); + tcg_out_insn(s, 3207, BR, TCG_REG_TMP0); set_jmp_reset_offset(s, which); } @@ -1866,7 +1985,7 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n, ptrdiff_t i_offset = i_addr - jmp_rx; /* Note that we asserted this in range in tcg_out_goto_tb. */ - insn = deposit32(I3305_LDR | TCG_REG_TMP, 5, 19, i_offset >> 2); + insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2); } qatomic_set((uint32_t *)jmp_rw, insn); flush_idcache_range(jmp_rx, jmp_rw, 4); @@ -2060,13 +2179,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_rem_i64: case INDEX_op_rem_i32: - tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP, a1, a2); - tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1); + tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2); + tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1); break; case INDEX_op_remu_i64: case INDEX_op_remu_i32: - tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP, a1, a2); - tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1); + tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2); + tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1); break; case INDEX_op_shl_i64: @@ -2110,8 +2229,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, if (c2) { tcg_out_rotl(s, ext, a0, a1, a2); } else { - tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP, TCG_REG_XZR, a2); - tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP); + tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2); + tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0); } break; @@ -2161,6 +2280,14 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, REG0(0), a1, a2, ext); break; + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true); + break; + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false); + break; case INDEX_op_bswap64_i64: tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1); @@ -2517,8 +2644,8 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, break; } } - tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0); - a2 = TCG_VEC_TMP; + tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0); + a2 = TCG_VEC_TMP0; } if (is_scalar) { insn = cmp_scalar_insn[cond]; @@ -2799,12 +2926,18 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_ld_a64_i32: case INDEX_op_qemu_ld_a32_i64: case INDEX_op_qemu_ld_a64_i64: - return C_O1_I1(r, l); + return C_O1_I1(r, r); + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + return C_O2_I1(r, r, r); case INDEX_op_qemu_st_a32_i32: case INDEX_op_qemu_st_a64_i32: case INDEX_op_qemu_st_a32_i64: case INDEX_op_qemu_st_a64_i64: - return C_O0_I2(lZ, l); + return C_O0_I2(rZ, r); + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + return C_O0_I3(rZ, rZ, r); case INDEX_op_deposit_i32: case INDEX_op_deposit_i64: @@ -2900,9 +3033,11 @@ static void tcg_target_init(TCGContext *s) s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP); - tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */ - tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2); + tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0); } /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */ diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index d5f7614..ce64de0 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -16,7 +16,6 @@ #include "host/cpuinfo.h" #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24 #define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1) typedef enum { @@ -131,7 +130,16 @@ typedef enum { #define TCG_TARGET_HAS_muluh_i64 1 #define TCG_TARGET_HAS_mulsh_i64 1 -#define TCG_TARGET_HAS_qemu_ldst_i128 0 +/* + * Without FEAT_LSE2, we must use LDXP+STXP to implement atomic 128-bit load, + * which requires writable pages. We must defer to the helper for user-only, + * but in system mode all ram is writable for the host. + */ +#ifdef CONFIG_USER_ONLY +#define TCG_TARGET_HAS_qemu_ldst_i128 have_lse2 +#else +#define TCG_TARGET_HAS_qemu_ldst_i128 1 +#endif #define TCG_TARGET_HAS_v64 1 #define TCG_TARGET_HAS_v128 1 diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 65efc53..c649db7 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -31,7 +31,6 @@ extern int arm_arch; #define use_armv7_instructions (__ARM_ARCH >= 7 || arm_arch >= 7) #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16 #define MAX_CODE_GEN_BUFFER_SIZE UINT32_MAX typedef enum { diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index bfe9d98..ae54e5f 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -91,6 +91,8 @@ static const int tcg_target_reg_alloc_order[] = { #endif }; +#define TCG_TMP_VEC TCG_REG_XMM5 + static const int tcg_target_call_iarg_regs[] = { #if TCG_TARGET_REG_BITS == 64 #if defined(_WIN64) @@ -319,6 +321,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) +#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16) +#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16) #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) #define OPC_PMAXSW (0xee | P_EXT | P_DATA16) #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) @@ -1753,7 +1757,21 @@ typedef struct { bool tcg_target_has_memory_bswap(MemOp memop) { - return have_movbe; + TCGAtomAlign aa; + + if (!have_movbe) { + return false; + } + if ((memop & MO_SIZE) < MO_128) { + return true; + } + + /* + * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA, + * but do allow a pair of 64-bit operations, i.e. MOVBEQ. + */ + aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); + return aa.atom < MO_128; } /* @@ -1781,6 +1799,30 @@ static const TCGLdstHelperParam ldst_helper_param = { static const TCGLdstHelperParam ldst_helper_param = { }; #endif +static void tcg_out_vec_to_pair(TCGContext *s, TCGType type, + TCGReg l, TCGReg h, TCGReg v) +{ + int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; + + /* vpmov{d,q} %v, %l */ + tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l); + /* vpextr{d,q} $1, %v, %h */ + tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h); + tcg_out8(s, 1); +} + +static void tcg_out_pair_to_vec(TCGContext *s, TCGType type, + TCGReg v, TCGReg l, TCGReg h) +{ + int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW; + + /* vmov{d,q} %l, %v */ + tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l); + /* vpinsr{d,q} $1, %h, %v, %v */ + tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h); + tcg_out8(s, 1); +} + /* * Generate code for the slow path for a load at the end of block */ @@ -1870,6 +1912,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); + MemOp s_bits = opc & MO_SIZE; unsigned a_mask; #ifdef CONFIG_SOFTMMU @@ -1880,7 +1923,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, *h = x86_guest_base; #endif h->base = addrlo; - h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU @@ -1890,7 +1933,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGType tlbtype = TCG_TYPE_I32; int trexw = 0, hrexw = 0, tlbrexw = 0; unsigned mem_index = get_mmuidx(oi); - unsigned s_bits = opc & MO_SIZE; unsigned s_mask = (1 << s_bits) - 1; int tlb_mask; @@ -2070,6 +2112,72 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, h.base, h.index, 0, h.ofs + 4); } break; + + case MO_128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + + /* + * Without 16-byte atomicity, use integer regs. + * That is where we want the data, and it allows bswaps. + */ + if (h.aa.atom < MO_128) { + if (use_movbe) { + TCGReg t = datalo; + datalo = datahi; + datahi = t; + } + if (h.base == datalo || h.index == datalo) { + tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi, + h.base, h.index, 0, h.ofs); + tcg_out_modrm_offset(s, movop + P_REXW + h.seg, + datalo, datahi, 0); + tcg_out_modrm_offset(s, movop + P_REXW + h.seg, + datahi, datahi, 8); + } else { + tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, + h.base, h.index, 0, h.ofs); + tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, + h.base, h.index, 0, h.ofs + 8); + } + break; + } + + /* + * With 16-byte atomicity, a vector load is required. + * If we already have 16-byte alignment, then VMOVDQA always works. + * Else if VMOVDQU has atomicity with dynamic alignment, use that. + * Else use we require a runtime test for alignment for VMOVDQA; + * use VMOVDQU on the unaligned nonatomic path for simplicity. + */ + if (h.aa.align >= MO_128) { + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + } else { + TCGLabel *l1 = gen_new_label(); + TCGLabel *l2 = gen_new_label(); + + tcg_out_testi(s, h.base, 15); + tcg_out_jxx(s, JCC_JNE, l1, true); + + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + tcg_out_jxx(s, JCC_JMP, l2, true); + + tcg_out_label(s, l1); + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + tcg_out_label(s, l2); + } + tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC); + break; + default: g_assert_not_reached(); } @@ -2140,6 +2248,63 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, h.base, h.index, 0, h.ofs + 4); } break; + + case MO_128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + + /* + * Without 16-byte atomicity, use integer regs. + * That is where we have the data, and it allows bswaps. + */ + if (h.aa.atom < MO_128) { + if (use_movbe) { + TCGReg t = datalo; + datalo = datahi; + datahi = t; + } + tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo, + h.base, h.index, 0, h.ofs); + tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi, + h.base, h.index, 0, h.ofs + 8); + break; + } + + /* + * With 16-byte atomicity, a vector store is required. + * If we already have 16-byte alignment, then VMOVDQA always works. + * Else if VMOVDQU has atomicity with dynamic alignment, use that. + * Else use we require a runtime test for alignment for VMOVDQA; + * use VMOVDQU on the unaligned nonatomic path for simplicity. + */ + tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi); + if (h.aa.align >= MO_128) { + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) { + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + } else { + TCGLabel *l1 = gen_new_label(); + TCGLabel *l2 = gen_new_label(); + + tcg_out_testi(s, h.base, 15); + tcg_out_jxx(s, JCC_JNE, l1, true); + + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + tcg_out_jxx(s, JCC_JMP, l2, true); + + tcg_out_label(s, l1); + tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg, + TCG_TMP_VEC, 0, + h.base, h.index, 0, h.ofs); + tcg_out_label(s, l2); + } + break; + default: g_assert_not_reached(); } @@ -2470,6 +2635,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } break; + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); + break; case INDEX_op_qemu_st_a64_i32: case INDEX_op_qemu_st8_a64_i32: @@ -2496,6 +2666,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } break; + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128); + break; OP_32_64(mulu2): tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]); @@ -3193,6 +3368,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_a64_i64: return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + return C_O2_I1(r, r, L); + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + return C_O0_I3(L, L, L); + case INDEX_op_brcond2_i32: return C_O0_I4(r, r, ri, ri); @@ -3962,6 +4146,7 @@ static void tcg_target_init(TCGContext *s) s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); + tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC); #ifdef _WIN64 /* These are call saved, and we don't save them, so don't use them. */ tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6); diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 0106946..1468f8e 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -28,7 +28,6 @@ #include "host/cpuinfo.h" #define TCG_TARGET_INSN_UNIT_SIZE 1 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31 #ifdef __x86_64__ # define TCG_TARGET_REG_BITS 64 @@ -118,7 +117,6 @@ typedef enum { #define have_avx1 (cpuinfo & CPUINFO_AVX1) #define have_avx2 (cpuinfo & CPUINFO_AVX2) #define have_movbe (cpuinfo & CPUINFO_MOVBE) -#define have_atomic16 (cpuinfo & CPUINFO_ATOMIC_VMOVDQA) /* * There are interesting instructions in AVX512, so long as we have AVX512VL, @@ -202,7 +200,8 @@ typedef enum { #define TCG_TARGET_HAS_qemu_st8_i32 1 #endif -#define TCG_TARGET_HAS_qemu_ldst_i128 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 \ + (TCG_TARGET_REG_BITS == 64 && (cpuinfo & CPUINFO_ATOMIC_VMOVDQA)) /* We do not support older SSE systems, only beginning with AVX1. */ #define TCG_TARGET_HAS_v64 have_avx1 diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h index 8fbb6c6..e4806f6 100644 --- a/tcg/mips/tcg-target.h +++ b/tcg/mips/tcg-target.h @@ -36,7 +36,6 @@ #endif #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16 #define TCG_TARGET_NB_REGS 32 #define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1) diff --git a/tcg/ppc/tcg-target-con-set.h b/tcg/ppc/tcg-target-con-set.h index f206b29..bbd7b21 100644 --- a/tcg/ppc/tcg-target-con-set.h +++ b/tcg/ppc/tcg-target-con-set.h @@ -14,6 +14,7 @@ C_O0_I2(r, r) C_O0_I2(r, ri) C_O0_I2(v, r) C_O0_I3(r, r, r) +C_O0_I3(o, m, r) C_O0_I4(r, r, ri, ri) C_O0_I4(r, r, r, r) C_O1_I1(r, r) @@ -34,6 +35,7 @@ C_O1_I3(v, v, v, v) C_O1_I4(r, r, ri, rZ, rZ) C_O1_I4(r, r, r, ri, ri) C_O2_I1(r, r, r) +C_O2_I1(o, m, r) C_O2_I2(r, r, r, r) C_O2_I4(r, r, rI, rZM, r, r) C_O2_I4(r, r, r, r, rI, rZM) diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h index 094613c..2084690 100644 --- a/tcg/ppc/tcg-target-con-str.h +++ b/tcg/ppc/tcg-target-con-str.h @@ -9,6 +9,7 @@ * REGS(letter, register_mask) */ REGS('r', ALL_GENERAL_REGS) +REGS('o', ALL_GENERAL_REGS & 0xAAAAAAAAu) /* odd registers */ REGS('v', ALL_VECTOR_REGS) /* diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index d4269df..d47a9e3 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -295,25 +295,27 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define B OPCD( 18) #define BC OPCD( 16) + #define LBZ OPCD( 34) #define LHZ OPCD( 40) #define LHA OPCD( 42) #define LWZ OPCD( 32) #define LWZUX XO31( 55) -#define STB OPCD( 38) -#define STH OPCD( 44) -#define STW OPCD( 36) - -#define STD XO62( 0) -#define STDU XO62( 1) -#define STDX XO31(149) - #define LD XO58( 0) #define LDX XO31( 21) #define LDU XO58( 1) #define LDUX XO31( 53) #define LWA XO58( 2) #define LWAX XO31(341) +#define LQ OPCD( 56) + +#define STB OPCD( 38) +#define STH OPCD( 44) +#define STW OPCD( 36) +#define STD XO62( 0) +#define STDU XO62( 1) +#define STDX XO31(149) +#define STQ XO62( 2) #define ADDIC OPCD( 12) #define ADDI OPCD( 14) @@ -2020,7 +2022,18 @@ typedef struct { bool tcg_target_has_memory_bswap(MemOp memop) { - return true; + TCGAtomAlign aa; + + if ((memop & MO_SIZE) <= MO_64) { + return true; + } + + /* + * Reject 16-byte memop with 16-byte atomicity, + * but do allow a pair of 64-bit operations. + */ + aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); + return aa.atom <= MO_64; } /* @@ -2035,7 +2048,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - MemOp a_bits; + MemOp a_bits, s_bits; /* * Book II, Section 1.4, Single-Copy Atomicity, specifies: @@ -2047,10 +2060,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, * As of 3.0, "the non-atomic access is performed as described in * the corresponding list", which matches MO_ATOM_SUBALIGN. */ + s_bits = opc & MO_SIZE; h->aa = atom_and_align_for_opc(s, opc, have_isa_3_00 ? MO_ATOM_SUBALIGN : MO_ATOM_IFALIGN, - false); + s_bits == MO_128); a_bits = h->aa.align; #ifdef CONFIG_SOFTMMU @@ -2060,7 +2074,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, int fast_off = TLB_MASK_TABLE_OFS(mem_index); int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); int table_off = fast_off + offsetof(CPUTLBDescFast, table); - unsigned s_bits = opc & MO_SIZE; ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -2303,6 +2316,60 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, } } +static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi, + TCGReg addr_reg, MemOpIdx oi, bool is_ld) +{ + TCGLabelQemuLdst *ldst; + HostAddress h; + bool need_bswap; + uint32_t insn; + TCGReg index; + + ldst = prepare_host_addr(s, &h, addr_reg, -1, oi, is_ld); + + /* Compose the final address, as LQ/STQ have no indexing. */ + index = h.index; + if (h.base != 0) { + index = TCG_REG_TMP1; + tcg_out32(s, ADD | TAB(index, h.base, h.index)); + } + need_bswap = get_memop(oi) & MO_BSWAP; + + if (h.aa.atom == MO_128) { + tcg_debug_assert(!need_bswap); + tcg_debug_assert(datalo & 1); + tcg_debug_assert(datahi == datalo - 1); + insn = is_ld ? LQ : STQ; + tcg_out32(s, insn | TAI(datahi, index, 0)); + } else { + TCGReg d1, d2; + + if (HOST_BIG_ENDIAN ^ need_bswap) { + d1 = datahi, d2 = datalo; + } else { + d1 = datalo, d2 = datahi; + } + + if (need_bswap) { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 8); + insn = is_ld ? LDBRX : STDBRX; + tcg_out32(s, insn | TAB(d1, 0, index)); + tcg_out32(s, insn | TAB(d2, index, TCG_REG_R0)); + } else { + insn = is_ld ? LD : STD; + tcg_out32(s, insn | TAI(d1, index, 0)); + tcg_out32(s, insn | TAI(d2, index, 8)); + } + } + + if (ldst) { + ldst->type = TCG_TYPE_I128; + ldst->datalo_reg = datalo; + ldst->datahi_reg = datahi; + ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); + } +} + static void tcg_out_nop_fill(tcg_insn_unit *p, int count) { int i; @@ -2860,6 +2927,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, args[4], TCG_TYPE_I64); } break; + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true); + break; case INDEX_op_qemu_st_a64_i32: if (TCG_TARGET_REG_BITS == 32) { @@ -2889,6 +2961,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, args[4], TCG_TYPE_I64); } break; + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false); + break; case INDEX_op_setcond_i32: tcg_out_setcond(s, TCG_TYPE_I32, args[3], args[0], args[1], args[2], @@ -3722,6 +3799,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_a64_i64: return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I4(r, r, r, r); + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + return C_O2_I1(o, m, r); + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + return C_O0_I3(o, m, r); + case INDEX_op_add_vec: case INDEX_op_sub_vec: case INDEX_op_mul_vec: diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index 0914380..40f20b0 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -34,7 +34,6 @@ #define TCG_TARGET_NB_REGS 64 #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16 typedef enum { TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3, @@ -149,7 +148,8 @@ extern bool have_vsx; #define TCG_TARGET_HAS_mulsh_i64 1 #endif -#define TCG_TARGET_HAS_qemu_ldst_i128 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 \ + (TCG_TARGET_REG_BITS == 64 && have_isa_2_07) /* * While technically Altivec could support V64, it has no 64-bit store diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h index 62fe61a..54fdff0 100644 --- a/tcg/riscv/tcg-target.h +++ b/tcg/riscv/tcg-target.h @@ -35,7 +35,6 @@ #define TCG_TARGET_REG_BITS 64 #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 20 #define TCG_TARGET_NB_REGS 32 #define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1) diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h index ecc079b..cbad91b 100644 --- a/tcg/s390x/tcg-target-con-set.h +++ b/tcg/s390x/tcg-target-con-set.h @@ -14,6 +14,7 @@ C_O0_I2(r, r) C_O0_I2(r, ri) C_O0_I2(r, rA) C_O0_I2(v, r) +C_O0_I3(o, m, r) C_O1_I1(r, r) C_O1_I1(v, r) C_O1_I1(v, v) @@ -36,6 +37,7 @@ C_O1_I2(v, v, v) C_O1_I3(v, v, v, v) C_O1_I4(r, r, ri, rI, r) C_O1_I4(r, r, rA, rI, r) +C_O2_I1(o, m, r) C_O2_I2(o, m, 0, r) C_O2_I2(o, m, r, r) C_O2_I3(o, m, 0, 1, r) diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index dfaa34c..503126c 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -243,6 +243,7 @@ typedef enum S390Opcode { RXY_LLGF = 0xe316, RXY_LLGH = 0xe391, RXY_LMG = 0xeb04, + RXY_LPQ = 0xe38f, RXY_LRV = 0xe31e, RXY_LRVG = 0xe30f, RXY_LRVH = 0xe31f, @@ -253,6 +254,7 @@ typedef enum S390Opcode { RXY_STG = 0xe324, RXY_STHY = 0xe370, RXY_STMG = 0xeb24, + RXY_STPQ = 0xe38e, RXY_STRV = 0xe33e, RXY_STRVG = 0xe32f, RXY_STRVH = 0xe33f, @@ -1577,7 +1579,18 @@ typedef struct { bool tcg_target_has_memory_bswap(MemOp memop) { - return true; + TCGAtomAlign aa; + + if ((memop & MO_SIZE) <= MO_64) { + return true; + } + + /* + * Reject 16-byte memop with 16-byte atomicity, + * but do allow a pair of 64-bit operations. + */ + aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true); + return aa.atom <= MO_64; } static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data, @@ -1734,13 +1747,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); + MemOp s_bits = opc & MO_SIZE; unsigned a_mask; - h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU - unsigned s_bits = opc & MO_SIZE; unsigned s_mask = (1 << s_bits) - 1; int mem_index = get_mmuidx(oi); int fast_off = TLB_MASK_TABLE_OFS(mem_index); @@ -1865,6 +1878,80 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg, } } +static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi, + TCGReg addr_reg, MemOpIdx oi, bool is_ld) +{ + TCGLabel *l1 = NULL, *l2 = NULL; + TCGLabelQemuLdst *ldst; + HostAddress h; + bool need_bswap; + bool use_pair; + S390Opcode insn; + + ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld); + + use_pair = h.aa.atom < MO_128; + need_bswap = get_memop(oi) & MO_BSWAP; + + if (!use_pair) { + /* + * Atomicity requires we use LPQ. If we've already checked for + * 16-byte alignment, that's all we need. If we arrive with + * lesser alignment, we have determined that less than 16-byte + * alignment can be satisfied with two 8-byte loads. + */ + if (h.aa.align < MO_128) { + use_pair = true; + l1 = gen_new_label(); + l2 = gen_new_label(); + + tcg_out_insn(s, RI, TMLL, addr_reg, 15); + tgen_branch(s, 7, l1); /* CC in {1,2,3} */ + } + + tcg_debug_assert(!need_bswap); + tcg_debug_assert(datalo & 1); + tcg_debug_assert(datahi == datalo - 1); + insn = is_ld ? RXY_LPQ : RXY_STPQ; + tcg_out_insn_RXY(s, insn, datahi, h.base, h.index, h.disp); + + if (use_pair) { + tgen_branch(s, S390_CC_ALWAYS, l2); + tcg_out_label(s, l1); + } + } + if (use_pair) { + TCGReg d1, d2; + + if (need_bswap) { + d1 = datalo, d2 = datahi; + insn = is_ld ? RXY_LRVG : RXY_STRVG; + } else { + d1 = datahi, d2 = datalo; + insn = is_ld ? RXY_LG : RXY_STG; + } + + if (h.base == d1 || h.index == d1) { + tcg_out_insn(s, RXY, LAY, TCG_TMP0, h.base, h.index, h.disp); + h.base = TCG_TMP0; + h.index = TCG_REG_NONE; + h.disp = 0; + } + tcg_out_insn_RXY(s, insn, d1, h.base, h.index, h.disp); + tcg_out_insn_RXY(s, insn, d2, h.base, h.index, h.disp + 8); + } + if (l2) { + tcg_out_label(s, l2); + } + + if (ldst) { + ldst->type = TCG_TYPE_I128; + ldst->datalo_reg = datalo; + ldst->datahi_reg = datahi; + ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); + } +} + static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) { /* Reuse the zeroing that exists for goto_ptr. */ @@ -2226,6 +2313,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I64); break; + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true); + break; + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false); + break; case INDEX_op_ld16s_i64: tcg_out_mem(s, 0, RXY_LGH, args[0], args[1], TCG_REG_NONE, args[2]); @@ -3107,6 +3202,12 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_a32_i32: case INDEX_op_qemu_st_a64_i32: return C_O0_I2(r, r); + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + return C_O2_I1(o, m, r); + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + return C_O0_I3(o, m, r); case INDEX_op_deposit_i32: case INDEX_op_deposit_i64: diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h index 170007b..9a40500 100644 --- a/tcg/s390x/tcg-target.h +++ b/tcg/s390x/tcg-target.h @@ -26,7 +26,6 @@ #define S390_TCG_TARGET_H #define TCG_TARGET_INSN_UNIT_SIZE 2 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 19 /* We have a +- 4GB range on the branches; leave some slop. */ #define MAX_CODE_GEN_BUFFER_SIZE (3 * GiB) @@ -140,7 +139,7 @@ extern uint64_t s390_facilities[3]; #define TCG_TARGET_HAS_muluh_i64 0 #define TCG_TARGET_HAS_mulsh_i64 0 -#define TCG_TARGET_HAS_qemu_ldst_i128 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 1 #define TCG_TARGET_HAS_v64 HAVE_FACILITY(VECTOR) #define TCG_TARGET_HAS_v128 HAVE_FACILITY(VECTOR) diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h index 31c5537..d454278 100644 --- a/tcg/sparc64/tcg-target.h +++ b/tcg/sparc64/tcg-target.h @@ -26,7 +26,6 @@ #define SPARC_TCG_TARGET_H #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32 #define TCG_TARGET_NB_REGS 32 #define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB) @@ -5736,8 +5736,8 @@ static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *ldst, mov[0].dst = ldst->datalo_reg; mov[0].src = tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, HOST_BIG_ENDIAN); - mov[0].dst_type = TCG_TYPE_I32; - mov[0].src_type = TCG_TYPE_I32; + mov[0].dst_type = TCG_TYPE_REG; + mov[0].src_type = TCG_TYPE_REG; mov[0].src_ext = TCG_TARGET_REG_BITS == 32 ? MO_32 : MO_64; mov[1].dst = ldst->datahi_reg; diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h index 28dc6d5..60a6ed6 100644 --- a/tcg/tci/tcg-target.h +++ b/tcg/tci/tcg-target.h @@ -42,7 +42,6 @@ #define TCG_TARGET_INTERPRETER 1 #define TCG_TARGET_INSN_UNIT_SIZE 4 -#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32 #define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1) #if UINTPTR_MAX == UINT32_MAX diff --git a/tests/decode/check.sh b/tests/decode/check.sh deleted file mode 100755 index 95445a0..0000000 --- a/tests/decode/check.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -# This work is licensed under the terms of the GNU LGPL, version 2 or later. -# See the COPYING.LIB file in the top-level directory. - -PYTHON=$1 -DECODETREE=$2 -E=0 - -# All of these tests should produce errors -for i in err_*.decode; do - if $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then - # Pass, aka failed to fail. - echo FAIL: $i 1>&2 - E=1 - fi -done - -for i in succ_*.decode; do - if ! $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then - echo FAIL:$i 1>&2 - fi -done - -exit $E diff --git a/tests/decode/err_field10.decode b/tests/decode/err_field10.decode new file mode 100644 index 0000000..3e672b7 --- /dev/null +++ b/tests/decode/err_field10.decode @@ -0,0 +1,7 @@ +# This work is licensed under the terms of the GNU LGPL, version 2 or later. +# See the COPYING.LIB file in the top-level directory. + +# Diagnose formats which refer to undefined fields +%field1 field2:3 +@fmt ........ ........ ........ ........ %field1 +insn 00000000 00000000 00000000 00000000 @fmt diff --git a/tests/decode/err_field7.decode b/tests/decode/err_field7.decode new file mode 100644 index 0000000..51fad7c --- /dev/null +++ b/tests/decode/err_field7.decode @@ -0,0 +1,7 @@ +# This work is licensed under the terms of the GNU LGPL, version 2 or later. +# See the COPYING.LIB file in the top-level directory. + +# Diagnose fields whose definitions form a loop +%field1 field2:3 +%field2 field1:4 +insn 00000000 00000000 00000000 00000000 %field1 %field2 diff --git a/tests/decode/err_field8.decode b/tests/decode/err_field8.decode new file mode 100644 index 0000000..cc47c08 --- /dev/null +++ b/tests/decode/err_field8.decode @@ -0,0 +1,8 @@ +# This work is licensed under the terms of the GNU LGPL, version 2 or later. +# See the COPYING.LIB file in the top-level directory. + +# Diagnose patterns which refer to undefined fields +&f1 f1 a +%field1 field2:3 +@fmt ........ ........ ........ .... a:4 &f1 +insn 00000000 00000000 00000000 0000 .... @fmt f1=%field1 diff --git a/tests/decode/err_field9.decode b/tests/decode/err_field9.decode new file mode 100644 index 0000000..e7361d5 --- /dev/null +++ b/tests/decode/err_field9.decode @@ -0,0 +1,14 @@ +# This work is licensed under the terms of the GNU LGPL, version 2 or later. +# See the COPYING.LIB file in the top-level directory. + +# Diagnose fields where the format refers to a field defined in the +# pattern and the pattern refers to a field defined in the format. +# This is theoretically not impossible to implement, but is not +# supported by the script at this time. +&abcd a b c d +%refa a:3 +%refc c:4 +# Format defines 'c' and sets 'b' to an indirect ref to 'a' +@fmt ........ ........ ........ c:8 &abcd b=%refa +# Pattern defines 'a' and sets 'd' to an indirect ref to 'c' +insn 00000000 00000000 00000000 ........ @fmt d=%refc a=6 diff --git a/tests/decode/meson.build b/tests/decode/meson.build new file mode 100644 index 0000000..38a0629 --- /dev/null +++ b/tests/decode/meson.build @@ -0,0 +1,64 @@ +err_tests = [ + 'err_argset1.decode', + 'err_argset2.decode', + 'err_field1.decode', + 'err_field2.decode', + 'err_field3.decode', + 'err_field4.decode', + 'err_field5.decode', + 'err_field6.decode', + 'err_field7.decode', + 'err_field8.decode', + 'err_field9.decode', + 'err_field10.decode', + 'err_init1.decode', + 'err_init2.decode', + 'err_init3.decode', + 'err_init4.decode', + 'err_overlap1.decode', + 'err_overlap2.decode', + 'err_overlap3.decode', + 'err_overlap4.decode', + 'err_overlap5.decode', + 'err_overlap6.decode', + 'err_overlap7.decode', + 'err_overlap8.decode', + 'err_overlap9.decode', + 'err_pattern_group_empty.decode', + 'err_pattern_group_ident1.decode', + 'err_pattern_group_ident2.decode', + 'err_pattern_group_nest1.decode', + 'err_pattern_group_nest2.decode', + 'err_pattern_group_nest3.decode', + 'err_pattern_group_overlap1.decode', + 'err_width1.decode', + 'err_width2.decode', + 'err_width3.decode', + 'err_width4.decode', +] + +succ_tests = [ + 'succ_argset_type1.decode', + 'succ_function.decode', + 'succ_ident1.decode', + 'succ_named_field.decode', + 'succ_pattern_group_nest1.decode', + 'succ_pattern_group_nest2.decode', + 'succ_pattern_group_nest3.decode', + 'succ_pattern_group_nest4.decode', +] + +suite = 'decodetree' +decodetree = find_program(meson.project_source_root() / 'scripts/decodetree.py') + +foreach t: err_tests + test(fs.replace_suffix(t, ''), + decodetree, args: ['-o', '/dev/null', '--test-for-error', files(t)], + suite: suite) +endforeach + +foreach t: succ_tests + test(fs.replace_suffix(t, ''), + decodetree, args: ['-o', '/dev/null', files(t)], + suite: suite) +endforeach diff --git a/tests/decode/succ_named_field.decode b/tests/decode/succ_named_field.decode new file mode 100644 index 0000000..e64b3f9 --- /dev/null +++ b/tests/decode/succ_named_field.decode @@ -0,0 +1,19 @@ +# This work is licensed under the terms of the GNU LGPL, version 2 or later. +# See the COPYING.LIB file in the top-level directory. + +# field using a named_field +%imm_sz 8:8 sz:3 +insn 00000000 00000000 ........ 00000000 imm_sz=%imm_sz sz=1 + +# Ditto, via a format. Here a field in the format +# references a named field defined in the insn pattern: +&imm_a imm alpha +%foo 0:16 alpha:4 +@foo 00000001 ........ ........ ........ &imm_a imm=%foo +i1 ........ 00000000 ........ ........ @foo alpha=1 +i2 ........ 00000001 ........ ........ @foo alpha=2 + +# Here the named field is defined in the format and referenced +# from the insn pattern: +@bar 00000010 ........ ........ ........ &imm_a alpha=4 +i3 ........ 00000000 ........ ........ @bar imm=%foo diff --git a/tests/meson.build b/tests/meson.build index 8e318ec..083f299 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -74,10 +74,7 @@ if have_tools and have_vhost_user and 'CONFIG_LINUX' in config_host dependencies: [qemuutil, vhost_user]) endif -test('decodetree', sh, - args: [ files('decode/check.sh'), config_host['PYTHON'], files('../scripts/decodetree.py') ], - workdir: meson.current_source_dir() / 'decode', - suite: 'decodetree') +subdir('decode') if 'CONFIG_TCG' in config_all subdir('fp') |