diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2024-12-25 08:33:33 -0500 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2024-12-25 08:33:33 -0500 |
commit | a7f77545d401266a6415e6e03c7738c95314f0e6 (patch) | |
tree | 20354b9913086ac3a535e06e4d28057f88710d14 | |
parent | aa3a285b5bc56a4208b3b57d4a55291e9c260107 (diff) | |
parent | e4a8e093dc74be049f4829831dce76e5edab0003 (diff) | |
download | qemu-a7f77545d401266a6415e6e03c7738c95314f0e6.zip qemu-a7f77545d401266a6415e6e03c7738c95314f0e6.tar.gz qemu-a7f77545d401266a6415e6e03c7738c95314f0e6.tar.bz2 |
Merge tag 'pull-tcg-20241224' of https://gitlab.com/rth7680/qemu into staging
tcg/optimize: Remove in-flight mask data from OptContext
fpu: Add float*_muladd_scalbn
fpu: Remove float_muladd_halve_result
fpu: Add float_round_nearest_even_max
fpu: Add float_muladd_suppress_add_product_zero
target/hexagon: Use float32_muladd
accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core
# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmdrE7QdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV+l2Qf/aECUfMn07wns7WjX
# ebWxzIRKp//ktsIJg9InL8zrCStyRqrBj0VQE9LUfO2Vhvqf8faUdh+uh2ek/Ewa
# f1hfo0kDK7e7oWnCicSbHmdC0FQIrKpg2i+YXIsbd4XWOkmFAhkNenISuQfCrL3k
# 3UYAA12seK9uCls+fljvhK6iid3h+4ReDFW7DPg7mumFCCz6CwzYYW/4cnhcAmOn
# qVehtts8W+6SFMjTE04S8NV8OBaMisf8AbCcZf2PedRl1cHGSumLOjvjOxcQU8Hw
# nGUjL8/hYWkEetzU4YzJyfHOe6F9lPJBMnDattwIswwYrTOD/Sq7VbBWFbW0EwUy
# 7XIZ8Q==
# =DZgo
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 24 Dec 2024 15:04:04 EST
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* tag 'pull-tcg-20241224' of https://gitlab.com/rth7680/qemu: (72 commits)
accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core
target/hexagon: Simplify internal_mpyhh setup
target/hexagon: Use mulu64 for int128_mul_6464
target/hexagon: Remove Double
target/hexagon: Remove Float
target/hexagon: Expand GEN_XF_ROUND
target/hexagon: Remove internal_fmafx
target/hexagon: Use float32_muladd for helper_sffm[as]_lib
target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc
target/hexagon: Use float32_muladd for helper_sffms
target/hexagon: Use float32_muladd for helper_sffma
target/hexagon: Use float32_mul in helper_sfmpy
softfloat: Add float_muladd_suppress_add_product_zero
softfloat: Add float_round_nearest_even_max
softfloat: Remove float_muladd_halve_result
target/sparc: Use float*_muladd_scalbn
target/arm: Use float*_muladd_scalbn
softfloat: Add float{16,32,64}_muladd_scalbn
tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort
tcg/optimize: Move fold_bitsel_vec into alphabetic sort
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
75 files changed, 848 insertions, 991 deletions
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index b507049..d48b82a 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -1088,11 +1088,13 @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp) if (!tcg_target_initialized) { /* Check mandatory TCGCPUOps handlers */ + const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops; #ifndef CONFIG_USER_ONLY - assert(cpu->cc->tcg_ops->cpu_exec_halt); - assert(cpu->cc->tcg_ops->cpu_exec_interrupt); + assert(tcg_ops->cpu_exec_halt); + assert(tcg_ops->cpu_exec_interrupt); #endif /* !CONFIG_USER_ONLY */ - cpu->cc->tcg_ops->initialize(); + assert(tcg_ops->translate_code); + tcg_ops->initialize(); tcg_target_initialized = true; } diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c index 1ef0755..7e5f040 100644 --- a/accel/tcg/plugin-gen.c +++ b/accel/tcg/plugin-gen.c @@ -102,6 +102,15 @@ static void gen_disable_mem_helper(void) static TCGv_i32 gen_cpu_index(void) { + /* + * Optimize when we run with a single vcpu. All values using cpu_index, + * including scoreboard index, will be optimized out. + * User-mode calls tb_flush when setting this flag. In system-mode, all + * vcpus are created before generating code. + */ + if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) { + return tcg_constant_i32(current_cpu->cpu_index); + } TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); tcg_gen_ld_i32(cpu_index, tcg_env, -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 572a8a8..453eb20 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -276,8 +276,10 @@ static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb, tcg_func_start(tcg_ctx); - tcg_ctx->cpu = env_cpu(env); - gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc); + CPUState *cs = env_cpu(env); + tcg_ctx->cpu = cs; + cs->cc->tcg_ops->translate_code(cs, tb, max_insns, pc, host_pc); + assert(tb->size != 0); tcg_ctx->cpu = NULL; *max_insns = tb->icount; @@ -364,7 +366,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, /* * Overflow of code_gen_buffer, or the current slice of it. * - * TODO: We don't need to re-do gen_intermediate_code, nor + * TODO: We don't need to re-do tcg_ops->translate_code, nor * should we re-do the tcg optimization currently hidden * inside tcg_gen_code. All that should be required is to * flush the TBs, allocate a new TB, re-initialize it per diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc index ba8de7b..ebde429 100644 --- a/fpu/softfloat-parts.c.inc +++ b/fpu/softfloat-parts.c.inc @@ -241,6 +241,9 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s, int exp, flags = 0; switch (s->float_rounding_mode) { + case float_round_nearest_even_max: + overflow_norm = true; + /* fall through */ case float_round_nearest_even: if (N > 64 && frac_lsb == 0) { inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1 @@ -562,8 +565,9 @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b, * Requires A and C extracted into a double-sized structure to provide the * extra space for the widening multiply. */ -static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b, - FloatPartsN *c, int flags, float_status *s) +static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b, + FloatPartsN *c, int scale, + int flags, float_status *s) { int ab_mask, abc_mask; FloatPartsW p_widen, c_widen; @@ -611,7 +615,9 @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b, goto return_normal; } if (c->cls == float_class_zero) { - if (a->sign != c->sign) { + if (flags & float_muladd_suppress_add_product_zero) { + a->sign = c->sign; + } else if (a->sign != c->sign) { goto return_sub_zero; } goto return_zero; @@ -652,9 +658,7 @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b, a->exp = p_widen.exp; return_normal: - if (flags & float_muladd_halve_result) { - a->exp -= 1; - } + a->exp += scale; finish_sign: if (flags & float_muladd_negate_result) { a->sign ^= 1; diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 8de8d5f..8d75d66 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -789,15 +789,15 @@ static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b, #define parts_mul(A, B, S) \ PARTS_GENERIC_64_128(mul, A)(A, B, S) -static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b, - FloatParts64 *c, int flags, - float_status *s); -static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b, - FloatParts128 *c, int flags, - float_status *s); +static FloatParts64 *parts64_muladd_scalbn(FloatParts64 *a, FloatParts64 *b, + FloatParts64 *c, int scale, + int flags, float_status *s); +static FloatParts128 *parts128_muladd_scalbn(FloatParts128 *a, FloatParts128 *b, + FloatParts128 *c, int scale, + int flags, float_status *s); -#define parts_muladd(A, B, C, Z, S) \ - PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S) +#define parts_muladd_scalbn(A, B, C, Z, Y, S) \ + PARTS_GENERIC_64_128(muladd_scalbn, A)(A, B, C, Z, Y, S) static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b, float_status *s); @@ -2212,43 +2212,50 @@ floatx80_mul(floatx80 a, floatx80 b, float_status *status) * Fused multiply-add */ -float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, - int flags, float_status *status) +float16 QEMU_FLATTEN +float16_muladd_scalbn(float16 a, float16 b, float16 c, + int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; float16_unpack_canonical(&pa, a, status); float16_unpack_canonical(&pb, b, status); float16_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); return float16_round_pack_canonical(pr, status); } -static float32 QEMU_SOFTFLOAT_ATTR -soft_f32_muladd(float32 a, float32 b, float32 c, int flags, - float_status *status) +float16 float16_muladd(float16 a, float16 b, float16 c, + int flags, float_status *status) +{ + return float16_muladd_scalbn(a, b, c, 0, flags, status); +} + +float32 QEMU_SOFTFLOAT_ATTR +float32_muladd_scalbn(float32 a, float32 b, float32 c, + int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; float32_unpack_canonical(&pa, a, status); float32_unpack_canonical(&pb, b, status); float32_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); return float32_round_pack_canonical(pr, status); } -static float64 QEMU_SOFTFLOAT_ATTR -soft_f64_muladd(float64 a, float64 b, float64 c, int flags, - float_status *status) +float64 QEMU_SOFTFLOAT_ATTR +float64_muladd_scalbn(float64 a, float64 b, float64 c, + int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; float64_unpack_canonical(&pa, a, status); float64_unpack_canonical(&pb, b, status); float64_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); return float64_round_pack_canonical(pr, status); } @@ -2267,7 +2274,7 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) if (unlikely(!can_use_fpu(s))) { goto soft; } - if (unlikely(flags & float_muladd_halve_result)) { + if (unlikely(flags & float_muladd_suppress_add_product_zero)) { goto soft; } @@ -2323,7 +2330,7 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) return ur.s; soft: - return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); + return float32_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s); } float64 QEMU_FLATTEN @@ -2338,9 +2345,6 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) if (unlikely(!can_use_fpu(s))) { goto soft; } - if (unlikely(flags & float_muladd_halve_result)) { - goto soft; - } float64_input_flush3(&ua.s, &ub.s, &uc.s, s); if (unlikely(!f64_is_zon3(ua, ub, uc))) { @@ -2394,7 +2398,7 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) return ur.s; soft: - return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); + return float64_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s); } float64 float64r32_muladd(float64 a, float64 b, float64 c, @@ -2405,7 +2409,7 @@ float64 float64r32_muladd(float64 a, float64 b, float64 c, float64_unpack_canonical(&pa, a, status); float64_unpack_canonical(&pb, b, status); float64_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); return float64r32_round_pack_canonical(pr, status); } @@ -2418,7 +2422,7 @@ bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c, bfloat16_unpack_canonical(&pa, a, status); bfloat16_unpack_canonical(&pb, b, status); bfloat16_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); return bfloat16_round_pack_canonical(pr, status); } @@ -2431,7 +2435,7 @@ float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c, float128_unpack_canonical(&pa, a, status); float128_unpack_canonical(&pb, b, status); float128_unpack_canonical(&pc, c, status); - pr = parts_muladd(&pa, &pb, &pc, flags, status); + pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); return float128_round_pack_canonical(pr, status); } @@ -5249,8 +5253,9 @@ float32 float32_exp2(float32 a, float_status *status) float64_unpack_canonical(&rp, float64_one, status); for (i = 0 ; i < 15 ; i++) { + float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status); - rp = *parts_muladd(&tp, &xnp, &rp, 0, status); + rp = *parts_muladd_scalbn(&tp, &xnp, &rp, 0, 0, status); xnp = *parts_mul(&xnp, &xp, status); } diff --git a/include/exec/translator.h b/include/exec/translator.h index 41e2a41..d70942a 100644 --- a/include/exec/translator.h +++ b/include/exec/translator.h @@ -22,20 +22,6 @@ #include "exec/vaddr.h" /** - * gen_intermediate_code - * @cpu: cpu context - * @tb: translation block - * @max_insns: max number of instructions to translate - * @pc: guest virtual program counter address - * @host_pc: host physical program counter address - * - * This function must be provided by the target, which should create - * the target-specific DisasContext, and then invoke translator_loop. - */ -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc); - -/** * DisasJumpType: * @DISAS_NEXT: Next instruction in program order. * @DISAS_TOO_MANY: Too many instructions translated. diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h index 79ca44d..9d37cdf 100644 --- a/include/fpu/softfloat-types.h +++ b/include/fpu/softfloat-types.h @@ -140,6 +140,8 @@ typedef enum __attribute__((__packed__)) { float_round_to_odd = 5, /* Not an IEEE rounding mode: round to closest odd, overflow to inf */ float_round_to_odd_inf = 6, + /* Not an IEEE rounding mode: round to nearest even, overflow to max */ + float_round_nearest_even_max = 7, } FloatRoundMode; /* diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index eb64075..09a40b4 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -120,14 +120,16 @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status); | Using these differs from negating an input or output before calling | the muladd function in that this means that a NaN doesn't have its | sign bit inverted before it is propagated. -| We also support halving the result before rounding, as a special -| case to support the ARM fused-sqrt-step instruction FRSQRTS. +| +| With float_muladd_suppress_add_product_zero, if A or B is zero +| such that the product is a true zero, then return C without addition. +| This preserves the sign of C when C is +/- 0. Used for Hexagon. *----------------------------------------------------------------------------*/ enum { float_muladd_negate_c = 1, float_muladd_negate_product = 2, float_muladd_negate_result = 4, - float_muladd_halve_result = 8, + float_muladd_suppress_add_product_zero = 8, }; /*---------------------------------------------------------------------------- @@ -238,6 +240,8 @@ float16 float16_add(float16, float16, float_status *status); float16 float16_sub(float16, float16, float_status *status); float16 float16_mul(float16, float16, float_status *status); float16 float16_muladd(float16, float16, float16, int, float_status *status); +float16 float16_muladd_scalbn(float16, float16, float16, + int, int, float_status *status); float16 float16_div(float16, float16, float_status *status); float16 float16_scalbn(float16, int, float_status *status); float16 float16_min(float16, float16, float_status *status); @@ -597,6 +601,8 @@ float32 float32_mul(float32, float32, float_status *status); float32 float32_div(float32, float32, float_status *status); float32 float32_rem(float32, float32, float_status *status); float32 float32_muladd(float32, float32, float32, int, float_status *status); +float32 float32_muladd_scalbn(float32, float32, float32, + int, int, float_status *status); float32 float32_sqrt(float32, float_status *status); float32 float32_exp2(float32, float_status *status); float32 float32_log2(float32, float_status *status); @@ -792,6 +798,8 @@ float64 float64_mul(float64, float64, float_status *status); float64 float64_div(float64, float64, float_status *status); float64 float64_rem(float64, float64, float_status *status); float64 float64_muladd(float64, float64, float64, int, float_status *status); +float64 float64_muladd_scalbn(float64, float64, float64, + int, int, float_status *status); float64 float64_sqrt(float64, float_status *status); float64 float64_log2(float64, float_status *status); FloatRelation float64_compare(float64, float64, float_status *status); diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h index 663efb9..2e3f169 100644 --- a/include/hw/core/tcg-cpu-ops.h +++ b/include/hw/core/tcg-cpu-ops.h @@ -25,6 +25,19 @@ struct TCGCPUOps { */ void (*initialize)(void); /** + * @translate_code: Translate guest instructions to TCGOps + * @cpu: cpu context + * @tb: translation block + * @max_insns: max number of instructions to translate + * @pc: guest virtual program counter address + * @host_pc: host physical program counter address + * + * This function must be provided by the target, which should create + * the target-specific DisasContext, and then invoke translator_loop. + */ + void (*translate_code)(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); + /** * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock * * This is called when we abandon execution of a TB before starting it, diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c index 9fa506b..e1b898e 100644 --- a/target/alpha/cpu.c +++ b/target/alpha/cpu.c @@ -224,6 +224,7 @@ static const struct SysemuCPUOps alpha_sysemu_ops = { static const TCGCPUOps alpha_tcg_ops = { .initialize = alpha_translate_init, + .translate_code = alpha_translate_code, .synchronize_from_tb = alpha_cpu_synchronize_from_tb, .restore_state_to_opc = alpha_restore_state_to_opc, diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h index 3556d32..80562ad 100644 --- a/target/alpha/cpu.h +++ b/target/alpha/cpu.h @@ -431,6 +431,8 @@ enum { }; void alpha_translate_init(void); +void alpha_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #define CPU_RESOLVING_TYPE TYPE_ALPHA_CPU diff --git a/target/alpha/translate.c b/target/alpha/translate.c index 629ff3c..2156c02 100644 --- a/target/alpha/translate.c +++ b/target/alpha/translate.c @@ -2955,8 +2955,8 @@ static const TranslatorOps alpha_tr_ops = { .tb_stop = alpha_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void alpha_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base); diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 019183c..dcedadc 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2682,6 +2682,7 @@ static const struct SysemuCPUOps arm_sysemu_ops = { #ifdef CONFIG_TCG static const TCGCPUOps arm_tcg_ops = { .initialize = arm_translate_init, + .translate_code = arm_translate_code, .synchronize_from_tb = arm_cpu_synchronize_from_tb, .debug_excp_handler = arm_debug_excp_handler, .restore_state_to_opc = arm_restore_state_to_opc, diff --git a/target/arm/internals.h b/target/arm/internals.h index c3a5b13..863a84e 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -357,6 +357,8 @@ void init_cpreg_list(ARMCPU *cpu); void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu); void arm_translate_init(void); +void arm_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void arm_cpu_register_gdb_commands(ARMCPU *cpu); void aarch64_cpu_register_gdb_commands(ARMCPU *cpu, GString *, diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c index 58e5457..03acdf8 100644 --- a/target/arm/tcg/cpu-v7m.c +++ b/target/arm/tcg/cpu-v7m.c @@ -234,6 +234,7 @@ static void cortex_m55_initfn(Object *obj) static const TCGCPUOps arm_v7m_tcg_ops = { .initialize = arm_translate_init, + .translate_code = arm_translate_code, .synchronize_from_tb = arm_cpu_synchronize_from_tb, .debug_excp_handler = arm_debug_excp_handler, .restore_state_to_opc = arm_restore_state_to_opc, diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c index 0e13050..3b226da 100644 --- a/target/arm/tcg/helper-a64.c +++ b/target/arm/tcg/helper-a64.c @@ -262,7 +262,7 @@ uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, float_status *fpst) (float16_is_infinity(b) && float16_is_zero(a))) { return float16_one_point_five; } - return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst); + return float16_muladd_scalbn(a, b, float16_three, -1, 0, fpst); } float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst) @@ -275,7 +275,7 @@ float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst) (float32_is_infinity(b) && float32_is_zero(a))) { return float32_one_point_five; } - return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst); + return float32_muladd_scalbn(a, b, float32_three, -1, 0, fpst); } float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst) @@ -288,7 +288,7 @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst) (float64_is_infinity(b) && float64_is_zero(a))) { return float64_one_point_five; } - return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); + return float64_muladd_scalbn(a, b, float64_three, -1, 0, fpst); } /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index 9ee761f..c16b59a 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -8093,9 +8093,8 @@ static const TranslatorOps thumb_translator_ops = { .tb_stop = arm_tr_tb_stop, }; -/* generate intermediate code for basic block 'tb'. */ -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void arm_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc = { }; const TranslatorOps *ops = &arm_translator_ops; diff --git a/target/avr/cpu.c b/target/avr/cpu.c index 2dccb09..8a126ff 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -207,6 +207,7 @@ static const struct SysemuCPUOps avr_sysemu_ops = { static const TCGCPUOps avr_tcg_ops = { .initialize = avr_cpu_tcg_init, + .translate_code = avr_cpu_translate_code, .synchronize_from_tb = avr_cpu_synchronize_from_tb, .restore_state_to_opc = avr_restore_state_to_opc, .cpu_exec_interrupt = avr_cpu_exec_interrupt, diff --git a/target/avr/cpu.h b/target/avr/cpu.h index 4725535..06f5ae4 100644 --- a/target/avr/cpu.h +++ b/target/avr/cpu.h @@ -183,6 +183,8 @@ static inline void set_avr_feature(CPUAVRState *env, int feature) } void avr_cpu_tcg_init(void); +void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); int cpu_avr_exec(CPUState *cpu); diff --git a/target/avr/translate.c b/target/avr/translate.c index f13b997..4ab71d8 100644 --- a/target/avr/translate.c +++ b/target/avr/translate.c @@ -2599,7 +2599,7 @@ static bool trans_WDR(DisasContext *ctx, arg_WDR *a) * * - translate() * - canonicalize_skip() - * - gen_intermediate_code() + * - translate_code() * - restore_state_to_opc() * */ @@ -2795,8 +2795,8 @@ static const TranslatorOps avr_tr_ops = { .tb_stop = avr_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc = { }; translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base); diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c index c9817c7..0b7fc98 100644 --- a/target/hexagon/cpu.c +++ b/target/hexagon/cpu.c @@ -325,6 +325,7 @@ static void hexagon_cpu_init(Object *obj) static const TCGCPUOps hexagon_tcg_ops = { .initialize = hexagon_translate_init, + .translate_code = hexagon_translate_code, .synchronize_from_tb = hexagon_cpu_synchronize_from_tb, .restore_state_to_opc = hexagon_restore_state_to_opc, }; diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h index 14e6e81..79e60d4 100644 --- a/target/hexagon/cpu.h +++ b/target/hexagon/cpu.h @@ -150,6 +150,8 @@ static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc, typedef HexagonCPU ArchCPU; void hexagon_translate_init(void); +void hexagon_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #include "exec/cpu-all.h" diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c index 05a56d8..c557141 100644 --- a/target/hexagon/fma_emu.c +++ b/target/hexagon/fma_emu.c @@ -43,112 +43,51 @@ #define WAY_BIG_EXP 4096 -typedef union { - double f; - uint64_t i; - struct { - uint64_t mant:52; - uint64_t exp:11; - uint64_t sign:1; - }; -} Double; - -typedef union { - float f; - uint32_t i; - struct { - uint32_t mant:23; - uint32_t exp:8; - uint32_t sign:1; - }; -} Float; - static uint64_t float64_getmant(float64 f64) { - Double a = { .i = f64 }; + uint64_t mant = extract64(f64, 0, 52); if (float64_is_normal(f64)) { - return a.mant | 1ULL << 52; + return mant | 1ULL << 52; } if (float64_is_zero(f64)) { return 0; } if (float64_is_denormal(f64)) { - return a.mant; + return mant; } return ~0ULL; } int32_t float64_getexp(float64 f64) { - Double a = { .i = f64 }; + int exp = extract64(f64, 52, 11); if (float64_is_normal(f64)) { - return a.exp; + return exp; } if (float64_is_denormal(f64)) { - return a.exp + 1; + return exp + 1; } return -1; } -static uint64_t float32_getmant(float32 f32) -{ - Float a = { .i = f32 }; - if (float32_is_normal(f32)) { - return a.mant | 1ULL << 23; - } - if (float32_is_zero(f32)) { - return 0; - } - if (float32_is_denormal(f32)) { - return a.mant; - } - return ~0ULL; -} - int32_t float32_getexp(float32 f32) { - Float a = { .i = f32 }; + int exp = float32_getexp_raw(f32); if (float32_is_normal(f32)) { - return a.exp; + return exp; } if (float32_is_denormal(f32)) { - return a.exp + 1; + return exp + 1; } return -1; } -static uint32_t int128_getw0(Int128 x) -{ - return int128_getlo(x); -} - -static uint32_t int128_getw1(Int128 x) -{ - return int128_getlo(x) >> 32; -} - static Int128 int128_mul_6464(uint64_t ai, uint64_t bi) { - Int128 a, b; - uint64_t pp0, pp1a, pp1b, pp1s, pp2; - - a = int128_make64(ai); - b = int128_make64(bi); - pp0 = (uint64_t)int128_getw0(a) * (uint64_t)int128_getw0(b); - pp1a = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw0(b); - pp1b = (uint64_t)int128_getw1(b) * (uint64_t)int128_getw0(a); - pp2 = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw1(b); - - pp1s = pp1a + pp1b; - if ((pp1s < pp1a) || (pp1s < pp1b)) { - pp2 += (1ULL << 32); - } - uint64_t ret_low = pp0 + (pp1s << 32); - if ((ret_low < pp0) || (ret_low < (pp1s << 32))) { - pp2 += 1; - } + uint64_t l, h; - return int128_make128(ret_low, pp2 + (pp1s >> 32)); + mulu64(&l, &h, ai, bi); + return int128_make128(l, h); } static Int128 int128_sub_borrow(Int128 a, Int128 b, int borrow) @@ -369,298 +308,129 @@ float32 infinite_float32(uint8_t sign) } /* Return a maximum finite value with the requested sign */ -static float32 maxfinite_float32(uint8_t sign) +static float64 accum_round_float64(Accum a, float_status *fp_status) { - if (sign) { - return make_float32(SF_MINUS_MAXF); - } else { - return make_float32(SF_MAXF); - } -} - -/* Return a zero value with requested sign */ -static float32 zero_float32(uint8_t sign) -{ - if (sign) { - return make_float32(0x80000000); - } else { - return float32_zero; + uint64_t ret; + + if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0) + && ((a.guard | a.round | a.sticky) == 0)) { + /* result zero */ + switch (fp_status->float_rounding_mode) { + case float_round_down: + return zero_float64(1); + default: + return zero_float64(0); + } } -} - -#define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \ -static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \ -{ \ - if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0) \ - && ((a.guard | a.round | a.sticky) == 0)) { \ - /* result zero */ \ - switch (fp_status->float_rounding_mode) { \ - case float_round_down: \ - return zero_##SUFFIX(1); \ - default: \ - return zero_##SUFFIX(0); \ - } \ - } \ - /* Normalize right */ \ - /* We want MANTBITS bits of mantissa plus the leading one. */ \ - /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \ - /* So we need to normalize right while the high word is non-zero and \ - * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \ - while ((int128_gethi(a.mant) != 0) || \ - ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0)) { \ - a = accum_norm_right(a, 1); \ - } \ - /* \ - * OK, now normalize left \ - * We want to normalize left until we have a leading one in bit 24 \ - * Theoretically, we only need to shift a maximum of one to the left if we \ - * shifted out lots of bits from B, or if we had no shift / 1 shift sticky \ - * should be 0 \ - */ \ - while ((int128_getlo(a.mant) & (1ULL << MANTBITS)) == 0) { \ - a = accum_norm_left(a); \ - } \ - /* \ - * OK, now we might need to denormalize because of potential underflow. \ - * We need to do this before rounding, and rounding might make us normal \ - * again \ - */ \ - while (a.exp <= 0) { \ - a = accum_norm_right(a, 1 - a.exp); \ - /* \ - * Do we have underflow? \ - * That's when we get an inexact answer because we ran out of bits \ - * in a denormal. \ - */ \ - if (a.guard || a.round || a.sticky) { \ - float_raise(float_flag_underflow, fp_status); \ - } \ - } \ - /* OK, we're relatively canonical... now we need to round */ \ - if (a.guard || a.round || a.sticky) { \ - float_raise(float_flag_inexact, fp_status); \ - switch (fp_status->float_rounding_mode) { \ - case float_round_to_zero: \ - /* Chop and we're done */ \ - break; \ - case float_round_up: \ - if (a.sign == 0) { \ - a.mant = int128_add(a.mant, int128_one()); \ - } \ - break; \ - case float_round_down: \ - if (a.sign != 0) { \ - a.mant = int128_add(a.mant, int128_one()); \ - } \ - break; \ - default: \ - if (a.round || a.sticky) { \ - /* round up if guard is 1, down if guard is zero */ \ - a.mant = int128_add(a.mant, int128_make64(a.guard)); \ - } else if (a.guard) { \ - /* exactly .5, round up if odd */ \ - a.mant = int128_add(a.mant, int128_and(a.mant, int128_one())); \ - } \ - break; \ - } \ - } \ - /* \ - * OK, now we might have carried all the way up. \ - * So we might need to shr once \ - * at least we know that the lsb should be zero if we rounded and \ - * got a carry out... \ - */ \ - if ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0) { \ - a = accum_norm_right(a, 1); \ - } \ - /* Overflow? */ \ - if (a.exp >= INF_EXP) { \ - /* Yep, inf result */ \ - float_raise(float_flag_overflow, fp_status); \ - float_raise(float_flag_inexact, fp_status); \ - switch (fp_status->float_rounding_mode) { \ - case float_round_to_zero: \ - return maxfinite_##SUFFIX(a.sign); \ - case float_round_up: \ - if (a.sign == 0) { \ - return infinite_##SUFFIX(a.sign); \ - } else { \ - return maxfinite_##SUFFIX(a.sign); \ - } \ - case float_round_down: \ - if (a.sign != 0) { \ - return infinite_##SUFFIX(a.sign); \ - } else { \ - return maxfinite_##SUFFIX(a.sign); \ - } \ - default: \ - return infinite_##SUFFIX(a.sign); \ - } \ - } \ - /* Underflow? */ \ - if (int128_getlo(a.mant) & (1ULL << MANTBITS)) { \ - /* Leading one means: No, we're normal. So, we should be done... */ \ - INTERNAL_TYPE ret; \ - ret.i = 0; \ - ret.sign = a.sign; \ - ret.exp = a.exp; \ - ret.mant = int128_getlo(a.mant); \ - return ret.i; \ - } \ - assert(a.exp == 1); \ - INTERNAL_TYPE ret; \ - ret.i = 0; \ - ret.sign = a.sign; \ - ret.exp = 0; \ - ret.mant = int128_getlo(a.mant); \ - return ret.i; \ -} - -GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double) -GEN_XF_ROUND(float32, SF_MANTBITS, SF_INF_EXP, Float) - -static bool is_inf_prod(float64 a, float64 b) -{ - return ((float64_is_infinity(a) && float64_is_infinity(b)) || - (float64_is_infinity(a) && is_finite(b) && (!float64_is_zero(b))) || - (float64_is_infinity(b) && is_finite(a) && (!float64_is_zero(a)))); -} - -static float64 special_fma(float64 a, float64 b, float64 c, - float_status *fp_status) -{ - float64 ret = make_float64(0); - /* - * If A multiplied by B is an exact infinity and C is also an infinity - * but with the opposite sign, FMA returns NaN and raises invalid. + * Normalize right + * We want DF_MANTBITS bits of mantissa plus the leading one. + * That means that we want DF_MANTBITS+1 bits, or 0x000000000000FF_FFFF + * So we need to normalize right while the high word is non-zero and + * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ - uint8_t a_sign = float64_is_neg(a); - uint8_t b_sign = float64_is_neg(b); - uint8_t c_sign = float64_is_neg(c); - if (is_inf_prod(a, b) && float64_is_infinity(c)) { - if ((a_sign ^ b_sign) != c_sign) { - ret = make_float64(DF_NAN); - float_raise(float_flag_invalid, fp_status); - return ret; - } - } - if ((float64_is_infinity(a) && float64_is_zero(b)) || - (float64_is_zero(a) && float64_is_infinity(b))) { - ret = make_float64(DF_NAN); - float_raise(float_flag_invalid, fp_status); - return ret; + while ((int128_gethi(a.mant) != 0) || + ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0)) { + a = accum_norm_right(a, 1); } /* - * If none of the above checks are true and C is a NaN, - * a NaN shall be returned - * If A or B are NaN, a NAN shall be returned. + * OK, now normalize left + * We want to normalize left until we have a leading one in bit 24 + * Theoretically, we only need to shift a maximum of one to the left if we + * shifted out lots of bits from B, or if we had no shift / 1 shift sticky + * should be 0 */ - if (float64_is_any_nan(a) || - float64_is_any_nan(b) || - float64_is_any_nan(c)) { - if (float64_is_any_nan(a) && (fGETBIT(51, a) == 0)) { - float_raise(float_flag_invalid, fp_status); - } - if (float64_is_any_nan(b) && (fGETBIT(51, b) == 0)) { - float_raise(float_flag_invalid, fp_status); - } - if (float64_is_any_nan(c) && (fGETBIT(51, c) == 0)) { - float_raise(float_flag_invalid, fp_status); - } - ret = make_float64(DF_NAN); - return ret; + while ((int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) == 0) { + a = accum_norm_left(a); } /* - * We have checked for adding opposite-signed infinities. - * Other infinities return infinity with the correct sign + * OK, now we might need to denormalize because of potential underflow. + * We need to do this before rounding, and rounding might make us normal + * again */ - if (float64_is_infinity(c)) { - ret = infinite_float64(c_sign); - return ret; + while (a.exp <= 0) { + a = accum_norm_right(a, 1 - a.exp); + /* + * Do we have underflow? + * That's when we get an inexact answer because we ran out of bits + * in a denormal. + */ + if (a.guard || a.round || a.sticky) { + float_raise(float_flag_underflow, fp_status); + } } - if (float64_is_infinity(a) || float64_is_infinity(b)) { - ret = infinite_float64(a_sign ^ b_sign); - return ret; + /* OK, we're relatively canonical... now we need to round */ + if (a.guard || a.round || a.sticky) { + float_raise(float_flag_inexact, fp_status); + switch (fp_status->float_rounding_mode) { + case float_round_to_zero: + /* Chop and we're done */ + break; + case float_round_up: + if (a.sign == 0) { + a.mant = int128_add(a.mant, int128_one()); + } + break; + case float_round_down: + if (a.sign != 0) { + a.mant = int128_add(a.mant, int128_one()); + } + break; + default: + if (a.round || a.sticky) { + /* round up if guard is 1, down if guard is zero */ + a.mant = int128_add(a.mant, int128_make64(a.guard)); + } else if (a.guard) { + /* exactly .5, round up if odd */ + a.mant = int128_add(a.mant, int128_and(a.mant, int128_one())); + } + break; + } } - g_assert_not_reached(); -} - -static float32 special_fmaf(float32 a, float32 b, float32 c, - float_status *fp_status) -{ - float64 aa, bb, cc; - aa = float32_to_float64(a, fp_status); - bb = float32_to_float64(b, fp_status); - cc = float32_to_float64(c, fp_status); - return float64_to_float32(special_fma(aa, bb, cc, fp_status), fp_status); -} - -float32 internal_fmafx(float32 a, float32 b, float32 c, int scale, - float_status *fp_status) -{ - Accum prod; - Accum acc; - Accum result; - accum_init(&prod); - accum_init(&acc); - accum_init(&result); - - uint8_t a_sign = float32_is_neg(a); - uint8_t b_sign = float32_is_neg(b); - uint8_t c_sign = float32_is_neg(c); - if (float32_is_infinity(a) || - float32_is_infinity(b) || - float32_is_infinity(c)) { - return special_fmaf(a, b, c, fp_status); - } - if (float32_is_any_nan(a) || - float32_is_any_nan(b) || - float32_is_any_nan(c)) { - return special_fmaf(a, b, c, fp_status); - } - if ((scale == 0) && (float32_is_zero(a) || float32_is_zero(b))) { - float32 tmp = float32_mul(a, b, fp_status); - tmp = float32_add(tmp, c, fp_status); - return tmp; - } - - /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */ - prod.mant = int128_mul_6464(float32_getmant(a), float32_getmant(b)); - /* - * Note: extracting the mantissa into an int is multiplying by - * 2**23, so adjust here + * OK, now we might have carried all the way up. + * So we might need to shr once + * at least we know that the lsb should be zero if we rounded and + * got a carry out... */ - prod.exp = float32_getexp(a) + float32_getexp(b) - SF_BIAS - 23; - prod.sign = a_sign ^ b_sign; - if (float32_is_zero(a) || float32_is_zero(b)) { - prod.exp = -2 * WAY_BIG_EXP; - } - if ((scale > 0) && float32_is_denormal(c)) { - acc.mant = int128_mul_6464(0, 0); - acc.exp = -WAY_BIG_EXP; - acc.sign = c_sign; - acc.sticky = 1; - result = accum_add(prod, acc); - } else if (!float32_is_zero(c)) { - acc.mant = int128_mul_6464(float32_getmant(c), 1); - acc.exp = float32_getexp(c); - acc.sign = c_sign; - result = accum_add(prod, acc); - } else { - result = prod; + if ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0) { + a = accum_norm_right(a, 1); + } + /* Overflow? */ + if (a.exp >= DF_INF_EXP) { + /* Yep, inf result */ + float_raise(float_flag_overflow, fp_status); + float_raise(float_flag_inexact, fp_status); + switch (fp_status->float_rounding_mode) { + case float_round_to_zero: + return maxfinite_float64(a.sign); + case float_round_up: + if (a.sign == 0) { + return infinite_float64(a.sign); + } else { + return maxfinite_float64(a.sign); + } + case float_round_down: + if (a.sign != 0) { + return infinite_float64(a.sign); + } else { + return maxfinite_float64(a.sign); + } + default: + return infinite_float64(a.sign); + } } - result.exp += scale; - return accum_round_float32(result, fp_status); -} - -float32 internal_mpyf(float32 a, float32 b, float_status *fp_status) -{ - if (float32_is_zero(a) || float32_is_zero(b)) { - return float32_mul(a, b, fp_status); + /* Underflow? */ + ret = int128_getlo(a.mant); + if (ret & (1ULL << DF_MANTBITS)) { + /* Leading one means: No, we're normal. So, we should be done... */ + ret = deposit64(ret, 52, 11, a.exp); + } else { + assert(a.exp == 1); + ret = deposit64(ret, 52, 11, 0); } - return internal_fmafx(a, b, float32_zero, 0, fp_status); + ret = deposit64(ret, 63, 1, a.sign); + return ret; } float64 internal_mpyhh(float64 a, float64 b, @@ -685,7 +455,7 @@ float64 internal_mpyhh(float64 a, float64 b, float64_is_infinity(b)) { return float64_mul(a, b, fp_status); } - x.mant = int128_mul_6464(accumulated, 1); + x.mant = int128_make64(accumulated); x.sticky = sticky; prod = fGETUWORD(1, float64_getmant(a)) * fGETUWORD(1, float64_getmant(b)); x.mant = int128_add(x.mant, int128_mul_6464(prod, 0x100000000ULL)); diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h index 91591d6..fed054b 100644 --- a/target/hexagon/fma_emu.h +++ b/target/hexagon/fma_emu.h @@ -30,9 +30,6 @@ static inline uint32_t float32_getexp_raw(float32 f32) } int32_t float32_getexp(float32 f32); float32 infinite_float32(uint8_t sign); -float32 internal_fmafx(float32 a, float32 b, float32 c, - int scale, float_status *fp_status); -float32 internal_mpyf(float32 a, float32 b, float_status *fp_status); float64 internal_mpyhh(float64 a, float64 b, unsigned long long int accumulated, float_status *fp_status); diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c index 01d1a1b..6da8db8 100644 --- a/target/hexagon/op_helper.c +++ b/target/hexagon/op_helper.c @@ -1045,7 +1045,7 @@ float32 HELPER(sfmpy)(CPUHexagonState *env, float32 RsV, float32 RtV) { float32 RdV; arch_fpop_start(env); - RdV = internal_mpyf(RsV, RtV, &env->fp_status); + RdV = float32_mul(RsV, RtV, &env->fp_status); arch_fpop_end(env); return RdV; } @@ -1054,41 +1054,18 @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV, float32 RsV, float32 RtV) { arch_fpop_start(env); - RxV = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status); + RxV = float32_muladd(RsV, RtV, RxV, 0, &env->fp_status); arch_fpop_end(env); return RxV; } -static bool is_zero_prod(float32 a, float32 b) -{ - return ((float32_is_zero(a) && is_finite(b)) || - (float32_is_zero(b) && is_finite(a))); -} - -static float32 check_nan(float32 dst, float32 x, float_status *fp_status) -{ - float32 ret = dst; - if (float32_is_any_nan(x)) { - if (extract32(x, 22, 1) == 0) { - float_raise(float_flag_invalid, fp_status); - } - ret = make_float32(0xffffffff); /* nan */ - } - return ret; -} - float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV, float32 RsV, float32 RtV, float32 PuV) { - size4s_t tmp; arch_fpop_start(env); - RxV = check_nan(RxV, RxV, &env->fp_status); - RxV = check_nan(RxV, RsV, &env->fp_status); - RxV = check_nan(RxV, RtV, &env->fp_status); - tmp = internal_fmafx(RsV, RtV, RxV, fSXTN(8, 64, PuV), &env->fp_status); - if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) { - RxV = tmp; - } + RxV = float32_muladd_scalbn(RsV, RtV, RxV, fSXTN(8, 64, PuV), + float_muladd_suppress_add_product_zero, + &env->fp_status); arch_fpop_end(env); return RxV; } @@ -1096,86 +1073,50 @@ float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV, float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV, float32 RsV, float32 RtV) { - float32 neg_RsV; arch_fpop_start(env); - neg_RsV = float32_set_sign(RsV, float32_is_neg(RsV) ? 0 : 1); - RxV = internal_fmafx(neg_RsV, RtV, RxV, 0, &env->fp_status); + RxV = float32_muladd(RsV, RtV, RxV, float_muladd_negate_product, + &env->fp_status); arch_fpop_end(env); return RxV; } -static bool is_inf_prod(int32_t a, int32_t b) +static float32 do_sffma_lib(CPUHexagonState *env, float32 RxV, + float32 RsV, float32 RtV, int negate) { - return (float32_is_infinity(a) && float32_is_infinity(b)) || - (float32_is_infinity(a) && is_finite(b) && !float32_is_zero(b)) || - (float32_is_infinity(b) && is_finite(a) && !float32_is_zero(a)); -} - -float32 HELPER(sffma_lib)(CPUHexagonState *env, float32 RxV, - float32 RsV, float32 RtV) -{ - bool infinp; - bool infminusinf; - float32 tmp; + int flags; arch_fpop_start(env); - set_float_rounding_mode(float_round_nearest_even, &env->fp_status); - infminusinf = float32_is_infinity(RxV) && - is_inf_prod(RsV, RtV) && - (fGETBIT(31, RsV ^ RxV ^ RtV) != 0); - infinp = float32_is_infinity(RxV) || - float32_is_infinity(RtV) || - float32_is_infinity(RsV); - RxV = check_nan(RxV, RxV, &env->fp_status); - RxV = check_nan(RxV, RsV, &env->fp_status); - RxV = check_nan(RxV, RtV, &env->fp_status); - tmp = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status); - if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) { - RxV = tmp; - } - set_float_exception_flags(0, &env->fp_status); - if (float32_is_infinity(RxV) && !infinp) { - RxV = RxV - 1; - } - if (infminusinf) { - RxV = 0; + + set_float_rounding_mode(float_round_nearest_even_max, &env->fp_status); + RxV = float32_muladd(RsV, RtV, RxV, + negate | float_muladd_suppress_add_product_zero, + &env->fp_status); + + flags = get_float_exception_flags(&env->fp_status); + if (flags) { + /* Flags are suppressed by this instruction. */ + set_float_exception_flags(0, &env->fp_status); + + /* Return 0 for Inf - Inf. */ + if (flags & float_flag_invalid_isi) { + RxV = 0; + } } + arch_fpop_end(env); return RxV; } -float32 HELPER(sffms_lib)(CPUHexagonState *env, float32 RxV, +float32 HELPER(sffma_lib)(CPUHexagonState *env, float32 RxV, float32 RsV, float32 RtV) { - bool infinp; - bool infminusinf; - float32 tmp; + return do_sffma_lib(env, RxV, RsV, RtV, 0); +} - arch_fpop_start(env); - set_float_rounding_mode(float_round_nearest_even, &env->fp_status); - infminusinf = float32_is_infinity(RxV) && - is_inf_prod(RsV, RtV) && - (fGETBIT(31, RsV ^ RxV ^ RtV) == 0); - infinp = float32_is_infinity(RxV) || - float32_is_infinity(RtV) || - float32_is_infinity(RsV); - RxV = check_nan(RxV, RxV, &env->fp_status); - RxV = check_nan(RxV, RsV, &env->fp_status); - RxV = check_nan(RxV, RtV, &env->fp_status); - float32 minus_RsV = float32_sub(float32_zero, RsV, &env->fp_status); - tmp = internal_fmafx(minus_RsV, RtV, RxV, 0, &env->fp_status); - if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) { - RxV = tmp; - } - set_float_exception_flags(0, &env->fp_status); - if (float32_is_infinity(RxV) && !infinp) { - RxV = RxV - 1; - } - if (infminusinf) { - RxV = 0; - } - arch_fpop_end(env); - return RxV; +float32 HELPER(sffms_lib)(CPUHexagonState *env, float32 RxV, + float32 RsV, float32 RtV) +{ + return do_sffma_lib(env, RxV, RsV, RtV, float_muladd_negate_product); } float64 HELPER(dfmpyfix)(CPUHexagonState *env, float64 RssV, float64 RttV) diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c index 5621057..fe78587 100644 --- a/target/hexagon/translate.c +++ b/target/hexagon/translate.c @@ -1026,8 +1026,8 @@ static const TranslatorOps hexagon_tr_ops = { .tb_stop = hexagon_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void hexagon_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index c9062e6..47d0160 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -223,6 +223,7 @@ static const struct SysemuCPUOps hppa_sysemu_ops = { static const TCGCPUOps hppa_tcg_ops = { .initialize = hppa_translate_init, + .translate_code = hppa_translate_code, .synchronize_from_tb = hppa_cpu_synchronize_from_tb, .restore_state_to_opc = hppa_restore_state_to_opc, diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h index e45ba50..22a6510 100644 --- a/target/hppa/cpu.h +++ b/target/hppa/cpu.h @@ -303,6 +303,8 @@ static inline int HPPA_BTLB_ENTRIES(CPUHPPAState *env) } void hppa_translate_init(void); +void hppa_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #define CPU_RESOLVING_TYPE TYPE_HPPA_CPU diff --git a/target/hppa/translate.c b/target/hppa/translate.c index d13f80f..dc04f9f 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -4869,8 +4869,8 @@ static const TranslatorOps hppa_tr_ops = { #endif }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void hppa_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx = { }; translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base); diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h index 696d6ef..54d8453 100644 --- a/target/i386/tcg/helper-tcg.h +++ b/target/i386/tcg/helper-tcg.h @@ -59,6 +59,8 @@ static inline target_long lshift(target_long x, int n) /* translate.c */ void tcg_x86_init(void); +void x86_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); /* excp_helper.c */ G_NORETURN void raise_exception(CPUX86State *env, int exception_index); diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c index 231ecac..14ee038 100644 --- a/target/i386/tcg/tcg-cpu.c +++ b/target/i386/tcg/tcg-cpu.c @@ -109,6 +109,7 @@ static bool x86_debug_check_breakpoint(CPUState *cs) static const TCGCPUOps x86_tcg_ops = { .initialize = tcg_x86_init, + .translate_code = x86_translate_code, .synchronize_from_tb = x86_cpu_synchronize_from_tb, .restore_state_to_opc = x86_restore_state_to_opc, .cpu_exec_enter = x86_cpu_exec_enter, diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 903553d..834aea1 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3814,9 +3814,8 @@ static const TranslatorOps i386_tr_ops = { .tb_stop = i386_tr_tb_stop, }; -/* generate intermediate code for basic block 'tb'. */ -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void x86_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index f5bc872..58415ff 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -795,6 +795,7 @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) static const TCGCPUOps loongarch_tcg_ops = { .initialize = loongarch_translate_init, + .translate_code = loongarch_translate_code, .synchronize_from_tb = loongarch_cpu_synchronize_from_tb, .restore_state_to_opc = loongarch_restore_state_to_opc, diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index 0655ac9..ad9cf4f 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -17,6 +17,8 @@ #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS) void loongarch_translate_init(void); +void loongarch_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void G_NORETURN do_raise_exception(CPULoongArchState *env, uint32_t exception, diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c index 1fca4af..68be999 100644 --- a/target/loongarch/tcg/translate.c +++ b/target/loongarch/tcg/translate.c @@ -333,8 +333,8 @@ static const TranslatorOps loongarch_tr_ops = { .tb_stop = loongarch_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void loongarch_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 9de8ce6..41dfdf5 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -551,6 +551,7 @@ static const struct SysemuCPUOps m68k_sysemu_ops = { static const TCGCPUOps m68k_tcg_ops = { .initialize = m68k_tcg_init, + .translate_code = m68k_translate_code, .restore_state_to_opc = m68k_restore_state_to_opc, #ifndef CONFIG_USER_ONLY diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h index b5bbeed..ddb0f29 100644 --- a/target/m68k/cpu.h +++ b/target/m68k/cpu.h @@ -193,6 +193,8 @@ int m68k_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int m68k_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); void m68k_tcg_init(void); +void m68k_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void m68k_cpu_init_gdb(M68kCPU *cpu); uint32_t cpu_m68k_get_ccr(CPUM68KState *env); void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t); diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 077151c..dec2967 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -6118,8 +6118,8 @@ static const TranslatorOps m68k_tr_ops = { .tb_stop = m68k_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void m68k_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; translator_loop(cpu, tb, max_insns, pc, host_pc, &m68k_tr_ops, &dc.base); diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c index eba8193..f114789 100644 --- a/target/microblaze/cpu.c +++ b/target/microblaze/cpu.c @@ -423,6 +423,7 @@ static const struct SysemuCPUOps mb_sysemu_ops = { static const TCGCPUOps mb_tcg_ops = { .initialize = mb_tcg_init, + .translate_code = mb_translate_code, .synchronize_from_tb = mb_cpu_synchronize_from_tb, .restore_state_to_opc = mb_restore_state_to_opc, diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h index 3e5a3e5..f6879ee 100644 --- a/target/microblaze/cpu.h +++ b/target/microblaze/cpu.h @@ -398,6 +398,8 @@ static inline void mb_cpu_write_msr(CPUMBState *env, uint32_t val) } void mb_tcg_init(void); +void mb_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #define CPU_RESOLVING_TYPE TYPE_MICROBLAZE_CPU diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c index d53995c..24005f0 100644 --- a/target/microblaze/translate.c +++ b/target/microblaze/translate.c @@ -1779,8 +1779,8 @@ static const TranslatorOps mb_tr_ops = { .tb_stop = mb_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void mb_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; translator_loop(cpu, tb, max_insns, pc, host_pc, &mb_tr_ops, &dc.base); diff --git a/target/mips/cpu.c b/target/mips/cpu.c index 1b0cf6d..e3af02a 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -547,6 +547,7 @@ static const Property mips_cpu_properties[] = { #include "hw/core/tcg-cpu-ops.h" static const TCGCPUOps mips_tcg_ops = { .initialize = mips_tcg_init, + .translate_code = mips_translate_code, .synchronize_from_tb = mips_cpu_synchronize_from_tb, .restore_state_to_opc = mips_restore_state_to_opc, diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h index aef032c..74fc130 100644 --- a/target/mips/tcg/tcg-internal.h +++ b/target/mips/tcg/tcg-internal.h @@ -16,6 +16,8 @@ #include "cpu.h" void mips_tcg_init(void); +void mips_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb); G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr, diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c index bd1ef4e..78b848a 100644 --- a/target/mips/tcg/translate.c +++ b/target/mips/tcg/translate.c @@ -15231,8 +15231,8 @@ static const TranslatorOps mips_tr_ops = { .tb_stop = mips_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void mips_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c index 7913a0c..b7bab0d 100644 --- a/target/openrisc/cpu.c +++ b/target/openrisc/cpu.c @@ -236,6 +236,7 @@ static const struct SysemuCPUOps openrisc_sysemu_ops = { static const TCGCPUOps openrisc_tcg_ops = { .initialize = openrisc_translate_init, + .translate_code = openrisc_translate_code, .synchronize_from_tb = openrisc_cpu_synchronize_from_tb, .restore_state_to_opc = openrisc_restore_state_to_opc, diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h index c9fe9ae..b97d2ff 100644 --- a/target/openrisc/cpu.h +++ b/target/openrisc/cpu.h @@ -301,6 +301,8 @@ void openrisc_cpu_dump_state(CPUState *cpu, FILE *f, int flags); int openrisc_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int openrisc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); void openrisc_translate_init(void); +void openrisc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); int print_insn_or1k(bfd_vma addr, disassemble_info *info); #ifndef CONFIG_USER_ONLY diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c index 028ba66..7a6af18 100644 --- a/target/openrisc/translate.c +++ b/target/openrisc/translate.c @@ -1646,8 +1646,8 @@ static const TranslatorOps openrisc_tr_ops = { .tb_stop = openrisc_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void openrisc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 2ffac2e..0b8b4c0 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1581,6 +1581,8 @@ extern const VMStateDescription vmstate_ppc_cpu; /*****************************************************************************/ void ppc_translate_init(void); +void ppc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #if !defined(CONFIG_USER_ONLY) void ppc_store_sdr1(CPUPPCState *env, target_ulong value); diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index 0fcef63..c05c2dc 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -7431,6 +7431,7 @@ static const struct SysemuCPUOps ppc_sysemu_ops = { static const TCGCPUOps ppc_tcg_ops = { .initialize = ppc_translate_init, + .translate_code = ppc_translate_code, .restore_state_to_opc = ppc_restore_state_to_opc, #ifdef CONFIG_USER_ONLY diff --git a/target/ppc/translate.c b/target/ppc/translate.c index 8ab87f4..80638ab 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -6669,8 +6669,8 @@ static const TranslatorOps ppc_tr_ops = { .tb_stop = ppc_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void ppc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h index 284b112..252fdb8 100644 --- a/target/riscv/cpu.h +++ b/target/riscv/cpu.h @@ -602,6 +602,9 @@ RISCVException smstateen_acc_ok(CPURISCVState *env, int index, uint64_t bit); void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv, bool virt_en); void riscv_translate_init(void); +void riscv_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); + G_NORETURN void riscv_raise_exception(CPURISCVState *env, uint32_t exception, uintptr_t pc); diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c index f012981..8b89c99 100644 --- a/target/riscv/tcg/tcg-cpu.c +++ b/target/riscv/tcg/tcg-cpu.c @@ -135,6 +135,7 @@ static void riscv_restore_state_to_opc(CPUState *cs, static const TCGCPUOps riscv_tcg_ops = { .initialize = riscv_translate_init, + .translate_code = riscv_translate_code, .synchronize_from_tb = riscv_cpu_synchronize_from_tb, .restore_state_to_opc = riscv_restore_state_to_opc, diff --git a/target/riscv/translate.c b/target/riscv/translate.c index a76f67c..a992d4f 100644 --- a/target/riscv/translate.c +++ b/target/riscv/translate.c @@ -1346,8 +1346,8 @@ static const TranslatorOps riscv_tr_ops = { .tb_stop = riscv_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void riscv_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/rx/cpu.c b/target/rx/cpu.c index 558280c..8c50c7a 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -196,6 +196,7 @@ static const struct SysemuCPUOps rx_sysemu_ops = { static const TCGCPUOps rx_tcg_ops = { .initialize = rx_translate_init, + .translate_code = rx_translate_code, .synchronize_from_tb = rx_cpu_synchronize_from_tb, .restore_state_to_opc = rx_restore_state_to_opc, .tlb_fill = rx_cpu_tlb_fill, diff --git a/target/rx/cpu.h b/target/rx/cpu.h index c53593d..5ba1874 100644 --- a/target/rx/cpu.h +++ b/target/rx/cpu.h @@ -139,6 +139,8 @@ int rx_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int rx_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); void rx_translate_init(void); +void rx_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte); #include "exec/cpu-all.h" diff --git a/target/rx/translate.c b/target/rx/translate.c index 4f43654..bbda703 100644 --- a/target/rx/translate.c +++ b/target/rx/translate.c @@ -2258,8 +2258,8 @@ static const TranslatorOps rx_tr_ops = { .tb_stop = rx_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void rx_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 0a6847b..97d41c2 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -362,6 +362,7 @@ void cpu_get_tb_cpu_state(CPUS390XState *env, vaddr *pc, static const TCGCPUOps s390_tcg_ops = { .initialize = s390x_translate_init, + .translate_code = s390x_translate_code, .restore_state_to_opc = s390x_restore_state_to_opc, #ifdef CONFIG_USER_ONLY diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h index 4cc4350..a750e7a 100644 --- a/target/s390x/s390x-internal.h +++ b/target/s390x/s390x-internal.h @@ -399,6 +399,8 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, /* translate.c */ void s390x_translate_init(void); +void s390x_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void s390x_restore_state_to_opc(CPUState *cs, const TranslationBlock *tb, const uint64_t *data); diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c index 81554f2..00073c5 100644 --- a/target/s390x/tcg/translate.c +++ b/target/s390x/tcg/translate.c @@ -6481,8 +6481,8 @@ static const TranslatorOps s390x_tr_ops = { .disas_log = s390x_tr_disas_log, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void s390x_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc; diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index e9d3e12..24a2272 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -251,6 +251,7 @@ static const struct SysemuCPUOps sh4_sysemu_ops = { static const TCGCPUOps superh_tcg_ops = { .initialize = sh4_translate_init, + .translate_code = sh4_translate_code, .synchronize_from_tb = superh_cpu_synchronize_from_tb, .restore_state_to_opc = superh_restore_state_to_opc, diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h index d928bcf..d536d5d 100644 --- a/target/sh4/cpu.h +++ b/target/sh4/cpu.h @@ -248,6 +248,8 @@ G_NORETURN void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr, uintptr_t retaddr); void sh4_translate_init(void); +void sh4_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); #if !defined(CONFIG_USER_ONLY) hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); diff --git a/target/sh4/translate.c b/target/sh4/translate.c index f076da9..bcdd558 100644 --- a/target/sh4/translate.c +++ b/target/sh4/translate.c @@ -2318,8 +2318,8 @@ static const TranslatorOps sh4_tr_ops = { .tb_stop = sh4_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void sh4_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c index 373a335..fbd38ec 100644 --- a/target/sparc/cpu.c +++ b/target/sparc/cpu.c @@ -996,6 +996,7 @@ static const struct SysemuCPUOps sparc_sysemu_ops = { static const TCGCPUOps sparc_tcg_ops = { .initialize = sparc_tcg_init, + .translate_code = sparc_translate_code, .synchronize_from_tb = sparc_cpu_synchronize_from_tb, .restore_state_to_opc = sparc_restore_state_to_opc, diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h index 5c98123..dda8115 100644 --- a/target/sparc/cpu.h +++ b/target/sparc/cpu.h @@ -609,6 +609,8 @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr, /* translate.c */ void sparc_tcg_init(void); +void sparc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); /* fop_helper.c */ target_ulong cpu_get_fsr(CPUSPARCState *); diff --git a/target/sparc/fop_helper.c b/target/sparc/fop_helper.c index 236d27b..c25097d 100644 --- a/target/sparc/fop_helper.c +++ b/target/sparc/fop_helper.c @@ -344,17 +344,17 @@ Int128 helper_fsqrtq(CPUSPARCState *env, Int128 src) } float32 helper_fmadds(CPUSPARCState *env, float32 s1, - float32 s2, float32 s3, uint32_t op) + float32 s2, float32 s3, int32_t sc, uint32_t op) { - float32 ret = float32_muladd(s1, s2, s3, op, &env->fp_status); + float32 ret = float32_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status); check_ieee_exceptions(env, GETPC()); return ret; } float64 helper_fmaddd(CPUSPARCState *env, float64 s1, - float64 s2, float64 s3, uint32_t op) + float64 s2, float64 s3, int32_t sc, uint32_t op) { - float64 ret = float64_muladd(s1, s2, s3, op, &env->fp_status); + float64 ret = float64_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status); check_ieee_exceptions(env, GETPC()); return ret; } diff --git a/target/sparc/helper.h b/target/sparc/helper.h index 1ae3f0c..3a7f7dc 100644 --- a/target/sparc/helper.h +++ b/target/sparc/helper.h @@ -59,7 +59,7 @@ DEF_HELPER_FLAGS_3(faddd, TCG_CALL_NO_WG, f64, env, f64, f64) DEF_HELPER_FLAGS_3(fsubd, TCG_CALL_NO_WG, f64, env, f64, f64) DEF_HELPER_FLAGS_3(fmuld, TCG_CALL_NO_WG, f64, env, f64, f64) DEF_HELPER_FLAGS_3(fdivd, TCG_CALL_NO_WG, f64, env, f64, f64) -DEF_HELPER_FLAGS_5(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, i32) +DEF_HELPER_FLAGS_6(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, s32, i32) DEF_HELPER_FLAGS_3(fnaddd, TCG_CALL_NO_WG, f64, env, f64, f64) DEF_HELPER_FLAGS_3(fnmuld, TCG_CALL_NO_WG, f64, env, f64, f64) @@ -72,7 +72,7 @@ DEF_HELPER_FLAGS_3(fadds, TCG_CALL_NO_WG, f32, env, f32, f32) DEF_HELPER_FLAGS_3(fsubs, TCG_CALL_NO_WG, f32, env, f32, f32) DEF_HELPER_FLAGS_3(fmuls, TCG_CALL_NO_WG, f32, env, f32, f32) DEF_HELPER_FLAGS_3(fdivs, TCG_CALL_NO_WG, f32, env, f32, f32) -DEF_HELPER_FLAGS_5(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, i32) +DEF_HELPER_FLAGS_6(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, s32, i32) DEF_HELPER_FLAGS_3(fnadds, TCG_CALL_NO_WG, f32, env, f32, f32) DEF_HELPER_FLAGS_3(fnmuls, TCG_CALL_NO_WG, f32, env, f32, f32) diff --git a/target/sparc/translate.c b/target/sparc/translate.c index 9be26c8..7e5c735 100644 --- a/target/sparc/translate.c +++ b/target/sparc/translate.c @@ -1359,93 +1359,109 @@ static void gen_op_fabsq(TCGv_i128 dst, TCGv_i128 src) static void gen_op_fmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3) { - gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(0)); + TCGv_i32 z = tcg_constant_i32(0); + gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, z); } static void gen_op_fmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3) { - gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(0)); + TCGv_i32 z = tcg_constant_i32(0); + gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, z); } static void gen_op_fmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3) { - int op = float_muladd_negate_c; - gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c); + gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op); } static void gen_op_fmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3) { - int op = float_muladd_negate_c; - gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c); + gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op); } static void gen_op_fnmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3) { - int op = float_muladd_negate_c | float_muladd_negate_result; - gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c | + float_muladd_negate_result); + gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op); } static void gen_op_fnmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3) { - int op = float_muladd_negate_c | float_muladd_negate_result; - gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c | + float_muladd_negate_result); + gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op); } static void gen_op_fnmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3) { - int op = float_muladd_negate_result; - gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result); + gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op); } static void gen_op_fnmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3) { - int op = float_muladd_negate_result; - gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op)); + TCGv_i32 z = tcg_constant_i32(0); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result); + gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op); } /* Use muladd to compute (1 * src1) + src2 / 2 with one rounding. */ static void gen_op_fhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2) { - TCGv_i32 one = tcg_constant_i32(float32_one); - int op = float_muladd_halve_result; - gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i32 fone = tcg_constant_i32(float32_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(0); + gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op); } static void gen_op_fhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2) { - TCGv_i64 one = tcg_constant_i64(float64_one); - int op = float_muladd_halve_result; - gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i64 fone = tcg_constant_i64(float64_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(0); + gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op); } /* Use muladd to compute (1 * src1) - src2 / 2 with one rounding. */ static void gen_op_fhsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2) { - TCGv_i32 one = tcg_constant_i32(float32_one); - int op = float_muladd_negate_c | float_muladd_halve_result; - gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i32 fone = tcg_constant_i32(float32_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c); + gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op); } static void gen_op_fhsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2) { - TCGv_i64 one = tcg_constant_i64(float64_one); - int op = float_muladd_negate_c | float_muladd_halve_result; - gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i64 fone = tcg_constant_i64(float64_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c); + gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op); } /* Use muladd to compute -((1 * src1) + src2 / 2) with one rounding. */ static void gen_op_fnhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2) { - TCGv_i32 one = tcg_constant_i32(float32_one); - int op = float_muladd_negate_result | float_muladd_halve_result; - gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i32 fone = tcg_constant_i32(float32_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result); + gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op); } static void gen_op_fnhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2) { - TCGv_i64 one = tcg_constant_i64(float64_one); - int op = float_muladd_negate_result | float_muladd_halve_result; - gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op)); + TCGv_i64 fone = tcg_constant_i64(float64_one); + TCGv_i32 mone = tcg_constant_i32(-1); + TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result); + gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op); } static void gen_op_fpexception_im(DisasContext *dc, int ftt) @@ -5803,8 +5819,8 @@ static const TranslatorOps sparc_tr_ops = { .tb_stop = sparc_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void sparc_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc = {}; diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c index 95fb546..95202fa 100644 --- a/target/tricore/cpu.c +++ b/target/tricore/cpu.c @@ -172,6 +172,7 @@ static const struct SysemuCPUOps tricore_sysemu_ops = { static const TCGCPUOps tricore_tcg_ops = { .initialize = tricore_tcg_init, + .translate_code = tricore_translate_code, .synchronize_from_tb = tricore_cpu_synchronize_from_tb, .restore_state_to_opc = tricore_restore_state_to_opc, .tlb_fill = tricore_cpu_tlb_fill, diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h index 220af69..8e431d7 100644 --- a/target/tricore/cpu.h +++ b/target/tricore/cpu.h @@ -252,6 +252,8 @@ FIELD(TB_FLAGS, PRIV, 0, 2) void cpu_state_reset(CPUTriCoreState *s); void tricore_tcg_init(void); +void tricore_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); static inline void cpu_get_tb_cpu_state(CPUTriCoreState *env, vaddr *pc, uint64_t *cs_base, uint32_t *flags) diff --git a/target/tricore/translate.c b/target/tricore/translate.c index 2b67395..0ef3743 100644 --- a/target/tricore/translate.c +++ b/target/tricore/translate.c @@ -8460,9 +8460,8 @@ static const TranslatorOps tricore_tr_ops = { .tb_stop = tricore_tr_tb_stop, }; - -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void tricore_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext ctx; translator_loop(cs, tb, max_insns, pc, host_pc, diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c index 0d4d79b..0910a3d 100644 --- a/target/xtensa/cpu.c +++ b/target/xtensa/cpu.c @@ -232,6 +232,7 @@ static const struct SysemuCPUOps xtensa_sysemu_ops = { static const TCGCPUOps xtensa_tcg_ops = { .initialize = xtensa_translate_init, + .translate_code = xtensa_translate_code, .debug_excp_handler = xtensa_breakpoint_handler, .restore_state_to_opc = xtensa_restore_state_to_opc, diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h index 77e48ee..0e6302c 100644 --- a/target/xtensa/cpu.h +++ b/target/xtensa/cpu.h @@ -617,6 +617,8 @@ G_NORETURN void xtensa_cpu_do_unaligned_access(CPUState *cpu, vaddr addr, void xtensa_collect_sr_names(const XtensaConfig *config); void xtensa_translate_init(void); +void xtensa_translate_code(CPUState *cs, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc); void **xtensa_get_regfile_by_name(const char *name, int entries, int bits); void xtensa_breakpoint_handler(CPUState *cs); void xtensa_register_core(XtensaConfigList *node); diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c index 3c62c99..4f02cef 100644 --- a/target/xtensa/translate.c +++ b/target/xtensa/translate.c @@ -1228,8 +1228,8 @@ static const TranslatorOps xtensa_translator_ops = { .tb_stop = xtensa_tr_tb_stop, }; -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns, - vaddr pc, void *host_pc) +void xtensa_translate_code(CPUState *cpu, TranslationBlock *tb, + int *max_insns, vaddr pc, void *host_pc) { DisasContext dc = {}; translator_loop(cpu, tb, max_insns, pc, host_pc, diff --git a/tcg/optimize.c b/tcg/optimize.c index e9ef16b..c23f0d1 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -52,7 +52,7 @@ typedef struct TempOptInfo { QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy; uint64_t val; uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */ - uint64_t s_mask; /* a left-aligned mask of clrsb(value) bits. */ + uint64_t s_mask; /* mask bit is 1 if value bit matches msb */ } TempOptInfo; typedef struct OptContext { @@ -64,70 +64,42 @@ typedef struct OptContext { QSIMPLEQ_HEAD(, MemCopyInfo) mem_free; /* In flight values from optimization. */ - uint64_t a_mask; /* mask bit is 0 iff value identical to first input */ - uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */ - uint64_t s_mask; /* mask of clrsb(value) bits */ TCGType type; } OptContext; -/* Calculate the smask for a specific value. */ -static uint64_t smask_from_value(uint64_t value) +static inline TempOptInfo *ts_info(TCGTemp *ts) { - int rep = clrsb64(value); - return ~(~0ull >> rep); + return ts->state_ptr; } -/* - * Calculate the smask for a given set of known-zeros. - * If there are lots of zeros on the left, we can consider the remainder - * an unsigned field, and thus the corresponding signed field is one bit - * larger. - */ -static uint64_t smask_from_zmask(uint64_t zmask) +static inline TempOptInfo *arg_info(TCGArg arg) { - /* - * Only the 0 bits are significant for zmask, thus the msb itself - * must be zero, else we have no sign information. - */ - int rep = clz64(zmask); - if (rep == 0) { - return 0; - } - rep -= 1; - return ~(~0ull >> rep); + return ts_info(arg_temp(arg)); } -/* - * Recreate a properly left-aligned smask after manipulation. - * Some bit-shuffling, particularly shifts and rotates, may - * retain sign bits on the left, but may scatter disconnected - * sign bits on the right. Retain only what remains to the left. - */ -static uint64_t smask_from_smask(int64_t smask) +static inline bool ti_is_const(TempOptInfo *ti) { - /* Only the 1 bits are significant for smask */ - return smask_from_zmask(~smask); + return ti->is_const; } -static inline TempOptInfo *ts_info(TCGTemp *ts) +static inline uint64_t ti_const_val(TempOptInfo *ti) { - return ts->state_ptr; + return ti->val; } -static inline TempOptInfo *arg_info(TCGArg arg) +static inline bool ti_is_const_val(TempOptInfo *ti, uint64_t val) { - return ts_info(arg_temp(arg)); + return ti_is_const(ti) && ti_const_val(ti) == val; } static inline bool ts_is_const(TCGTemp *ts) { - return ts_info(ts)->is_const; + return ti_is_const(ts_info(ts)); } static inline bool ts_is_const_val(TCGTemp *ts, uint64_t val) { - TempOptInfo *ti = ts_info(ts); - return ti->is_const && ti->val == val; + return ti_is_const_val(ts_info(ts), val); } static inline bool arg_is_const(TCGArg arg) @@ -174,7 +146,7 @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts) ti->is_const = true; ti->val = ts->val; ti->z_mask = ts->val; - ti->s_mask = smask_from_value(ts->val); + ti->s_mask = INT64_MIN >> clrsb64(ts->val); } else { ti->is_const = false; ti->z_mask = -1; @@ -964,37 +936,31 @@ static void copy_propagate(OptContext *ctx, TCGOp *op, } } -static void finish_folding(OptContext *ctx, TCGOp *op) +static void finish_bb(OptContext *ctx) +{ + /* We only optimize memory barriers across basic blocks. */ + ctx->prev_mb = NULL; +} + +static void finish_ebb(OptContext *ctx) +{ + finish_bb(ctx); + /* We only optimize across extended basic blocks. */ + memset(&ctx->temps_used, 0, sizeof(ctx->temps_used)); + remove_mem_copy_all(ctx); +} + +static bool finish_folding(OptContext *ctx, TCGOp *op) { const TCGOpDef *def = &tcg_op_defs[op->opc]; int i, nb_oargs; - /* - * We only optimize extended basic blocks. If the opcode ends a BB - * and is not a conditional branch, reset all temp data. - */ - if (def->flags & TCG_OPF_BB_END) { - ctx->prev_mb = NULL; - if (!(def->flags & TCG_OPF_COND_BRANCH)) { - memset(&ctx->temps_used, 0, sizeof(ctx->temps_used)); - remove_mem_copy_all(ctx); - } - return; - } - nb_oargs = def->nb_oargs; for (i = 0; i < nb_oargs; i++) { TCGTemp *ts = arg_temp(op->args[i]); reset_ts(ctx, ts); - /* - * Save the corresponding known-zero/sign bits mask for the - * first output argument (only one supported so far). - */ - if (i == 0) { - ts_info(ts)->z_mask = ctx->z_mask; - ts_info(ts)->s_mask = ctx->s_mask; - } } + return true; } /* @@ -1044,11 +1010,22 @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op) return fold_const2(ctx, op); } -static bool fold_masks(OptContext *ctx, TCGOp *op) +/* + * Record "zero" and "sign" masks for the single output of @op. + * See TempOptInfo definition of z_mask and s_mask. + * If z_mask allows, fold the output to constant zero. + * The passed s_mask may be augmented by z_mask. + */ +static bool fold_masks_zs(OptContext *ctx, TCGOp *op, + uint64_t z_mask, int64_t s_mask) { - uint64_t a_mask = ctx->a_mask; - uint64_t z_mask = ctx->z_mask; - uint64_t s_mask = ctx->s_mask; + const TCGOpDef *def = &tcg_op_defs[op->opc]; + TCGTemp *ts; + TempOptInfo *ti; + int rep; + + /* Only single-output opcodes are supported here. */ + tcg_debug_assert(def->nb_oargs == 1); /* * 32-bit ops generate 32-bit results, which for the purpose of @@ -1058,16 +1035,49 @@ static bool fold_masks(OptContext *ctx, TCGOp *op) * type changing opcodes. */ if (ctx->type == TCG_TYPE_I32) { - a_mask = (int32_t)a_mask; z_mask = (int32_t)z_mask; - s_mask |= MAKE_64BIT_MASK(32, 32); - ctx->z_mask = z_mask; - ctx->s_mask = s_mask; + s_mask |= INT32_MIN; } if (z_mask == 0) { return tcg_opt_gen_movi(ctx, op, op->args[0], 0); } + + ts = arg_temp(op->args[0]); + reset_ts(ctx, ts); + + ti = ts_info(ts); + ti->z_mask = z_mask; + + /* Canonicalize s_mask and incorporate data from z_mask. */ + rep = clz64(~s_mask); + rep = MAX(rep, clz64(z_mask)); + rep = MAX(rep - 1, 0); + ti->s_mask = INT64_MIN >> rep; + + return true; +} + +static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask) +{ + return fold_masks_zs(ctx, op, z_mask, 0); +} + +static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask) +{ + return fold_masks_zs(ctx, op, -1, s_mask); +} + +/* + * An "affected" mask bit is 0 if and only if the result is identical + * to the first input. Thus if the entire mask is 0, the operation + * is equivalent to a copy. + */ +static bool fold_affected_mask(OptContext *ctx, TCGOp *op, uint64_t a_mask) +{ + if (ctx->type == TCG_TYPE_I32) { + a_mask = (uint32_t)a_mask; + } if (a_mask == 0) { return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); } @@ -1183,13 +1193,17 @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op) * 3) those that produce information about the result value. */ +static bool fold_or(OptContext *ctx, TCGOp *op); +static bool fold_orc(OptContext *ctx, TCGOp *op); +static bool fold_xor(OptContext *ctx, TCGOp *op); + static bool fold_add(OptContext *ctx, TCGOp *op) { if (fold_const2_commutative(ctx, op) || fold_xi_to_x(ctx, op, 0)) { return true; } - return false; + return finish_folding(ctx, op); } /* We cannot as yet do_constant_folding with vectors. */ @@ -1199,7 +1213,7 @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op) fold_xi_to_x(ctx, op, 0)) { return true; } - return false; + return finish_folding(ctx, op); } static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add) @@ -1266,7 +1280,7 @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add) op->args[4] = arg_new_constant(ctx, bl); op->args[5] = arg_new_constant(ctx, bh); } - return false; + return finish_folding(ctx, op); } static bool fold_add2(OptContext *ctx, TCGOp *op) @@ -1280,7 +1294,8 @@ static bool fold_add2(OptContext *ctx, TCGOp *op) static bool fold_and(OptContext *ctx, TCGOp *op) { - uint64_t z1, z2; + uint64_t z1, z2, z_mask, s_mask; + TempOptInfo *t1, *t2; if (fold_const2_commutative(ctx, op) || fold_xi_to_i(ctx, op, 0) || @@ -1289,31 +1304,34 @@ static bool fold_and(OptContext *ctx, TCGOp *op) return true; } - z1 = arg_info(op->args[1])->z_mask; - z2 = arg_info(op->args[2])->z_mask; - ctx->z_mask = z1 & z2; - - /* - * Sign repetitions are perforce all identical, whether they are 1 or 0. - * Bitwise operations preserve the relative quantity of the repetitions. - */ - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; + t1 = arg_info(op->args[1]); + t2 = arg_info(op->args[2]); + z1 = t1->z_mask; + z2 = t2->z_mask; /* * Known-zeros does not imply known-ones. Therefore unless * arg2 is constant, we can't infer affected bits from it. */ - if (arg_is_const(op->args[2])) { - ctx->a_mask = z1 & ~z2; + if (ti_is_const(t2) && fold_affected_mask(ctx, op, z1 & ~z2)) { + return true; } - return fold_masks(ctx, op); + z_mask = z1 & z2; + + /* + * Sign repetitions are perforce all identical, whether they are 1 or 0. + * Bitwise operations preserve the relative quantity of the repetitions. + */ + s_mask = t1->s_mask & t2->s_mask; + + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_andc(OptContext *ctx, TCGOp *op) { - uint64_t z1; + uint64_t z_mask, s_mask; + TempOptInfo *t1, *t2; if (fold_const2(ctx, op) || fold_xx_to_i(ctx, op, 0) || @@ -1322,22 +1340,79 @@ static bool fold_andc(OptContext *ctx, TCGOp *op) return true; } - z1 = arg_info(op->args[1])->z_mask; + t1 = arg_info(op->args[1]); + t2 = arg_info(op->args[2]); + z_mask = t1->z_mask; /* * Known-zeros does not imply known-ones. Therefore unless * arg2 is constant, we can't infer anything from it. */ - if (arg_is_const(op->args[2])) { - uint64_t z2 = ~arg_info(op->args[2])->z_mask; - ctx->a_mask = z1 & ~z2; - z1 &= z2; + if (ti_is_const(t2)) { + uint64_t v2 = ti_const_val(t2); + if (fold_affected_mask(ctx, op, z_mask & v2)) { + return true; + } + z_mask &= ~v2; } - ctx->z_mask = z1; - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return fold_masks(ctx, op); + s_mask = t1->s_mask & t2->s_mask; + return fold_masks_zs(ctx, op, z_mask, s_mask); +} + +static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op) +{ + /* If true and false values are the same, eliminate the cmp. */ + if (args_are_copies(op->args[2], op->args[3])) { + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]); + } + + if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { + uint64_t tv = arg_info(op->args[2])->val; + uint64_t fv = arg_info(op->args[3])->val; + + if (tv == -1 && fv == 0) { + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); + } + if (tv == 0 && fv == -1) { + if (TCG_TARGET_HAS_not_vec) { + op->opc = INDEX_op_not_vec; + return fold_not(ctx, op); + } else { + op->opc = INDEX_op_xor_vec; + op->args[2] = arg_new_constant(ctx, -1); + return fold_xor(ctx, op); + } + } + } + if (arg_is_const(op->args[2])) { + uint64_t tv = arg_info(op->args[2])->val; + if (tv == -1) { + op->opc = INDEX_op_or_vec; + op->args[2] = op->args[3]; + return fold_or(ctx, op); + } + if (tv == 0 && TCG_TARGET_HAS_andc_vec) { + op->opc = INDEX_op_andc_vec; + op->args[2] = op->args[1]; + op->args[1] = op->args[3]; + return fold_andc(ctx, op); + } + } + if (arg_is_const(op->args[3])) { + uint64_t fv = arg_info(op->args[3])->val; + if (fv == 0) { + op->opc = INDEX_op_and_vec; + return fold_and(ctx, op); + } + if (fv == -1 && TCG_TARGET_HAS_orc_vec) { + op->opc = INDEX_op_orc_vec; + op->args[2] = op->args[1]; + op->args[1] = op->args[3]; + return fold_orc(ctx, op); + } + } + return finish_folding(ctx, op); } static bool fold_brcond(OptContext *ctx, TCGOp *op) @@ -1351,8 +1426,11 @@ static bool fold_brcond(OptContext *ctx, TCGOp *op) if (i > 0) { op->opc = INDEX_op_br; op->args[0] = op->args[3]; + finish_ebb(ctx); + } else { + finish_bb(ctx); } - return false; + return true; } static bool fold_brcond2(OptContext *ctx, TCGOp *op) @@ -1443,24 +1521,27 @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) } op->opc = INDEX_op_br; op->args[0] = label; - break; + finish_ebb(ctx); + return true; } - return false; + + finish_bb(ctx); + return true; } static bool fold_bswap(OptContext *ctx, TCGOp *op) { uint64_t z_mask, s_mask, sign; + TempOptInfo *t1 = arg_info(op->args[1]); - if (arg_is_const(op->args[1])) { - uint64_t t = arg_info(op->args[1])->val; - - t = do_constant_folding(op->opc, ctx->type, t, op->args[2]); - return tcg_opt_gen_movi(ctx, op, op->args[0], t); + if (ti_is_const(t1)) { + return tcg_opt_gen_movi(ctx, op, op->args[0], + do_constant_folding(op->opc, ctx->type, + ti_const_val(t1), + op->args[2])); } - z_mask = arg_info(op->args[1])->z_mask; - + z_mask = t1->z_mask; switch (op->opc) { case INDEX_op_bswap16_i32: case INDEX_op_bswap16_i64: @@ -1479,8 +1560,8 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) default: g_assert_not_reached(); } - s_mask = smask_from_zmask(z_mask); + s_mask = 0; switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { case TCG_BSWAP_OZ: break; @@ -1488,19 +1569,17 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) /* If the sign bit may be 1, force all the bits above to 1. */ if (z_mask & sign) { z_mask |= sign; - s_mask = sign << 1; } + /* The value and therefore s_mask is explicitly sign-extended. */ + s_mask = sign; break; default: /* The high bits are undefined: force all bits above the sign to 1. */ z_mask |= sign << 1; - s_mask = 0; break; } - ctx->z_mask = z_mask; - ctx->s_mask = s_mask; - return fold_masks(ctx, op); + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_call(OptContext *ctx, TCGOp *op) @@ -1540,12 +1619,44 @@ static bool fold_call(OptContext *ctx, TCGOp *op) return true; } +static bool fold_cmp_vec(OptContext *ctx, TCGOp *op) +{ + /* Canonicalize the comparison to put immediate second. */ + if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) { + op->args[3] = tcg_swap_cond(op->args[3]); + } + return finish_folding(ctx, op); +} + +static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op) +{ + /* If true and false values are the same, eliminate the cmp. */ + if (args_are_copies(op->args[3], op->args[4])) { + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]); + } + + /* Canonicalize the comparison to put immediate second. */ + if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) { + op->args[5] = tcg_swap_cond(op->args[5]); + } + /* + * Canonicalize the "false" input reg to match the destination, + * so that the tcg backend can implement "move if true". + */ + if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) { + op->args[5] = tcg_invert_cond(op->args[5]); + } + return finish_folding(ctx, op); +} + static bool fold_count_zeros(OptContext *ctx, TCGOp *op) { - uint64_t z_mask; + uint64_t z_mask, s_mask; + TempOptInfo *t1 = arg_info(op->args[1]); + TempOptInfo *t2 = arg_info(op->args[2]); - if (arg_is_const(op->args[1])) { - uint64_t t = arg_info(op->args[1])->val; + if (ti_is_const(t1)) { + uint64_t t = ti_const_val(t1); if (t != 0) { t = do_constant_folding(op->opc, ctx->type, t, 0); @@ -1564,79 +1675,91 @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op) default: g_assert_not_reached(); } - ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask; - ctx->s_mask = smask_from_zmask(ctx->z_mask); - return false; + s_mask = ~z_mask; + z_mask |= t2->z_mask; + s_mask &= t2->s_mask; + + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_ctpop(OptContext *ctx, TCGOp *op) { + uint64_t z_mask; + if (fold_const1(ctx, op)) { return true; } switch (ctx->type) { case TCG_TYPE_I32: - ctx->z_mask = 32 | 31; + z_mask = 32 | 31; break; case TCG_TYPE_I64: - ctx->z_mask = 64 | 63; + z_mask = 64 | 63; break; default: g_assert_not_reached(); } - ctx->s_mask = smask_from_zmask(ctx->z_mask); - return false; + return fold_masks_z(ctx, op, z_mask); } static bool fold_deposit(OptContext *ctx, TCGOp *op) { + TempOptInfo *t1 = arg_info(op->args[1]); + TempOptInfo *t2 = arg_info(op->args[2]); + int ofs = op->args[3]; + int len = op->args[4]; + int width; TCGOpcode and_opc; + uint64_t z_mask, s_mask; - if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { - uint64_t t1 = arg_info(op->args[1])->val; - uint64_t t2 = arg_info(op->args[2])->val; - - t1 = deposit64(t1, op->args[3], op->args[4], t2); - return tcg_opt_gen_movi(ctx, op, op->args[0], t1); + if (ti_is_const(t1) && ti_is_const(t2)) { + return tcg_opt_gen_movi(ctx, op, op->args[0], + deposit64(ti_const_val(t1), ofs, len, + ti_const_val(t2))); } switch (ctx->type) { case TCG_TYPE_I32: and_opc = INDEX_op_and_i32; + width = 32; break; case TCG_TYPE_I64: and_opc = INDEX_op_and_i64; + width = 64; break; default: g_assert_not_reached(); } /* Inserting a value into zero at offset 0. */ - if (arg_is_const_val(op->args[1], 0) && op->args[3] == 0) { - uint64_t mask = MAKE_64BIT_MASK(0, op->args[4]); + if (ti_is_const_val(t1, 0) && ofs == 0) { + uint64_t mask = MAKE_64BIT_MASK(0, len); op->opc = and_opc; op->args[1] = op->args[2]; op->args[2] = arg_new_constant(ctx, mask); - ctx->z_mask = mask & arg_info(op->args[1])->z_mask; - return false; + return fold_and(ctx, op); } /* Inserting zero into a value. */ - if (arg_is_const_val(op->args[2], 0)) { - uint64_t mask = deposit64(-1, op->args[3], op->args[4], 0); + if (ti_is_const_val(t2, 0)) { + uint64_t mask = deposit64(-1, ofs, len, 0); op->opc = and_opc; op->args[2] = arg_new_constant(ctx, mask); - ctx->z_mask = mask & arg_info(op->args[1])->z_mask; - return false; + return fold_and(ctx, op); } - ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask, - op->args[3], op->args[4], - arg_info(op->args[2])->z_mask); - return false; + /* The s_mask from the top portion of the deposit is still valid. */ + if (ofs + len == width) { + s_mask = t2->s_mask << ofs; + } else { + s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len); + } + + z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask); + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_divide(OptContext *ctx, TCGOp *op) @@ -1645,7 +1768,7 @@ static bool fold_divide(OptContext *ctx, TCGOp *op) fold_xi_to_x(ctx, op, 1)) { return true; } - return false; + return finish_folding(ctx, op); } static bool fold_dup(OptContext *ctx, TCGOp *op) @@ -1655,7 +1778,7 @@ static bool fold_dup(OptContext *ctx, TCGOp *op) t = dup_const(TCGOP_VECE(op), t); return tcg_opt_gen_movi(ctx, op, op->args[0], t); } - return false; + return finish_folding(ctx, op); } static bool fold_dup2(OptContext *ctx, TCGOp *op) @@ -1670,45 +1793,43 @@ static bool fold_dup2(OptContext *ctx, TCGOp *op) op->opc = INDEX_op_dup_vec; TCGOP_VECE(op) = MO_32; } - return false; + return finish_folding(ctx, op); } static bool fold_eqv(OptContext *ctx, TCGOp *op) { + uint64_t s_mask; + if (fold_const2_commutative(ctx, op) || fold_xi_to_x(ctx, op, -1) || fold_xi_to_not(ctx, op, 0)) { return true; } - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return false; + s_mask = arg_info(op->args[1])->s_mask + & arg_info(op->args[2])->s_mask; + return fold_masks_s(ctx, op, s_mask); } static bool fold_extract(OptContext *ctx, TCGOp *op) { uint64_t z_mask_old, z_mask; + TempOptInfo *t1 = arg_info(op->args[1]); int pos = op->args[2]; int len = op->args[3]; - if (arg_is_const(op->args[1])) { - uint64_t t; - - t = arg_info(op->args[1])->val; - t = extract64(t, pos, len); - return tcg_opt_gen_movi(ctx, op, op->args[0], t); + if (ti_is_const(t1)) { + return tcg_opt_gen_movi(ctx, op, op->args[0], + extract64(ti_const_val(t1), pos, len)); } - z_mask_old = arg_info(op->args[1])->z_mask; + z_mask_old = t1->z_mask; z_mask = extract64(z_mask_old, pos, len); - if (pos == 0) { - ctx->a_mask = z_mask_old ^ z_mask; + if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) { + return true; } - ctx->z_mask = z_mask; - ctx->s_mask = smask_from_zmask(z_mask); - return fold_masks(ctx, op); + return fold_masks_z(ctx, op, z_mask); } static bool fold_extract2(OptContext *ctx, TCGOp *op) @@ -1727,54 +1848,49 @@ static bool fold_extract2(OptContext *ctx, TCGOp *op) } return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2); } - return false; + return finish_folding(ctx, op); } static bool fold_exts(OptContext *ctx, TCGOp *op) { - uint64_t s_mask_old, s_mask, z_mask, sign; + uint64_t s_mask_old, s_mask, z_mask; bool type_change = false; + TempOptInfo *t1; if (fold_const1(ctx, op)) { return true; } - z_mask = arg_info(op->args[1])->z_mask; - s_mask = arg_info(op->args[1])->s_mask; + t1 = arg_info(op->args[1]); + z_mask = t1->z_mask; + s_mask = t1->s_mask; s_mask_old = s_mask; switch (op->opc) { CASE_OP_32_64(ext8s): - sign = INT8_MIN; - z_mask = (uint8_t)z_mask; + s_mask |= INT8_MIN; + z_mask = (int8_t)z_mask; break; CASE_OP_32_64(ext16s): - sign = INT16_MIN; - z_mask = (uint16_t)z_mask; + s_mask |= INT16_MIN; + z_mask = (int16_t)z_mask; break; case INDEX_op_ext_i32_i64: type_change = true; QEMU_FALLTHROUGH; case INDEX_op_ext32s_i64: - sign = INT32_MIN; - z_mask = (uint32_t)z_mask; + s_mask |= INT32_MIN; + z_mask = (int32_t)z_mask; break; default: g_assert_not_reached(); } - if (z_mask & sign) { - z_mask |= sign; - } - s_mask |= sign << 1; - - ctx->z_mask = z_mask; - ctx->s_mask = s_mask; - if (!type_change) { - ctx->a_mask = s_mask & ~s_mask_old; + if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) { + return true; } - return fold_masks(ctx, op); + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_extu(OptContext *ctx, TCGOp *op) @@ -1810,12 +1926,11 @@ static bool fold_extu(OptContext *ctx, TCGOp *op) g_assert_not_reached(); } - ctx->z_mask = z_mask; - ctx->s_mask = smask_from_zmask(z_mask); - if (!type_change) { - ctx->a_mask = z_mask_old ^ z_mask; + if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) { + return true; } - return fold_masks(ctx, op); + + return fold_masks_z(ctx, op, z_mask); } static bool fold_mb(OptContext *ctx, TCGOp *op) @@ -1849,6 +1964,8 @@ static bool fold_mov(OptContext *ctx, TCGOp *op) static bool fold_movcond(OptContext *ctx, TCGOp *op) { + uint64_t z_mask, s_mask; + TempOptInfo *tt, *ft; int i; /* If true and false values are the same, eliminate the cmp. */ @@ -1870,14 +1987,14 @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]); } - ctx->z_mask = arg_info(op->args[3])->z_mask - | arg_info(op->args[4])->z_mask; - ctx->s_mask = arg_info(op->args[3])->s_mask - & arg_info(op->args[4])->s_mask; + tt = arg_info(op->args[3]); + ft = arg_info(op->args[4]); + z_mask = tt->z_mask | ft->z_mask; + s_mask = tt->s_mask & ft->s_mask; - if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { - uint64_t tv = arg_info(op->args[3])->val; - uint64_t fv = arg_info(op->args[4])->val; + if (ti_is_const(tt) && ti_is_const(ft)) { + uint64_t tv = ti_const_val(tt); + uint64_t fv = ti_const_val(ft); TCGOpcode opc, negopc = 0; TCGCond cond = op->args[5]; @@ -1916,7 +2033,8 @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) } } } - return false; + + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_mul(OptContext *ctx, TCGOp *op) @@ -1926,7 +2044,7 @@ static bool fold_mul(OptContext *ctx, TCGOp *op) fold_xi_to_x(ctx, op, 1)) { return true; } - return false; + return finish_folding(ctx, op); } static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) @@ -1935,7 +2053,7 @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) fold_xi_to_i(ctx, op, 0)) { return true; } - return false; + return finish_folding(ctx, op); } static bool fold_multiply2(OptContext *ctx, TCGOp *op) @@ -1980,33 +2098,30 @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op) tcg_opt_gen_movi(ctx, op2, rh, h); return true; } - return false; + return finish_folding(ctx, op); } static bool fold_nand(OptContext *ctx, TCGOp *op) { + uint64_t s_mask; + if (fold_const2_commutative(ctx, op) || fold_xi_to_not(ctx, op, -1)) { return true; } - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return false; + s_mask = arg_info(op->args[1])->s_mask + & arg_info(op->args[2])->s_mask; + return fold_masks_s(ctx, op, s_mask); } static bool fold_neg_no_const(OptContext *ctx, TCGOp *op) { /* Set to 1 all bits to the left of the rightmost. */ uint64_t z_mask = arg_info(op->args[1])->z_mask; - ctx->z_mask = -(z_mask & -z_mask); + z_mask = -(z_mask & -z_mask); - /* - * Because of fold_sub_to_neg, we want to always return true, - * via finish_folding. - */ - finish_folding(ctx, op); - return true; + return fold_masks_z(ctx, op, z_mask); } static bool fold_neg(OptContext *ctx, TCGOp *op) @@ -2016,14 +2131,16 @@ static bool fold_neg(OptContext *ctx, TCGOp *op) static bool fold_nor(OptContext *ctx, TCGOp *op) { + uint64_t s_mask; + if (fold_const2_commutative(ctx, op) || fold_xi_to_not(ctx, op, 0)) { return true; } - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return false; + s_mask = arg_info(op->args[1])->s_mask + & arg_info(op->args[2])->s_mask; + return fold_masks_s(ctx, op, s_mask); } static bool fold_not(OptContext *ctx, TCGOp *op) @@ -2031,31 +2148,31 @@ static bool fold_not(OptContext *ctx, TCGOp *op) if (fold_const1(ctx, op)) { return true; } - - ctx->s_mask = arg_info(op->args[1])->s_mask; - - /* Because of fold_to_not, we want to always return true, via finish. */ - finish_folding(ctx, op); - return true; + return fold_masks_s(ctx, op, arg_info(op->args[1])->s_mask); } static bool fold_or(OptContext *ctx, TCGOp *op) { + uint64_t z_mask, s_mask; + TempOptInfo *t1, *t2; + if (fold_const2_commutative(ctx, op) || fold_xi_to_x(ctx, op, 0) || fold_xx_to_x(ctx, op)) { return true; } - ctx->z_mask = arg_info(op->args[1])->z_mask - | arg_info(op->args[2])->z_mask; - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return fold_masks(ctx, op); + t1 = arg_info(op->args[1]); + t2 = arg_info(op->args[2]); + z_mask = t1->z_mask | t2->z_mask; + s_mask = t1->s_mask & t2->s_mask; + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_orc(OptContext *ctx, TCGOp *op) { + uint64_t s_mask; + if (fold_const2(ctx, op) || fold_xx_to_i(ctx, op, -1) || fold_xi_to_x(ctx, op, -1) || @@ -2063,36 +2180,45 @@ static bool fold_orc(OptContext *ctx, TCGOp *op) return true; } - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return false; + s_mask = arg_info(op->args[1])->s_mask + & arg_info(op->args[2])->s_mask; + return fold_masks_s(ctx, op, s_mask); } -static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) +static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op) { const TCGOpDef *def = &tcg_op_defs[op->opc]; MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; MemOp mop = get_memop(oi); int width = 8 * memop_size(mop); + uint64_t z_mask = -1, s_mask = 0; if (width < 64) { - ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width); - if (!(mop & MO_SIGN)) { - ctx->z_mask = MAKE_64BIT_MASK(0, width); - ctx->s_mask <<= 1; + if (mop & MO_SIGN) { + s_mask = MAKE_64BIT_MASK(width - 1, 64 - (width - 1)); + } else { + z_mask = MAKE_64BIT_MASK(0, width); } } /* Opcodes that touch guest memory stop the mb optimization. */ ctx->prev_mb = NULL; - return false; + + return fold_masks_zs(ctx, op, z_mask, s_mask); +} + +static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op) +{ + /* Opcodes that touch guest memory stop the mb optimization. */ + ctx->prev_mb = NULL; + return finish_folding(ctx, op); } static bool fold_qemu_st(OptContext *ctx, TCGOp *op) { /* Opcodes that touch guest memory stop the mb optimization. */ ctx->prev_mb = NULL; - return false; + return true; } static bool fold_remainder(OptContext *ctx, TCGOp *op) @@ -2101,10 +2227,11 @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) fold_xx_to_i(ctx, op, 0)) { return true; } - return false; + return finish_folding(ctx, op); } -static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg) +/* Return 1 if finished, -1 if simplified, 0 if unchanged. */ +static int fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg) { uint64_t a_zmask, b_val; TCGCond cond; @@ -2199,11 +2326,10 @@ static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg) op->opc = xor_opc; op->args[2] = arg_new_constant(ctx, 1); } - return false; + return -1; } } - - return false; + return 0; } static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg) @@ -2308,14 +2434,15 @@ static bool fold_setcond(OptContext *ctx, TCGOp *op) return tcg_opt_gen_movi(ctx, op, op->args[0], i); } - if (fold_setcond_zmask(ctx, op, false)) { + i = fold_setcond_zmask(ctx, op, false); + if (i > 0) { return true; } - fold_setcond_tst_pow2(ctx, op, false); + if (i == 0) { + fold_setcond_tst_pow2(ctx, op, false); + } - ctx->z_mask = 1; - ctx->s_mask = smask_from_zmask(1); - return false; + return fold_masks_z(ctx, op, 1); } static bool fold_negsetcond(OptContext *ctx, TCGOp *op) @@ -2326,14 +2453,16 @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op) return tcg_opt_gen_movi(ctx, op, op->args[0], -i); } - if (fold_setcond_zmask(ctx, op, true)) { + i = fold_setcond_zmask(ctx, op, true); + if (i > 0) { return true; } - fold_setcond_tst_pow2(ctx, op, true); + if (i == 0) { + fold_setcond_tst_pow2(ctx, op, true); + } /* Value is {0,-1} so all bits are repetitions of the sign. */ - ctx->s_mask = -1; - return false; + return fold_masks_s(ctx, op, -1); } static bool fold_setcond2(OptContext *ctx, TCGOp *op) @@ -2414,77 +2543,40 @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) return fold_setcond(ctx, op); } - ctx->z_mask = 1; - ctx->s_mask = smask_from_zmask(1); - return false; + return fold_masks_z(ctx, op, 1); do_setcond_const: return tcg_opt_gen_movi(ctx, op, op->args[0], i); } -static bool fold_cmp_vec(OptContext *ctx, TCGOp *op) -{ - /* Canonicalize the comparison to put immediate second. */ - if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) { - op->args[3] = tcg_swap_cond(op->args[3]); - } - return false; -} - -static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op) -{ - /* If true and false values are the same, eliminate the cmp. */ - if (args_are_copies(op->args[3], op->args[4])) { - return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]); - } - - /* Canonicalize the comparison to put immediate second. */ - if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) { - op->args[5] = tcg_swap_cond(op->args[5]); - } - /* - * Canonicalize the "false" input reg to match the destination, - * so that the tcg backend can implement "move if true". - */ - if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) { - op->args[5] = tcg_invert_cond(op->args[5]); - } - return false; -} - static bool fold_sextract(OptContext *ctx, TCGOp *op) { uint64_t z_mask, s_mask, s_mask_old; + TempOptInfo *t1 = arg_info(op->args[1]); int pos = op->args[2]; int len = op->args[3]; - if (arg_is_const(op->args[1])) { - uint64_t t; - - t = arg_info(op->args[1])->val; - t = sextract64(t, pos, len); - return tcg_opt_gen_movi(ctx, op, op->args[0], t); + if (ti_is_const(t1)) { + return tcg_opt_gen_movi(ctx, op, op->args[0], + sextract64(ti_const_val(t1), pos, len)); } - z_mask = arg_info(op->args[1])->z_mask; - z_mask = sextract64(z_mask, pos, len); - ctx->z_mask = z_mask; + s_mask_old = t1->s_mask; + s_mask = s_mask_old >> pos; + s_mask |= -1ull << (len - 1); - s_mask_old = arg_info(op->args[1])->s_mask; - s_mask = sextract64(s_mask_old, pos, len); - s_mask |= MAKE_64BIT_MASK(len, 64 - len); - ctx->s_mask = s_mask; - - if (pos == 0) { - ctx->a_mask = s_mask & ~s_mask_old; + if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) { + return true; } - return fold_masks(ctx, op); + z_mask = sextract64(t1->z_mask, pos, len); + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_shift(OptContext *ctx, TCGOp *op) { - uint64_t s_mask, z_mask, sign; + uint64_t s_mask, z_mask; + TempOptInfo *t1, *t2; if (fold_const2(ctx, op) || fold_ix_to_i(ctx, op, 0) || @@ -2492,18 +2584,18 @@ static bool fold_shift(OptContext *ctx, TCGOp *op) return true; } - s_mask = arg_info(op->args[1])->s_mask; - z_mask = arg_info(op->args[1])->z_mask; - - if (arg_is_const(op->args[2])) { - int sh = arg_info(op->args[2])->val; + t1 = arg_info(op->args[1]); + t2 = arg_info(op->args[2]); + s_mask = t1->s_mask; + z_mask = t1->z_mask; - ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh); + if (ti_is_const(t2)) { + int sh = ti_const_val(t2); + z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh); s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh); - ctx->s_mask = smask_from_smask(s_mask); - return fold_masks(ctx, op); + return fold_masks_zs(ctx, op, z_mask, s_mask); } switch (op->opc) { @@ -2512,23 +2604,21 @@ static bool fold_shift(OptContext *ctx, TCGOp *op) * Arithmetic right shift will not reduce the number of * input sign repetitions. */ - ctx->s_mask = s_mask; - break; + return fold_masks_s(ctx, op, s_mask); CASE_OP_32_64(shr): /* * If the sign bit is known zero, then logical right shift - * will not reduced the number of input sign repetitions. + * will not reduce the number of input sign repetitions. */ - sign = (s_mask & -s_mask) >> 1; - if (sign && !(z_mask & sign)) { - ctx->s_mask = s_mask; + if (~z_mask & -s_mask) { + return fold_masks_s(ctx, op, s_mask); } break; default: break; } - return false; + return finish_folding(ctx, op); } static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op) @@ -2575,12 +2665,15 @@ static bool fold_sub_vec(OptContext *ctx, TCGOp *op) fold_sub_to_neg(ctx, op)) { return true; } - return false; + return finish_folding(ctx, op); } static bool fold_sub(OptContext *ctx, TCGOp *op) { - if (fold_const2(ctx, op) || fold_sub_vec(ctx, op)) { + if (fold_const2(ctx, op) || + fold_xx_to_i(ctx, op, 0) || + fold_xi_to_x(ctx, op, 0) || + fold_sub_to_neg(ctx, op)) { return true; } @@ -2592,7 +2685,7 @@ static bool fold_sub(OptContext *ctx, TCGOp *op) ? INDEX_op_add_i32 : INDEX_op_add_i64); op->args[2] = arg_new_constant(ctx, -val); } - return false; + return finish_folding(ctx, op); } static bool fold_sub2(OptContext *ctx, TCGOp *op) @@ -2602,33 +2695,32 @@ static bool fold_sub2(OptContext *ctx, TCGOp *op) static bool fold_tcg_ld(OptContext *ctx, TCGOp *op) { + uint64_t z_mask = -1, s_mask = 0; + /* We can't do any folding with a load, but we can record bits. */ switch (op->opc) { CASE_OP_32_64(ld8s): - ctx->s_mask = MAKE_64BIT_MASK(8, 56); + s_mask = INT8_MIN; break; CASE_OP_32_64(ld8u): - ctx->z_mask = MAKE_64BIT_MASK(0, 8); - ctx->s_mask = MAKE_64BIT_MASK(9, 55); + z_mask = MAKE_64BIT_MASK(0, 8); break; CASE_OP_32_64(ld16s): - ctx->s_mask = MAKE_64BIT_MASK(16, 48); + s_mask = INT16_MIN; break; CASE_OP_32_64(ld16u): - ctx->z_mask = MAKE_64BIT_MASK(0, 16); - ctx->s_mask = MAKE_64BIT_MASK(17, 47); + z_mask = MAKE_64BIT_MASK(0, 16); break; case INDEX_op_ld32s_i64: - ctx->s_mask = MAKE_64BIT_MASK(32, 32); + s_mask = INT32_MIN; break; case INDEX_op_ld32u_i64: - ctx->z_mask = MAKE_64BIT_MASK(0, 32); - ctx->s_mask = MAKE_64BIT_MASK(33, 31); + z_mask = MAKE_64BIT_MASK(0, 32); break; default: g_assert_not_reached(); } - return false; + return fold_masks_zs(ctx, op, z_mask, s_mask); } static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op) @@ -2638,7 +2730,7 @@ static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op) TCGType type; if (op->args[1] != tcgv_ptr_arg(tcg_env)) { - return false; + return finish_folding(ctx, op); } type = ctx->type; @@ -2661,7 +2753,7 @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op) if (op->args[1] != tcgv_ptr_arg(tcg_env)) { remove_mem_copy_all(ctx); - return false; + return true; } switch (op->opc) { @@ -2685,7 +2777,7 @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op) g_assert_not_reached(); } remove_mem_copy_in(ctx, ofs, ofs + lm1); - return false; + return true; } static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op) @@ -2695,8 +2787,7 @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op) TCGType type; if (op->args[1] != tcgv_ptr_arg(tcg_env)) { - fold_tcg_st(ctx, op); - return false; + return fold_tcg_st(ctx, op); } src = arg_temp(op->args[0]); @@ -2718,11 +2809,14 @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op) last = ofs + tcg_type_size(type) - 1; remove_mem_copy_in(ctx, ofs, last); record_mem_copy(ctx, type, src, ofs, last); - return false; + return true; } static bool fold_xor(OptContext *ctx, TCGOp *op) { + uint64_t z_mask, s_mask; + TempOptInfo *t1, *t2; + if (fold_const2_commutative(ctx, op) || fold_xx_to_i(ctx, op, 0) || fold_xi_to_x(ctx, op, 0) || @@ -2730,66 +2824,11 @@ static bool fold_xor(OptContext *ctx, TCGOp *op) return true; } - ctx->z_mask = arg_info(op->args[1])->z_mask - | arg_info(op->args[2])->z_mask; - ctx->s_mask = arg_info(op->args[1])->s_mask - & arg_info(op->args[2])->s_mask; - return fold_masks(ctx, op); -} - -static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op) -{ - /* If true and false values are the same, eliminate the cmp. */ - if (args_are_copies(op->args[2], op->args[3])) { - return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]); - } - - if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { - uint64_t tv = arg_info(op->args[2])->val; - uint64_t fv = arg_info(op->args[3])->val; - - if (tv == -1 && fv == 0) { - return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); - } - if (tv == 0 && fv == -1) { - if (TCG_TARGET_HAS_not_vec) { - op->opc = INDEX_op_not_vec; - return fold_not(ctx, op); - } else { - op->opc = INDEX_op_xor_vec; - op->args[2] = arg_new_constant(ctx, -1); - return fold_xor(ctx, op); - } - } - } - if (arg_is_const(op->args[2])) { - uint64_t tv = arg_info(op->args[2])->val; - if (tv == -1) { - op->opc = INDEX_op_or_vec; - op->args[2] = op->args[3]; - return fold_or(ctx, op); - } - if (tv == 0 && TCG_TARGET_HAS_andc_vec) { - op->opc = INDEX_op_andc_vec; - op->args[2] = op->args[1]; - op->args[1] = op->args[3]; - return fold_andc(ctx, op); - } - } - if (arg_is_const(op->args[3])) { - uint64_t fv = arg_info(op->args[3])->val; - if (fv == 0) { - op->opc = INDEX_op_and_vec; - return fold_and(ctx, op); - } - if (fv == -1 && TCG_TARGET_HAS_orc_vec) { - op->opc = INDEX_op_orc_vec; - op->args[2] = op->args[1]; - op->args[1] = op->args[3]; - return fold_orc(ctx, op); - } - } - return false; + t1 = arg_info(op->args[1]); + t2 = arg_info(op->args[2]); + z_mask = t1->z_mask | t2->z_mask; + s_mask = t1->s_mask & t2->s_mask; + return fold_masks_zs(ctx, op, z_mask, s_mask); } /* Propagate constants and copies, fold constant expressions. */ @@ -2835,11 +2874,6 @@ void tcg_optimize(TCGContext *s) ctx.type = TCG_TYPE_I32; } - /* Assume all bits affected, no bits known zero, no sign reps. */ - ctx.a_mask = -1; - ctx.z_mask = -1; - ctx.s_mask = 0; - /* * Process each opcode. * Sorted alphabetically by opcode as much as possible. @@ -2977,11 +3011,18 @@ void tcg_optimize(TCGContext *s) break; case INDEX_op_qemu_ld_a32_i32: case INDEX_op_qemu_ld_a64_i32: + done = fold_qemu_ld_1reg(&ctx, op); + break; case INDEX_op_qemu_ld_a32_i64: case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + done = fold_qemu_ld_1reg(&ctx, op); + break; + } + QEMU_FALLTHROUGH; case INDEX_op_qemu_ld_a32_i128: case INDEX_op_qemu_ld_a64_i128: - done = fold_qemu_ld(&ctx, op); + done = fold_qemu_ld_2reg(&ctx, op); break; case INDEX_op_qemu_st8_a32_i32: case INDEX_op_qemu_st8_a64_i32: @@ -3037,12 +3078,18 @@ void tcg_optimize(TCGContext *s) CASE_OP_32_64_VEC(xor): done = fold_xor(&ctx, op); break; + case INDEX_op_set_label: + case INDEX_op_br: + case INDEX_op_exit_tb: + case INDEX_op_goto_tb: + case INDEX_op_goto_ptr: + finish_ebb(&ctx); + done = true; + break; default: + done = finish_folding(&ctx, op); break; } - - if (!done) { - finish_folding(&ctx, op); - } + tcg_debug_assert(done); } } diff --git a/tests/tcg/multiarch/system/memory.c b/tests/tcg/multiarch/system/memory.c index 65a6038..7508f6b 100644 --- a/tests/tcg/multiarch/system/memory.c +++ b/tests/tcg/multiarch/system/memory.c @@ -14,7 +14,6 @@ #include <stdint.h> #include <stdbool.h> -#include <inttypes.h> #include <minilib.h> #ifndef CHECK_UNALIGNED @@ -511,8 +510,8 @@ int main(void) int i; bool ok = true; - ml_printf("Test data start: 0x%"PRIxPTR"\n", &test_data[0]); - ml_printf("Test data end: 0x%"PRIxPTR"\n", &test_data[TEST_SIZE]); + ml_printf("Test data start: 0x%lx\n", (unsigned long)&test_data[0]); + ml_printf("Test data end: 0x%lx\n", (unsigned long)&test_data[TEST_SIZE]); /* Run through the unsigned tests first */ for (i = 0; i < ARRAY_SIZE(init_ufns) && ok; i++) { @@ -529,8 +528,8 @@ int main(void) ok = do_signed_reads(true); } - ml_printf("Test data read: %"PRId32"\n", test_read_count); - ml_printf("Test data write: %"PRId32"\n", test_write_count); + ml_printf("Test data read: %lu\n", (unsigned long)test_read_count); + ml_printf("Test data write: %lu\n", (unsigned long)test_write_count); ml_printf("Test complete: %s\n", ok ? "PASSED" : "FAILED"); return ok ? 0 : -1; } |