author     Peter Maydell <peter.maydell@linaro.org>  2024-09-19 14:15:15 +0100
committer  Peter Maydell <peter.maydell@linaro.org>  2024-09-19 14:15:15 +0100
commit     01dc65a3bc262ab1bec8fe89775e9bbfa627becb (patch)
tree       e7bcd02ed90b67c9dd0c688bb7b59424fd84ab2e
parent     14556211bc6d7125a44d5b5df90caba019b0ec0e (diff)
parent     89b30b4921e51bb47313d2d8fdc3d7bce987e4c5 (diff)
Merge tag 'pull-target-arm-20240919' of https://git.linaro.org/people/pmaydell/qemu-arm into staging
target-arm queue:
* target/arm: Correct ID_AA64ISAR1_EL1 value for neoverse-v1
* target/arm: More conversions to decodetree of A64 SIMD insns
* hw/char/stm32l4x5_usart.c: Enable USART ACK bit response
* tests: update aarch64/sbsa-ref tests
* kvm: minor Coverity nit fixes
* docs/devel: Remove nested-papr.txt
# -----BEGIN PGP SIGNATURE-----
#
# iQJNBAABCAA3FiEE4aXFk81BneKOgxXPPCUl7RQ2DN4FAmbsIloZHHBldGVyLm1h
# eWRlbGxAbGluYXJvLm9yZwAKCRA8JSXtFDYM3k96EACgUaTSEX1BNee0Xsk8wIHb
# KebGryNymj7LPpWdRxiyQYmZbjfelZPJW7F2tr3iGfiphz+N5TIdRDQlPrRePNg4
# 5Ure0ShRgn+RlkjFe8r9yjrr3HEAXLnVb5fgzPqGEQ+UXWRVzr72+q+wrGFVuXtn
# zfqxpc6F0TCxeyH88X6mpKkTeEjfuE++TIf885VVX6nB8qkkDUifRLjdrDcJoxp3
# BRrE0Ntob2W0NAHm/QTbATLDErsYXIzm6pX2hWy3DNEQrVHw2rJ1FAzsjt2J/8ZU
# PC7hwmkPBsrnq5wcpBtOwloHzrIYuwUBI7ABPGctCPziCSw7N55vkgUmGWZ+nYHc
# DwYGu3H84su0hBi/E9gxl8z/ATBvuAIVa1RLHvbYiwdd088DqxdBe1YLLRaZeKzP
# oldQBzVegaN1n5n8tHO43b/38V7uknu3fDXGw/OrsO5DouDAj9NoRil1caRx4ZYd
# dr4IiWKzmlW8wpWgoBnrRbycuNsi6b9HblOX1umjwubCGO+GFesBRAInUeg9gbuv
# xolfYOScUE/nkTaqulAiPGqGZV8P0fqVMbXmXuowm7iIdw3JihEUm+mU18CTPFBd
# P/6SH47lXTaQA0JJQmD4LKraZQFYzm5rlSvW/15/mnzutZXMUlWzvxK/E5fX3vhL
# VUguR6XO8Cjb0cQJWohB1w==
# =JklH
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 19 Sep 2024 14:08:42 BST
# gpg: using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg: issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# gpg: aka "Peter Maydell <peter@archaic.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* tag 'pull-target-arm-20240919' of https://git.linaro.org/people/pmaydell/qemu-arm: (38 commits)
docs/devel: Remove nested-papr.txt
target/arm: Correct ID_AA64ISAR1_EL1 value for neoverse-v1
kvm: Remove unreachable code in kvm_dirty_ring_reaper_thread()
kvm: Make 'mmap_size' be 'int' in kvm_init_vcpu(), do_kvm_destroy_vcpu()
tests: drop OpenBSD tests for aarch64/sbsa-ref
tests: expand timeout information for aarch64/sbsa-ref
tests: add FreeBSD tests for aarch64/sbsa-ref
tests: use default cpu for aarch64/sbsa-ref
hw/char/stm32l4x5_usart.c: Enable USART ACK bit response
target/arm: Convert scalar [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree
target/arm: Widen NeonGenNarrowEnvFn return to 64 bits
target/arm: Convert VQSHL, VQSHLU to gvec
target/arm: Convert handle_scalar_simd_shli to decodetree
target/arm: Convert handle_scalar_simd_shri to decodetree
target/arm: Convert SHRN, RSHRN to decodetree
target/arm: Split out subroutines of handle_shri_with_rndacc
target/arm: Push tcg_rnd into handle_shri_with_rndacc
target/arm: Convert SSHLL, USHLL to decodetree
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  accel/kvm/kvm-all.c                         10
-rw-r--r--  docs/devel/nested-papr.txt                 119
-rw-r--r--  hw/char/stm32l4x5_usart.c                   16
-rw-r--r--  target/arm/helper.h                         34
-rw-r--r--  target/arm/tcg/a64.decode                  257
-rw-r--r--  target/arm/tcg/cpu64.c                       2
-rw-r--r--  target/arm/tcg/gengvec.c                   121
-rw-r--r--  target/arm/tcg/neon-dp.decode                6
-rw-r--r--  target/arm/tcg/neon_helper.c                76
-rw-r--r--  target/arm/tcg/translate-a64.c            2079
-rw-r--r--  target/arm/tcg/translate-neon.c            179
-rw-r--r--  target/arm/tcg/translate-sve.c             128
-rw-r--r--  target/arm/tcg/translate.h                  14
-rwxr-xr-x  tests/functional/test_aarch64_sbsaref.py    58
-rw-r--r--  tests/qtest/stm32l4x5_usart-test.c          36
15 files changed, 1478 insertions, 1657 deletions
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index beb1988..fe4cd72 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -414,7 +414,7 @@ int kvm_create_and_park_vcpu(CPUState *cpu)
 static int do_kvm_destroy_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
+    int mmap_size;
     int ret = 0;
 
     trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
@@ -459,7 +459,7 @@ void kvm_destroy_vcpu(CPUState *cpu)
 int kvm_init_vcpu(CPUState *cpu, Error **errp)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
+    int mmap_size;
     int ret;
 
     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
@@ -1525,11 +1525,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
         r->reaper_iteration++;
     }
 
-    trace_kvm_dirty_ring_reaper("exit");
-
-    rcu_unregister_thread();
-
-    return NULL;
+    g_assert_not_reached();
 }
 
 static void kvm_dirty_ring_reaper_init(KVMState *s)
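Context for the "long" to "int" change above: the per-vcpu mmap size is obtained
from an ioctl, and ioctl(2) returns int, so the wider type added nothing to the
error check. A minimal stand-alone sketch of the pattern (Linux-only, written
here without QEMU's kvm_ioctl() wrapper, as an illustration rather than the
patched code):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* KVM_GET_VCPU_MMAP_SIZE reports how much to mmap for each vcpu fd. */
    static int vcpu_mmap_size(int kvm_fd)
    {
        int mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        return mmap_size;   /* negative on error, as the callers check */
    }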
diff --git a/docs/devel/nested-papr.txt b/docs/devel/nested-papr.txt
deleted file mode 100644
index 9094365..0000000
--- a/docs/devel/nested-papr.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-Nested PAPR API (aka KVM on PowerVM)
-====================================
-
-This API aims at providing support to enable nested virtualization with
-KVM on PowerVM. While the existing support for nested KVM on PowerNV was
-introduced with cap-nested-hv option, however, with a slight design change,
-to enable this on papr/pseries, a new cap-nested-papr option is added. eg:
-
-  qemu-system-ppc64 -cpu POWER10 -machine pseries,cap-nested-papr=true ...
-
-Work by:
-  Michael Neuling <mikey@neuling.org>
-  Vaibhav Jain <vaibhav@linux.ibm.com>
-  Jordan Niethe <jniethe5@gmail.com>
-  Harsh Prateek Bora <harshpb@linux.ibm.com>
-  Shivaprasad G Bhat <sbhat@linux.ibm.com>
-  Kautuk Consul <kconsul@linux.vnet.ibm.com>
-
-Below taken from the kernel documentation:
-
-Introduction
-============
-
-This document explains how a guest operating system can act as a
-hypervisor and run nested guests through the use of hypercalls, if the
-hypervisor has implemented them. The terms L0, L1, and L2 are used to
-refer to different software entities. L0 is the hypervisor mode entity
-that would normally be called the "host" or "hypervisor". L1 is a
-guest virtual machine that is directly run under L0 and is initiated
-and controlled by L0. L2 is a guest virtual machine that is initiated
-and controlled by L1 acting as a hypervisor. A significant design change
-wrt existing API is that now the entire L2 state is maintained within L0.
-
-Existing Nested-HV API
-======================
-
-Linux/KVM has had support for Nesting as an L0 or L1 since 2018
-
-The L0 code was added::
-
-  commit 8e3f5fc1045dc49fd175b978c5457f5f51e7a2ce
-  Author: Paul Mackerras <paulus@ozlabs.org>
-  Date:   Mon Oct 8 16:31:03 2018 +1100
-  KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
-
-The L1 code was added::
-
-  commit 360cae313702cdd0b90f82c261a8302fecef030a
-  Author: Paul Mackerras <paulus@ozlabs.org>
-  Date:   Mon Oct 8 16:31:04 2018 +1100
-  KVM: PPC: Book3S HV: Nested guest entry via hypercall
-
-This API works primarily using a signal hcall h_enter_nested(). This
-call made by the L1 to tell the L0 to start an L2 vCPU with the given
-state. The L0 then starts this L2 and runs until an L2 exit condition
-is reached. Once the L2 exits, the state of the L2 is given back to
-the L1 by the L0. The full L2 vCPU state is always transferred from
-and to L1 when the L2 is run. The L0 doesn't keep any state on the L2
-vCPU (except in the short sequence in the L0 on L1 -> L2 entry and L2
--> L1 exit).
-
-The only state kept by the L0 is the partition table. The L1 registers
-it's partition table using the h_set_partition_table() hcall. All
-other state held by the L0 about the L2s is cached state (such as
-shadow page tables).
-
-The L1 may run any L2 or vCPU without first informing the L0. It
-simply starts the vCPU using h_enter_nested(). The creation of L2s and
-vCPUs is done implicitly whenever h_enter_nested() is called.
-
-In this document, we call this existing API the v1 API.
-
-New PAPR API
-===============
-
-The new PAPR API changes from the v1 API such that the creating L2 and
-associated vCPUs is explicit. In this document, we call this the v2
-API.
-
-h_enter_nested() is replaced with H_GUEST_VCPU_RUN(). Before this can
-be called the L1 must explicitly create the L2 using h_guest_create()
-and any associated vCPUs() created with h_guest_create_vCPU(). Getting
-and setting vCPU state can also be performed using h_guest_{g|s}et
-hcall.
-
-The basic execution flow is for an L1 to create an L2, run it, and
-delete it is:
-
-- L1 and L0 negotiate capabilities with H_GUEST_{G,S}ET_CAPABILITIES()
-  (normally at L1 boot time).
-
-- L1 requests the L0 to create an L2 with H_GUEST_CREATE() and receives a token
-
-- L1 requests the L0 to create an L2 vCPU with H_GUEST_CREATE_VCPU()
-
-- L1 and L0 communicate the vCPU state using the H_GUEST_{G,S}ET() hcall
-
-- L1 requests the L0 to run the vCPU using H_GUEST_RUN_VCPU() hcall
-
-- L1 deletes L2 with H_GUEST_DELETE()
-
-For more details, please refer:
-
-[1] Linux Kernel documentation (upstream documentation commit):
-
-commit 476652297f94a2e5e5ef29e734b0da37ade94110
-Author: Michael Neuling <mikey@neuling.org>
-Date:   Thu Sep 14 13:06:00 2023 +1000
-
-    docs: powerpc: Document nested KVM on POWER
-
-    Document support for nested KVM on POWER using the existing API as well
-    as the new PAPR API. This includes the new HCALL interface and how it
-    used by KVM.
-
-    Signed-off-by: Michael Neuling <mikey@neuling.org>
-    Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
-    Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-    Link: https://msgid.link/20230914030600.16993-12-jniethe5@gmail.com
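The deleted document's bullet list describes the v2 lifecycle step by step. As
a stubbed pseudo-C rendering of that same sequence (the h_guest_* functions
below are hypothetical stand-ins for the hcalls the text names, not a real
API):

    #include <stdio.h>

    static long h_guest_create(void)        { puts("H_GUEST_CREATE");      return 42; }
    static void h_guest_create_vcpu(long t) { (void)t; puts("H_GUEST_CREATE_VCPU"); }
    static void h_guest_set(long t)         { (void)t; puts("H_GUEST_SET"); }
    static void h_guest_run_vcpu(long t)    { (void)t; puts("H_GUEST_RUN_VCPU"); }
    static void h_guest_get(long t)         { (void)t; puts("H_GUEST_GET"); }
    static void h_guest_delete(long t)      { (void)t; puts("H_GUEST_DELETE"); }

    int main(void)
    {
        long token = h_guest_create();  /* the L2 is now created explicitly */
        h_guest_create_vcpu(token);     /* ...and so are its vCPUs */
        h_guest_set(token);             /* push initial vCPU state to the L0 */
        h_guest_run_vcpu(token);        /* run until an L2 exit condition */
        h_guest_get(token);             /* pull state back after the exit */
        h_guest_delete(token);          /* explicit teardown */
        return 0;
    }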
diff --git a/hw/char/stm32l4x5_usart.c b/hw/char/stm32l4x5_usart.c
index fc5dcac..3cf200c 100644
--- a/hw/char/stm32l4x5_usart.c
+++ b/hw/char/stm32l4x5_usart.c
@@ -154,6 +154,21 @@ REG32(RDR, 0x24)
 REG32(TDR, 0x28)
     FIELD(TDR, TDR, 0, 9)
 
+static void stm32l4x5_update_isr(Stm32l4x5UsartBaseState *s)
+{
+    if (s->cr1 & R_CR1_TE_MASK) {
+        s->isr |= R_ISR_TEACK_MASK;
+    } else {
+        s->isr &= ~R_ISR_TEACK_MASK;
+    }
+
+    if (s->cr1 & R_CR1_RE_MASK) {
+        s->isr |= R_ISR_REACK_MASK;
+    } else {
+        s->isr &= ~R_ISR_REACK_MASK;
+    }
+}
+
 static void stm32l4x5_update_irq(Stm32l4x5UsartBaseState *s)
 {
     if (((s->isr & R_ISR_WUF_MASK) && (s->cr3 & R_CR3_WUFIE_MASK)) ||
@@ -456,6 +471,7 @@ static void stm32l4x5_usart_base_write(void *opaque, hwaddr addr,
     case A_CR1:
         s->cr1 = value;
         stm32l4x5_update_params(s);
+        stm32l4x5_update_isr(s);
         stm32l4x5_update_irq(s);
         return;
     case A_CR2:
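The new stm32l4x5_update_isr() above mirrors CR1.TE/CR1.RE into the TEACK/REACK
acknowledge bits of ISR. For illustration, a guest-side enable sequence that
this change makes work; the bit positions are assumptions taken from the
STM32L4 reference manual, not from this patch:

    #include <stdint.h>

    #define USART_CR1_TE     (1u << 3)    /* transmitter enable (assumed) */
    #define USART_ISR_TEACK  (1u << 21)   /* TE acknowledge flag (assumed) */

    static void usart_enable_tx(volatile uint32_t *cr1, volatile uint32_t *isr)
    {
        *cr1 |= USART_CR1_TE;                 /* request transmitter enable */
        while (!(*isr & USART_ISR_TEACK)) {
            /* spin until the peripheral acknowledges, as the model now does */
        }
    }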
diff --git a/target/arm/helper.h b/target/arm/helper.h
index b463be3..58919b6 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -324,6 +324,18 @@ DEF_HELPER_FLAGS_5(neon_uqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32
 DEF_HELPER_FLAGS_5(neon_uqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_uqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_uqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_srshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -363,17 +375,17 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
 DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32)
 DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32)
 
-DEF_HELPER_1(neon_narrow_u8, i32, i64)
-DEF_HELPER_1(neon_narrow_u16, i32, i64)
-DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
+DEF_HELPER_1(neon_narrow_u8, i64, i64)
+DEF_HELPER_1(neon_narrow_u16, i64, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s8, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s16, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s32, i64, env, i64)
 DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_high_u16, i32, i64)
 DEF_HELPER_1(neon_narrow_round_high_u8, i32, i64)
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 62df4c4..331a8e1 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -30,10 +30,12 @@
 &rri_sf         rd rn imm sf
 &i              imm
 &rr_e           rd rn esz
+&rri_e          rd rn imm esz
 &rrr_e          rd rn rm esz
 &rrx_e          rd rn rm idx esz
 &rrrr_e         rd rn rm ra esz
 &qrr_e          q rd rn esz
+&qrri_e         q rd rn imm esz
 &qrrr_e         q rd rn rm esz
 &qrrx_e         q rd rn rm idx esz
 &qrrrr_e        q rd rn rm ra esz
@@ -54,11 +56,15 @@
 @rrx_d          ........ .. . rm:5 .... idx:1 . rn:5 rd:5  &rrx_e esz=3
 
 @rr_q1e0        ........ ........ ...... rn:5 rd:5         &qrr_e q=1 esz=0
+@rr_q1e2        ........ ........ ...... rn:5 rd:5         &qrr_e q=1 esz=2
 @r2r_q1e0       ........ ........ ...... rm:5 rd:5         &qrrr_e rn=%rd q=1 esz=0
 @rrr_q1e0       ........ ... rm:5 ...... rn:5 rd:5         &qrrr_e q=1 esz=0
 @rrr_q1e3       ........ ... rm:5 ...... rn:5 rd:5         &qrrr_e q=1 esz=3
 @rrrr_q1e3      ........ ... rm:5 . ra:5 rn:5 rd:5         &qrrrr_e q=1 esz=3
 
+@qrr_h          . q:1 ...... .. ...... ...... rn:5 rd:5    &qrr_e esz=1
+@qrr_e          . q:1 ...... esz:2 ...... ...... rn:5 rd:5 &qrr_e
+
 @qrrr_b         . q:1 ...... ... rm:5 ...... rn:5 rd:5     &qrrr_e esz=0
 @qrrr_h         . q:1 ...... ... rm:5 ...... rn:5 rd:5     &qrrr_e esz=1
 @qrrr_s         . q:1 ...... ... rm:5 ...... rn:5 rd:5     &qrrr_e esz=2
@@ -1136,3 +1142,254 @@ FMADD           0001 1111 .. 0 ..... 0 ..... ..... .....   @rrrr_hsd
 FMSUB           0001 1111 .. 0 ..... 1 ..... ..... .....   @rrrr_hsd
 FNMADD          0001 1111 .. 1 ..... 0 ..... ..... .....   @rrrr_hsd
 FNMSUB          0001 1111 .. 1 ..... 1 ..... ..... .....   @rrrr_hsd
+
+# Advanced SIMD Extract
+
+EXT_d           0010 1110 00 0 rm:5 00 imm:3 0 rn:5 rd:5
+EXT_q           0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
+
+# Advanced SIMD Table Lookup
+
+TBL_TBX         0 q:1 00 1110 000 rm:5 0 len:2 tbx:1 00 rn:5 rd:5
+
+# Advanced SIMD Permute
+
+UZP1            0.00 1110 .. 0 ..... 0 001 10 ..... .....  @qrrr_e
+UZP2            0.00 1110 .. 0 ..... 0 101 10 ..... .....  @qrrr_e
+TRN1            0.00 1110 .. 0 ..... 0 010 10 ..... .....  @qrrr_e
+TRN2            0.00 1110 .. 0 ..... 0 110 10 ..... .....  @qrrr_e
+ZIP1            0.00 1110 .. 0 ..... 0 011 10 ..... .....  @qrrr_e
+ZIP2            0.00 1110 .. 0 ..... 0 111 10 ..... .....  @qrrr_e
+
+# Advanced SIMD Across Lanes
+
+ADDV            0.00 1110 .. 11000 11011 10 ..... .....    @qrr_e
+SADDLV          0.00 1110 .. 11000 00011 10 ..... .....    @qrr_e
+UADDLV          0.10 1110 .. 11000 00011 10 ..... .....    @qrr_e
+SMAXV           0.00 1110 .. 11000 01010 10 ..... .....    @qrr_e
+UMAXV           0.10 1110 .. 11000 01010 10 ..... .....    @qrr_e
+SMINV           0.00 1110 .. 11000 11010 10 ..... .....    @qrr_e
+UMINV           0.10 1110 .. 11000 11010 10 ..... .....    @qrr_e
+
+FMAXNMV_h       0.00 1110 00 11000 01100 10 ..... .....    @qrr_h
+FMAXNMV_s       0110 1110 00 11000 01100 10 ..... .....    @rr_q1e2
+
+FMINNMV_h       0.00 1110 10 11000 01100 10 ..... .....    @qrr_h
+FMINNMV_s       0110 1110 10 11000 01100 10 ..... .....    @rr_q1e2
+
+FMAXV_h         0.00 1110 00 11000 01111 10 ..... .....    @qrr_h
+FMAXV_s         0110 1110 00 11000 01111 10 ..... .....    @rr_q1e2
+
+FMINV_h         0.00 1110 10 11000 01111 10 ..... .....    @qrr_h
+FMINV_s         0110 1110 10 11000 01111 10 ..... .....    @rr_q1e2
+
+# Floating-point Immediate
+
+FMOVI_s         0001 1110 .. 1 imm:8 100 00000 rd:5        esz=%esz_hsd
+
+# Advanced SIMD Modified Immediate / Shift by Immediate
+
+%abcdefgh       16:3 5:5
+
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%neon_rshift_i6 16:6 !function=rsub_64
+%neon_rshift_i5 16:5 !function=rsub_32
+%neon_rshift_i4 16:4 !function=rsub_16
+%neon_rshift_i3 16:3 !function=rsub_8
+
+@q_shri_b       . q:1 .. ..... 0001 ... ..... . rn:5 rd:5 \
+                &qrri_e esz=0 imm=%neon_rshift_i3
+@q_shri_h       . q:1 .. ..... 001 .... ..... . rn:5 rd:5 \
+                &qrri_e esz=1 imm=%neon_rshift_i4
+@q_shri_s       . q:1 .. ..... 01 ..... ..... . rn:5 rd:5 \
+                &qrri_e esz=2 imm=%neon_rshift_i5
+@q_shri_d       . 1 .. ..... 1 ...... ..... . rn:5 rd:5 \
+                &qrri_e esz=3 imm=%neon_rshift_i6 q=1
+
+@q_shli_b       . q:1 .. ..... 0001 imm:3 ..... . rn:5 rd:5  &qrri_e esz=0
+@q_shli_h       . q:1 .. ..... 001 imm:4 ..... . rn:5 rd:5   &qrri_e esz=1
+@q_shli_s       . q:1 .. ..... 01 imm:5 ..... . rn:5 rd:5    &qrri_e esz=2
+@q_shli_d       . 1 .. ..... 1 imm:6 ..... . rn:5 rd:5       &qrri_e esz=3 q=1
+
+FMOVI_v_h       0 q:1 00 1111 00000 ... 1111 11 ..... rd:5   %abcdefgh
+
+# MOVI, MVNI, ORR, BIC, FMOV are all intermixed via cmode.
+Vimm            0 q:1 op:1 0 1111 00000 ... cmode:4 01 ..... rd:5 %abcdefgh
+
+SSHR_v          0.00 11110 .... ... 00000 1 ..... .....  @q_shri_b
+SSHR_v          0.00 11110 .... ... 00000 1 ..... .....  @q_shri_h
+SSHR_v          0.00 11110 .... ... 00000 1 ..... .....  @q_shri_s
+SSHR_v          0.00 11110 .... ... 00000 1 ..... .....  @q_shri_d
+
+USHR_v          0.10 11110 .... ... 00000 1 ..... .....  @q_shri_b
+USHR_v          0.10 11110 .... ... 00000 1 ..... .....  @q_shri_h
+USHR_v          0.10 11110 .... ... 00000 1 ..... .....  @q_shri_s
+USHR_v          0.10 11110 .... ... 00000 1 ..... .....  @q_shri_d
+
+SSRA_v          0.00 11110 .... ... 00010 1 ..... .....  @q_shri_b
+SSRA_v          0.00 11110 .... ... 00010 1 ..... .....  @q_shri_h
+SSRA_v          0.00 11110 .... ... 00010 1 ..... .....  @q_shri_s
+SSRA_v          0.00 11110 .... ... 00010 1 ..... .....  @q_shri_d
+
+USRA_v          0.10 11110 .... ... 00010 1 ..... .....  @q_shri_b
+USRA_v          0.10 11110 .... ... 00010 1 ..... .....  @q_shri_h
+USRA_v          0.10 11110 .... ... 00010 1 ..... .....  @q_shri_s
+USRA_v          0.10 11110 .... ... 00010 1 ..... .....  @q_shri_d
+
+SRSHR_v         0.00 11110 .... ... 00100 1 ..... .....  @q_shri_b
+SRSHR_v         0.00 11110 .... ... 00100 1 ..... .....  @q_shri_h
+SRSHR_v         0.00 11110 .... ... 00100 1 ..... .....  @q_shri_s
+SRSHR_v         0.00 11110 .... ... 00100 1 ..... .....  @q_shri_d
+
+URSHR_v         0.10 11110 .... ... 00100 1 ..... .....  @q_shri_b
+URSHR_v         0.10 11110 .... ... 00100 1 ..... .....  @q_shri_h
+URSHR_v         0.10 11110 .... ... 00100 1 ..... .....  @q_shri_s
+URSHR_v         0.10 11110 .... ... 00100 1 ..... .....  @q_shri_d
+
+SRSRA_v         0.00 11110 .... ... 00110 1 ..... .....  @q_shri_b
+SRSRA_v         0.00 11110 .... ... 00110 1 ..... .....  @q_shri_h
+SRSRA_v         0.00 11110 .... ... 00110 1 ..... .....  @q_shri_s
+SRSRA_v         0.00 11110 .... ... 00110 1 ..... .....  @q_shri_d
+
+URSRA_v         0.10 11110 .... ... 00110 1 ..... .....  @q_shri_b
+URSRA_v         0.10 11110 .... ... 00110 1 ..... .....  @q_shri_h
+URSRA_v         0.10 11110 .... ... 00110 1 ..... .....  @q_shri_s
+URSRA_v         0.10 11110 .... ... 00110 1 ..... .....  @q_shri_d
+
+SRI_v           0.10 11110 .... ... 01000 1 ..... .....  @q_shri_b
+SRI_v           0.10 11110 .... ... 01000 1 ..... .....  @q_shri_h
+SRI_v           0.10 11110 .... ... 01000 1 ..... .....  @q_shri_s
+SRI_v           0.10 11110 .... ... 01000 1 ..... .....  @q_shri_d
+
+SHL_v           0.00 11110 .... ... 01010 1 ..... .....  @q_shli_b
+SHL_v           0.00 11110 .... ... 01010 1 ..... .....  @q_shli_h
+SHL_v           0.00 11110 .... ... 01010 1 ..... .....  @q_shli_s
+SHL_v           0.00 11110 .... ... 01010 1 ..... .....  @q_shli_d
+
+SLI_v           0.10 11110 .... ... 01010 1 ..... .....  @q_shli_b
+SLI_v           0.10 11110 .... ... 01010 1 ..... .....  @q_shli_h
+SLI_v           0.10 11110 .... ... 01010 1 ..... .....  @q_shli_s
+SLI_v           0.10 11110 .... ... 01010 1 ..... .....  @q_shli_d
+
+SSHLL_v         0.00 11110 .... ... 10100 1 ..... .....  @q_shli_b
+SSHLL_v         0.00 11110 .... ... 10100 1 ..... .....  @q_shli_h
+SSHLL_v         0.00 11110 .... ... 10100 1 ..... .....  @q_shli_s
+
+USHLL_v         0.10 11110 .... ... 10100 1 ..... .....  @q_shli_b
+USHLL_v         0.10 11110 .... ... 10100 1 ..... .....  @q_shli_h
+USHLL_v         0.10 11110 .... ... 10100 1 ..... .....  @q_shli_s
+
+SHRN_v          0.00 11110 .... ... 10000 1 ..... .....  @q_shri_b
+SHRN_v          0.00 11110 .... ... 10000 1 ..... .....  @q_shri_h
+SHRN_v          0.00 11110 .... ... 10000 1 ..... .....  @q_shri_s
+
+RSHRN_v         0.00 11110 .... ... 10001 1 ..... .....  @q_shri_b
+RSHRN_v         0.00 11110 .... ... 10001 1 ..... .....  @q_shri_h
+RSHRN_v         0.00 11110 .... ... 10001 1 ..... .....  @q_shri_s
+
+SQSHL_vi        0.00 11110 .... ... 01110 1 ..... .....  @q_shli_b
+SQSHL_vi        0.00 11110 .... ... 01110 1 ..... .....  @q_shli_h
+SQSHL_vi        0.00 11110 .... ... 01110 1 ..... .....  @q_shli_s
+SQSHL_vi        0.00 11110 .... ... 01110 1 ..... .....  @q_shli_d
+
+UQSHL_vi        0.10 11110 .... ... 01110 1 ..... .....  @q_shli_b
+UQSHL_vi        0.10 11110 .... ... 01110 1 ..... .....  @q_shli_h
+UQSHL_vi        0.10 11110 .... ... 01110 1 ..... .....  @q_shli_s
+UQSHL_vi        0.10 11110 .... ... 01110 1 ..... .....  @q_shli_d
+
+SQSHLU_vi       0.10 11110 .... ... 01100 1 ..... .....  @q_shli_b
+SQSHLU_vi       0.10 11110 .... ... 01100 1 ..... .....  @q_shli_h
+SQSHLU_vi       0.10 11110 .... ... 01100 1 ..... .....  @q_shli_s
+SQSHLU_vi       0.10 11110 .... ... 01100 1 ..... .....  @q_shli_d
+
+SQSHRN_v        0.00 11110 .... ... 10010 1 ..... .....  @q_shri_b
+SQSHRN_v        0.00 11110 .... ... 10010 1 ..... .....  @q_shri_h
+SQSHRN_v        0.00 11110 .... ... 10010 1 ..... .....  @q_shri_s
+
+UQSHRN_v        0.10 11110 .... ... 10010 1 ..... .....  @q_shri_b
+UQSHRN_v        0.10 11110 .... ... 10010 1 ..... .....  @q_shri_h
+UQSHRN_v        0.10 11110 .... ... 10010 1 ..... .....  @q_shri_s
+
+SQSHRUN_v       0.10 11110 .... ... 10000 1 ..... .....  @q_shri_b
+SQSHRUN_v       0.10 11110 .... ... 10000 1 ..... .....  @q_shri_h
+SQSHRUN_v       0.10 11110 .... ... 10000 1 ..... .....  @q_shri_s
+
+SQRSHRN_v       0.00 11110 .... ... 10011 1 ..... .....  @q_shri_b
+SQRSHRN_v       0.00 11110 .... ... 10011 1 ..... .....  @q_shri_h
+SQRSHRN_v       0.00 11110 .... ... 10011 1 ..... .....  @q_shri_s
+
+UQRSHRN_v       0.10 11110 .... ... 10011 1 ..... .....  @q_shri_b
+UQRSHRN_v       0.10 11110 .... ... 10011 1 ..... .....  @q_shri_h
+UQRSHRN_v       0.10 11110 .... ... 10011 1 ..... .....  @q_shri_s
+
+SQRSHRUN_v      0.10 11110 .... ... 10001 1 ..... .....  @q_shri_b
+SQRSHRUN_v      0.10 11110 .... ... 10001 1 ..... .....  @q_shri_h
+SQRSHRUN_v      0.10 11110 .... ... 10001 1 ..... .....  @q_shri_s
+
+# Advanced SIMD scalar shift by immediate
+
+@shri_b         .... ..... 0001 ... ..... . rn:5 rd:5 \
+                &rri_e esz=0 imm=%neon_rshift_i3
+@shri_h         .... ..... 001 .... ..... . rn:5 rd:5 \
+                &rri_e esz=1 imm=%neon_rshift_i4
+@shri_s         .... ..... 01 ..... ..... . rn:5 rd:5 \
+                &rri_e esz=2 imm=%neon_rshift_i5
+@shri_d         .... ..... 1 ...... ..... . rn:5 rd:5 \
+                &rri_e esz=3 imm=%neon_rshift_i6
+
+@shli_b         .... ..... 0001 imm:3 ..... . rn:5 rd:5  &rri_e esz=0
+@shli_h         .... ..... 001 imm:4 ..... . rn:5 rd:5   &rri_e esz=1
+@shli_s         .... ..... 01 imm:5 ..... . rn:5 rd:5    &rri_e esz=2
+@shli_d         .... ..... 1 imm:6 ..... . rn:5 rd:5     &rri_e esz=3
+
+SSHR_s          0101 11110 .... ... 00000 1 ..... .....  @shri_d
+USHR_s          0111 11110 .... ... 00000 1 ..... .....  @shri_d
+SSRA_s          0101 11110 .... ... 00010 1 ..... .....  @shri_d
+USRA_s          0111 11110 .... ... 00010 1 ..... .....  @shri_d
+SRSHR_s         0101 11110 .... ... 00100 1 ..... .....  @shri_d
+URSHR_s         0111 11110 .... ... 00100 1 ..... .....  @shri_d
+SRSRA_s         0101 11110 .... ... 00110 1 ..... .....  @shri_d
+URSRA_s         0111 11110 .... ... 00110 1 ..... .....  @shri_d
+SRI_s           0111 11110 .... ... 01000 1 ..... .....  @shri_d
+
+SHL_s           0101 11110 .... ... 01010 1 ..... .....  @shli_d
+SLI_s           0111 11110 .... ... 01010 1 ..... .....  @shli_d
+
+SQSHL_si        0101 11110 .... ... 01110 1 ..... .....  @shli_b
+SQSHL_si        0101 11110 .... ... 01110 1 ..... .....  @shli_h
+SQSHL_si        0101 11110 .... ... 01110 1 ..... .....  @shli_s
+SQSHL_si        0101 11110 .... ... 01110 1 ..... .....  @shli_d
+
+UQSHL_si        0111 11110 .... ... 01110 1 ..... .....  @shli_b
+UQSHL_si        0111 11110 .... ... 01110 1 ..... .....  @shli_h
+UQSHL_si        0111 11110 .... ... 01110 1 ..... .....  @shli_s
+UQSHL_si        0111 11110 .... ... 01110 1 ..... .....  @shli_d
+
+SQSHLU_si       0111 11110 .... ... 01100 1 ..... .....  @shli_b
+SQSHLU_si       0111 11110 .... ... 01100 1 ..... .....  @shli_h
+SQSHLU_si       0111 11110 .... ... 01100 1 ..... .....  @shli_s
+SQSHLU_si       0111 11110 .... ... 01100 1 ..... .....  @shli_d
+
+SQSHRN_si       0101 11110 .... ... 10010 1 ..... .....  @shri_b
+SQSHRN_si       0101 11110 .... ... 10010 1 ..... .....  @shri_h
+SQSHRN_si       0101 11110 .... ... 10010 1 ..... .....  @shri_s
+
+UQSHRN_si       0111 11110 .... ... 10010 1 ..... .....  @shri_b
+UQSHRN_si       0111 11110 .... ... 10010 1 ..... .....  @shri_h
+UQSHRN_si       0111 11110 .... ... 10010 1 ..... .....  @shri_s
+
+SQSHRUN_si      0111 11110 .... ... 10000 1 ..... .....  @shri_b
+SQSHRUN_si      0111 11110 .... ... 10000 1 ..... .....  @shri_h
+SQSHRUN_si      0111 11110 .... ... 10000 1 ..... .....  @shri_s
+
+SQRSHRN_si      0101 11110 .... ... 10011 1 ..... .....  @shri_b
+SQRSHRN_si      0101 11110 .... ... 10011 1 ..... .....  @shri_h
+SQRSHRN_si      0101 11110 .... ... 10011 1 ..... .....  @shri_s
+
+UQRSHRN_si      0111 11110 .... ... 10011 1 ..... .....  @shri_b
+UQRSHRN_si      0111 11110 .... ... 10011 1 ..... .....  @shri_h
+UQRSHRN_si      0111 11110 .... ... 10011 1 ..... .....  @shri_s
+
+SQRSHRUN_si     0111 11110 .... ... 10001 1 ..... .....  @shri_b
+SQRSHRUN_si     0111 11110 .... ... 10001 1 ..... .....  @shri_h
+SQRSHRUN_si     0111 11110 .... ... 10001 1 ..... .....  @shri_s
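The "Right shifts are encoded as N - shift" comment above is implemented by the
%neon_rshift_iN fields through their !function=rsub_N adjusters. A sketch of
what those adjusters compute (QEMU's versions also take a DisasContext
argument, elided here): a byte-element right shift by 3 is stored in the
immediate field as 8 - 3 = 5, and rsub_8(5) recovers 3.

    /* Recover the shift amount from an immediate stored as N - shift. */
    static int rsub_64(int x) { return 64 - x; }
    static int rsub_32(int x) { return 32 - x; }
    static int rsub_16(int x) { return 16 - x; }
    static int rsub_8(int x)  { return 8 - x; }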
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
index b9f34f0..0168920 100644
--- a/target/arm/tcg/cpu64.c
+++ b/target/arm/tcg/cpu64.c
@@ -677,7 +677,7 @@ static void aarch64_neoverse_v1_initfn(Object *obj)
     cpu->isar.id_aa64dfr0 = 0x000001f210305519ull;
     cpu->isar.id_aa64dfr1 = 0x00000000;
     cpu->isar.id_aa64isar0 = 0x1011111110212120ull; /* with FEAT_RNG */
-    cpu->isar.id_aa64isar1 = 0x0111000001211032ull;
+    cpu->isar.id_aa64isar1 = 0x0011100001211032ull;
     cpu->isar.id_aa64mmfr0 = 0x0000000000101125ull;
     cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull;
     cpu->isar.id_aa64mmfr2 = 0x0220011102101011ull;
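What the one-nibble correction above changes, assuming the 4-bit ID register
field layout from the Arm ARM (BF16 at bits [47:44], XS at bits [59:56]): the
old constant advertised FEAT_XS, which Neoverse V1 does not implement, while
leaving BF16 clear.

    #include <stdint.h>

    /* Extract one 4-bit ID register field. */
    static unsigned id_field(uint64_t reg, unsigned lsb)
    {
        return (unsigned)(reg >> lsb) & 0xf;
    }

    /* old: id_field(0x0111000001211032ull, 56) == 1  (XS set, wrong)
     *      id_field(0x0111000001211032ull, 44) == 0  (BF16 clear, wrong)
     * new: id_field(0x0011100001211032ull, 56) == 0
     *      id_field(0x0011100001211032ull, 44) == 1 */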
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 56a1dc1..f652520 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -88,6 +88,25 @@ GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
 
 #undef GEN_CMP0
 
+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+    /* Signed shift out of range results in all-sign-bits */
+    shift = MIN(shift, (8 << vece) - 1);
+    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+}
+
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+    /* Unsigned shift out of range results in all-zero-bits */
+    if (shift >= (8 << vece)) {
+        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
+    } else {
+        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+    }
+}
+
 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 {
     tcg_gen_vec_sar8i_i64(a, a, shift);
@@ -285,7 +304,7 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
     tcg_gen_add_i32(d, d, t);
 }
 
- void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
 {
     TCGv_i64 t = tcg_temp_new_i64();
 
@@ -297,10 +316,9 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec ones = tcg_temp_new_vec_matching(d);
+    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
 
     tcg_gen_shri_vec(vece, t, a, sh - 1);
-    tcg_gen_dupi_vec(vece, ones, 1);
     tcg_gen_and_vec(vece, t, t, ones);
     tcg_gen_sari_vec(vece, d, a, sh);
     tcg_gen_add_vec(vece, d, d, t);
@@ -492,10 +510,9 @@ void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec ones = tcg_temp_new_vec_matching(d);
+    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
 
     tcg_gen_shri_vec(vece, t, a, shift - 1);
-    tcg_gen_dupi_vec(vece, ones, 1);
     tcg_gen_and_vec(vece, t, t, ones);
     tcg_gen_shri_vec(vece, d, a, shift);
     tcg_gen_add_vec(vece, d, d, t);
@@ -685,9 +702,9 @@ static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec m = tcg_temp_new_vec_matching(d);
+    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
 
-    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
     tcg_gen_shri_vec(vece, t, a, sh);
     tcg_gen_and_vec(vece, d, d, m);
     tcg_gen_or_vec(vece, d, d, t);
@@ -773,10 +790,9 @@ static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec m = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
 
     tcg_gen_shli_vec(vece, t, a, sh);
-    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
     tcg_gen_and_vec(vece, d, d, m);
     tcg_gen_or_vec(vece, d, d, t);
 }
@@ -1044,14 +1060,13 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
-    TCGv_vec msk, max;
+    TCGv_vec max, zero;
 
     tcg_gen_neg_vec(vece, rsh, shift);
     if (vece == MO_8) {
         tcg_gen_mov_vec(lsh, shift);
     } else {
-        msk = tcg_temp_new_vec_matching(dst);
-        tcg_gen_dupi_vec(vece, msk, 0xff);
+        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
         tcg_gen_and_vec(vece, lsh, shift, msk);
         tcg_gen_and_vec(vece, rsh, rsh, msk);
     }
@@ -1064,26 +1079,21 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
     tcg_gen_shlv_vec(vece, lval, src, lsh);
     tcg_gen_shrv_vec(vece, rval, src, rsh);
 
-    max = tcg_temp_new_vec_matching(dst);
-    tcg_gen_dupi_vec(vece, max, 8 << vece);
-
     /*
-     * The choice of LT (signed) and GEU (unsigned) are biased toward
+     * The choice of GE (signed) and GEU (unsigned) are biased toward
      * the instructions of the x86_64 host.  For MO_8, the whole byte
      * is significant so we must use an unsigned compare; otherwise we
      * have already masked to a byte and so a signed compare works.
      * Other tcg hosts have a full set of comparisons and do not care.
      */
+    zero = tcg_constant_vec_matching(dst, vece, 0);
+    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
     if (vece == MO_8) {
-        tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
-        tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
-        tcg_gen_andc_vec(vece, lval, lval, lsh);
-        tcg_gen_andc_vec(vece, rval, rval, rsh);
+        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
     } else {
-        tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
-        tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
-        tcg_gen_and_vec(vece, lval, lval, lsh);
-        tcg_gen_and_vec(vece, rval, rval, rsh);
+        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
     }
     tcg_gen_or_vec(vece, dst, lval, rval);
 }
@@ -1093,7 +1103,7 @@ void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 {
     static const TCGOpcode vecop_list[] = {
         INDEX_op_neg_vec, INDEX_op_shlv_vec,
-        INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
+        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
     };
     static const GVecGen3 ops[4] = {
         { .fniv = gen_ushl_vec,
@@ -1169,7 +1179,7 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
-    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
+    TCGv_vec max, zero;
 
     /*
      * Rely on the TCG guarantee that out of range shifts produce
@@ -1180,29 +1190,28 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
     if (vece == MO_8) {
         tcg_gen_mov_vec(lsh, shift);
     } else {
-        tcg_gen_dupi_vec(vece, tmp, 0xff);
-        tcg_gen_and_vec(vece, lsh, shift, tmp);
-        tcg_gen_and_vec(vece, rsh, rsh, tmp);
+        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
+        tcg_gen_and_vec(vece, lsh, shift, msk);
+        tcg_gen_and_vec(vece, rsh, rsh, msk);
     }
 
     /* Bound rsh so out of bound right shift gets -1.  */
-    tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
-    tcg_gen_umin_vec(vece, rsh, rsh, tmp);
-    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
+    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
+    tcg_gen_umin_vec(vece, rsh, rsh, max);
 
     tcg_gen_shlv_vec(vece, lval, src, lsh);
     tcg_gen_sarv_vec(vece, rval, src, rsh);
 
     /* Select in-bound left shift.  */
-    tcg_gen_andc_vec(vece, lval, lval, tmp);
+    zero = tcg_constant_vec_matching(dst, vece, 0);
+    tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);
 
     /* Select between left and right shift.  */
     if (vece == MO_8) {
-        tcg_gen_dupi_vec(vece, tmp, 0);
-        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
     } else {
-        tcg_gen_dupi_vec(vece, tmp, 0x80);
-        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
+        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
+        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
     }
 }
@@ -1211,7 +1220,7 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 {
     static const TCGOpcode vecop_list[] = {
         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
-        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+        INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
     };
     static const GVecGen3 ops[4] = {
         { .fniv = gen_sshl_vec,
@@ -1304,6 +1313,42 @@ void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                            opr_sz, max_sz, 0, fns[vece]);
 }
 
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
+        gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
+        gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                      int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
+        gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
 {
     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
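The comments in the new gen_gvec_sshr()/gen_gvec_ushr() above state the
architectural rule for out-of-range shift immediates. A scalar model of that
rule for byte elements, as an illustration only (signed right shift is assumed
arithmetic, as on the hosts QEMU targets):

    #include <stdint.h>

    /* SSHR #8 on a byte produces all sign bits: clamp the count to 7. */
    static int8_t model_sshr8(int8_t x, int shift)
    {
        return (int8_t)(x >> (shift > 7 ? 7 : shift));
    }

    /* USHR #8 on a byte produces all zeros. */
    static uint8_t model_ushr8(uint8_t x, int shift)
    {
        return shift > 7 ? 0 : (uint8_t)(x >> shift);
    }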
diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode
index 788578c..e883c6a 100644
--- a/target/arm/tcg/neon-dp.decode
+++ b/target/arm/tcg/neon-dp.decode
@@ -291,17 +291,17 @@ VSLI_2sh        1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
 VSLI_2sh        1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
 VSLI_2sh        1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
 
-VQSHLU_64_2sh   1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
+VQSHLU_2sh      1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
 VQSHLU_2sh      1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s
 VQSHLU_2sh      1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h
 VQSHLU_2sh      1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b
 
-VQSHL_S_64_2sh  1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_S_2sh     1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
 VQSHL_S_2sh     1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
 VQSHL_S_2sh     1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
 VQSHL_S_2sh     1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
 
-VQSHL_U_64_2sh  1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_U_2sh     1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
 VQSHL_U_2sh     1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
 VQSHL_U_2sh     1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
 VQSHL_U_2sh     1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c
index 082bfd8..93b2076 100644
--- a/target/arm/tcg/neon_helper.c
+++ b/target/arm/tcg/neon_helper.c
@@ -141,6 +141,19 @@ void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
 }
 
+#define NEON_GVEC_VOP2i_ENV(name, vtype)                                \
+void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc)        \
+{                                                                       \
+    intptr_t i, opr_sz = simd_oprsz(desc);                              \
+    int imm = simd_data(desc);                                          \
+    vtype *d = vd, *n = vn;                                             \
+    CPUARMState *env = venv;                                            \
+    for (i = 0; i < opr_sz / sizeof(vtype); i++) {                      \
+        NEON_FN(d[i], n[i], imm);                                       \
+    }                                                                   \
+    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
+}
+
 /* Pairwise operations.  */
 /* For 32-bit elements each segment only contains a single element, so
    the elementwise and pairwise operations are the same.  */
@@ -271,22 +284,26 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -303,22 +320,26 @@ uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -334,11 +355,13 @@ uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
 #define NEON_FN(dest, src1, src2) \
     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -352,6 +375,16 @@ uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
 }
 
 #define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
 NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
@@ -565,13 +598,15 @@ NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
 #undef NEON_FN
 #undef NEON_QDMULH32
 
-uint32_t HELPER(neon_narrow_u8)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u8)(uint64_t x)
 {
     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
            | ((x >> 24) & 0xff000000u);
 }
 
-uint32_t HELPER(neon_narrow_u16)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u16)(uint64_t x)
 {
     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
 }
@@ -602,7 +637,8 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 }
 
-uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -629,7 +665,8 @@ uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -652,7 +689,8 @@ uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
 {
     int16_t s;
     uint8_t d;
@@ -675,7 +713,8 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -695,10 +734,11 @@ uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
         high = 0xffff;
         SET_QC();
     }
-    return low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -712,10 +752,11 @@ uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
         high = 0xffff;
         SET_QC();
     }
-    return low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
 {
     int32_t low;
     int32_t high;
@@ -729,10 +770,11 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
         high = (high >> 31) ^ 0x7fff;
         SET_QC();
     }
-    return (uint16_t)low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
 {
     if (x & 0x8000000000000000ull) {
         SET_QC();
@@ -745,7 +787,8 @@ uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
 {
     if (x > 0xffffffffu) {
         SET_QC();
@@ -754,13 +797,14 @@ uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
 {
     if ((int64_t)x != (int32_t)x) {
         SET_QC();
-        return ((int64_t)x >> 63) ^ 0x7fffffff;
+        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
     }
-    return x;
+    return (uint32_t)x;
 }
 
 uint64_t HELPER(neon_widen_u8)(uint32_t x)
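On the deposit32() changes above: once these helpers return 64 bits, an int
expression such as "high << 16" would sign-extend into the upper half of the
returned i64, whereas deposit32() confines the result to the low 32 bits the
new comments declare significant. A minimal model of deposit32() (the real
one lives in include/qemu/bitops.h):

    #include <stdint.h>

    static inline uint32_t deposit32_model(uint32_t value, int start,
                                           int length, uint32_t fieldval)
    {
        uint32_t mask = (~0U >> (32 - length)) << start;
        return (value & ~mask) | ((fieldval << start) & mask);
    }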
a->rm : a->rn), + idx & (elements - 1), esz); + + w = (i << (esz + 3)) / 64; + o = (i << (esz + 3)) % 64; + if (o == 0) { + tcg_gen_mov_i64(tcg_res[w], tcg_ele); + } else { + tcg_gen_deposit_i64(tcg_res[w], tcg_res[w], tcg_ele, o, 8 << esz); + } + } + + for (int i = a->q; i >= 0; --i) { + write_vec_element(s, tcg_res[i], a->rd, i, MO_64); + } + clear_vec_high(s, a->q, a->rd); + return true; +} + +static int permute_load_uzp(int i, int part, int elements) +{ + return 2 * i + part; +} + +TRANS(UZP1, do_simd_permute, a, permute_load_uzp, 0) +TRANS(UZP2, do_simd_permute, a, permute_load_uzp, 1) + +static int permute_load_trn(int i, int part, int elements) +{ + return (i & 1) * elements + (i & ~1) + part; +} + +TRANS(TRN1, do_simd_permute, a, permute_load_trn, 0) +TRANS(TRN2, do_simd_permute, a, permute_load_trn, 1) + +static int permute_load_zip(int i, int part, int elements) +{ + return (i & 1) * elements + ((part * elements + i) >> 1); +} + +TRANS(ZIP1, do_simd_permute, a, permute_load_zip, 0) +TRANS(ZIP2, do_simd_permute, a, permute_load_zip, 1) + /* * Cryptographic AES, SHA, SHA512 */ @@ -6583,6 +6665,54 @@ static bool trans_FCSEL(DisasContext *s, arg_FCSEL *a) } /* + * Advanced SIMD Extract + */ + +static bool trans_EXT_d(DisasContext *s, arg_EXT_d *a) +{ + if (fp_access_check(s)) { + TCGv_i64 lo = read_fp_dreg(s, a->rn); + if (a->imm != 0) { + TCGv_i64 hi = read_fp_dreg(s, a->rm); + tcg_gen_extract2_i64(lo, lo, hi, a->imm * 8); + } + write_fp_dreg(s, a->rd, lo); + } + return true; +} + +static bool trans_EXT_q(DisasContext *s, arg_EXT_q *a) +{ + TCGv_i64 lo, hi; + int pos = (a->imm & 7) * 8; + int elt = a->imm >> 3; + + if (!fp_access_check(s)) { + return true; + } + + lo = tcg_temp_new_i64(); + hi = tcg_temp_new_i64(); + + read_vec_element(s, lo, a->rn, elt, MO_64); + elt++; + read_vec_element(s, hi, elt & 2 ? a->rm : a->rn, elt & 1, MO_64); + elt++; + + if (pos != 0) { + TCGv_i64 hh = tcg_temp_new_i64(); + tcg_gen_extract2_i64(lo, lo, hi, pos); + read_vec_element(s, hh, a->rm, elt & 1, MO_64); + tcg_gen_extract2_i64(hi, hi, hh, pos); + } + + write_vec_element(s, lo, a->rd, 0, MO_64); + write_vec_element(s, hi, a->rd, 1, MO_64); + clear_vec_high(s, true, a->rd); + return true; +} + +/* * Floating-point data-processing (3 source) */ @@ -6664,6 +6794,697 @@ TRANS(FNMADD, do_fmadd, a, true, true) TRANS(FMSUB, do_fmadd, a, false, true) TRANS(FNMSUB, do_fmadd, a, true, false) +/* + * Advanced SIMD Across Lanes + */ + +static bool do_int_reduction(DisasContext *s, arg_qrr_e *a, bool widen, + MemOp src_sign, NeonGenTwo64OpFn *fn) +{ + TCGv_i64 tcg_res, tcg_elt; + MemOp src_mop = a->esz | src_sign; + int elements = (a->q ? 16 : 8) >> a->esz; + + /* Reject MO_64, and MO_32 without Q: a minimum of 4 elements. 
*/ + if (elements < 4) { + return false; + } + if (!fp_access_check(s)) { + return true; + } + + tcg_res = tcg_temp_new_i64(); + tcg_elt = tcg_temp_new_i64(); + + read_vec_element(s, tcg_res, a->rn, 0, src_mop); + for (int i = 1; i < elements; i++) { + read_vec_element(s, tcg_elt, a->rn, i, src_mop); + fn(tcg_res, tcg_res, tcg_elt); + } + + tcg_gen_ext_i64(tcg_res, tcg_res, a->esz + widen); + write_fp_dreg(s, a->rd, tcg_res); + return true; +} + +TRANS(ADDV, do_int_reduction, a, false, 0, tcg_gen_add_i64) +TRANS(SADDLV, do_int_reduction, a, true, MO_SIGN, tcg_gen_add_i64) +TRANS(UADDLV, do_int_reduction, a, true, 0, tcg_gen_add_i64) +TRANS(SMAXV, do_int_reduction, a, false, MO_SIGN, tcg_gen_smax_i64) +TRANS(UMAXV, do_int_reduction, a, false, 0, tcg_gen_umax_i64) +TRANS(SMINV, do_int_reduction, a, false, MO_SIGN, tcg_gen_smin_i64) +TRANS(UMINV, do_int_reduction, a, false, 0, tcg_gen_umin_i64) + +/* + * do_fp_reduction helper + * + * This mirrors the Reduce() pseudocode in the ARM ARM. It is + * important for correct NaN propagation that we do these + * operations in exactly the order specified by the pseudocode. + * + * This is a recursive function. + */ +static TCGv_i32 do_reduction_op(DisasContext *s, int rn, MemOp esz, + int ebase, int ecount, TCGv_ptr fpst, + NeonGenTwoSingleOpFn *fn) +{ + if (ecount == 1) { + TCGv_i32 tcg_elem = tcg_temp_new_i32(); + read_vec_element_i32(s, tcg_elem, rn, ebase, esz); + return tcg_elem; + } else { + int half = ecount >> 1; + TCGv_i32 tcg_hi, tcg_lo, tcg_res; + + tcg_hi = do_reduction_op(s, rn, esz, ebase + half, half, fpst, fn); + tcg_lo = do_reduction_op(s, rn, esz, ebase, half, fpst, fn); + tcg_res = tcg_temp_new_i32(); + + fn(tcg_res, tcg_lo, tcg_hi, fpst); + return tcg_res; + } +} + +static bool do_fp_reduction(DisasContext *s, arg_qrr_e *a, + NeonGenTwoSingleOpFn *fn) +{ + if (fp_access_check(s)) { + MemOp esz = a->esz; + int elts = (a->q ? 16 : 8) >> esz; + TCGv_ptr fpst = fpstatus_ptr(esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + TCGv_i32 res = do_reduction_op(s, a->rn, esz, 0, elts, fpst, fn); + write_fp_sreg(s, a->rd, res); + } + return true; +} + +TRANS_FEAT(FMAXNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_maxnumh) +TRANS_FEAT(FMINNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_minnumh) +TRANS_FEAT(FMAXV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_maxh) +TRANS_FEAT(FMINV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_minh) + +TRANS(FMAXNMV_s, do_fp_reduction, a, gen_helper_vfp_maxnums) +TRANS(FMINNMV_s, do_fp_reduction, a, gen_helper_vfp_minnums) +TRANS(FMAXV_s, do_fp_reduction, a, gen_helper_vfp_maxs) +TRANS(FMINV_s, do_fp_reduction, a, gen_helper_vfp_mins) + +/* + * Floating-point Immediate + */ + +static bool trans_FMOVI_s(DisasContext *s, arg_FMOVI_s *a) +{ + switch (a->esz) { + case MO_32: + case MO_64: + break; + case MO_16: + if (!dc_isar_feature(aa64_fp16, s)) { + return false; + } + break; + default: + return false; + } + if (fp_access_check(s)) { + uint64_t imm = vfp_expand_imm(a->esz, a->imm); + write_fp_dreg(s, a->rd, tcg_constant_i64(imm)); + } + return true; +} + +/* + * Advanced SIMD Modified Immediate + */ + +static bool trans_FMOVI_v_h(DisasContext *s, arg_FMOVI_v_h *a) +{ + if (!dc_isar_feature(aa64_fp16, s)) { + return false; + } + if (fp_access_check(s)) { + tcg_gen_gvec_dup_imm(MO_16, vec_full_reg_offset(s, a->rd), + a->q ? 
16 : 8, vec_full_reg_size(s), + vfp_expand_imm(MO_16, a->abcdefgh)); + } + return true; +} + +static void gen_movi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c); +} + +static bool trans_Vimm(DisasContext *s, arg_Vimm *a) +{ + GVecGen2iFn *fn; + + /* Handle decode of cmode/op here between ORR/BIC/MOVI */ + if ((a->cmode & 1) && a->cmode < 12) { + /* For op=1, the imm will be inverted, so BIC becomes AND. */ + fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori; + } else { + /* There is one unallocated cmode/op combination in this space */ + if (a->cmode == 15 && a->op == 1 && a->q == 0) { + return false; + } + fn = gen_movi; + } + + if (fp_access_check(s)) { + uint64_t imm = asimd_imm_const(a->abcdefgh, a->cmode, a->op); + gen_gvec_fn2i(s, a->q, a->rd, a->rd, imm, fn, MO_64); + } + return true; +} + +/* + * Advanced SIMD Shift by Immediate + */ + +static bool do_vec_shift_imm(DisasContext *s, arg_qrri_e *a, GVecGen2iFn *fn) +{ + if (fp_access_check(s)) { + gen_gvec_fn2i(s, a->q, a->rd, a->rn, a->imm, fn, a->esz); + } + return true; +} + +TRANS(SSHR_v, do_vec_shift_imm, a, gen_gvec_sshr) +TRANS(USHR_v, do_vec_shift_imm, a, gen_gvec_ushr) +TRANS(SSRA_v, do_vec_shift_imm, a, gen_gvec_ssra) +TRANS(USRA_v, do_vec_shift_imm, a, gen_gvec_usra) +TRANS(SRSHR_v, do_vec_shift_imm, a, gen_gvec_srshr) +TRANS(URSHR_v, do_vec_shift_imm, a, gen_gvec_urshr) +TRANS(SRSRA_v, do_vec_shift_imm, a, gen_gvec_srsra) +TRANS(URSRA_v, do_vec_shift_imm, a, gen_gvec_ursra) +TRANS(SRI_v, do_vec_shift_imm, a, gen_gvec_sri) +TRANS(SHL_v, do_vec_shift_imm, a, tcg_gen_gvec_shli) +TRANS(SLI_v, do_vec_shift_imm, a, gen_gvec_sli); +TRANS(SQSHL_vi, do_vec_shift_imm, a, gen_neon_sqshli) +TRANS(UQSHL_vi, do_vec_shift_imm, a, gen_neon_uqshli) +TRANS(SQSHLU_vi, do_vec_shift_imm, a, gen_neon_sqshlui) + +static bool do_vec_shift_imm_wide(DisasContext *s, arg_qrri_e *a, bool is_u) +{ + TCGv_i64 tcg_rn, tcg_rd; + int esz = a->esz; + int esize; + + if (!fp_access_check(s)) { + return true; + } + + /* + * For the LL variants the store is larger than the load, + * so if rd == rn we would overwrite parts of our input. + * So load everything right now and use shifts in the main loop. 
+ */ + tcg_rd = tcg_temp_new_i64(); + tcg_rn = tcg_temp_new_i64(); + read_vec_element(s, tcg_rn, a->rn, a->q, MO_64); + + esize = 8 << esz; + for (int i = 0, elements = 8 >> esz; i < elements; i++) { + if (is_u) { + tcg_gen_extract_i64(tcg_rd, tcg_rn, i * esize, esize); + } else { + tcg_gen_sextract_i64(tcg_rd, tcg_rn, i * esize, esize); + } + tcg_gen_shli_i64(tcg_rd, tcg_rd, a->imm); + write_vec_element(s, tcg_rd, a->rd, i, esz + 1); + } + clear_vec_high(s, true, a->rd); + return true; +} + +TRANS(SSHLL_v, do_vec_shift_imm_wide, a, false) +TRANS(USHLL_v, do_vec_shift_imm_wide, a, true) + +static void gen_sshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 64); + tcg_gen_sari_i64(dst, src, MIN(shift, 63)); +} + +static void gen_ushr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 64); + if (shift == 64) { + tcg_gen_movi_i64(dst, 0); + } else { + tcg_gen_shri_i64(dst, src, shift); + } +} + +static void gen_ssra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + gen_sshr_d(src, src, shift); + tcg_gen_add_i64(dst, dst, src); +} + +static void gen_usra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + gen_ushr_d(src, src, shift); + tcg_gen_add_i64(dst, dst, src); +} + +static void gen_srshr_bhs(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 32); + if (shift) { + TCGv_i64 rnd = tcg_constant_i64(1ull << (shift - 1)); + tcg_gen_add_i64(dst, src, rnd); + tcg_gen_sari_i64(dst, dst, shift); + } else { + tcg_gen_mov_i64(dst, src); + } +} + +static void gen_urshr_bhs(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 32); + if (shift) { + TCGv_i64 rnd = tcg_constant_i64(1ull << (shift - 1)); + tcg_gen_add_i64(dst, src, rnd); + tcg_gen_shri_i64(dst, dst, shift); + } else { + tcg_gen_mov_i64(dst, src); + } +} + +static void gen_srshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 64); + if (shift == 0) { + tcg_gen_mov_i64(dst, src); + } else if (shift == 64) { + /* Extension of sign bit (0,-1) plus sign bit (0,1) is zero. */ + tcg_gen_movi_i64(dst, 0); + } else { + TCGv_i64 rnd = tcg_temp_new_i64(); + tcg_gen_extract_i64(rnd, src, shift - 1, 1); + tcg_gen_sari_i64(dst, src, shift); + tcg_gen_add_i64(dst, dst, rnd); + } +} + +static void gen_urshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + assert(shift >= 0 && shift <= 64); + if (shift == 0) { + tcg_gen_mov_i64(dst, src); + } else if (shift == 64) { + /* Rounding will propagate bit 63 into bit 64. */ + tcg_gen_shri_i64(dst, src, 63); + } else { + TCGv_i64 rnd = tcg_temp_new_i64(); + tcg_gen_extract_i64(rnd, src, shift - 1, 1); + tcg_gen_shri_i64(dst, src, shift); + tcg_gen_add_i64(dst, dst, rnd); + } +} + +static void gen_srsra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + gen_srshr_d(src, src, shift); + tcg_gen_add_i64(dst, dst, src); +} + +static void gen_ursra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + gen_urshr_d(src, src, shift); + tcg_gen_add_i64(dst, dst, src); +} + +static void gen_sri_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + /* If shift is 64, dst is unchanged. 
*/ + if (shift != 64) { + tcg_gen_shri_i64(src, src, shift); + tcg_gen_deposit_i64(dst, dst, src, 0, 64 - shift); + } +} + +static void gen_sli_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift) +{ + tcg_gen_deposit_i64(dst, dst, src, shift, 64 - shift); +} + +static bool do_vec_shift_imm_narrow(DisasContext *s, arg_qrri_e *a, + WideShiftImmFn * const fns[3], MemOp sign) +{ + TCGv_i64 tcg_rn, tcg_rd; + int esz = a->esz; + int esize; + WideShiftImmFn *fn; + + tcg_debug_assert(esz >= MO_8 && esz <= MO_32); + + if (!fp_access_check(s)) { + return true; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_gen_movi_i64(tcg_rd, 0); + + fn = fns[esz]; + esize = 8 << esz; + for (int i = 0, elements = 8 >> esz; i < elements; i++) { + read_vec_element(s, tcg_rn, a->rn, i, (esz + 1) | sign); + fn(tcg_rn, tcg_rn, a->imm); + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, esize * i, esize); + } + + write_vec_element(s, tcg_rd, a->rd, a->q, MO_64); + clear_vec_high(s, a->q, a->rd); + return true; +} + +static void gen_sqshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_sari_i64(d, s, i); + tcg_gen_ext16u_i64(d, d); + gen_helper_neon_narrow_sat_s8(d, tcg_env, d); +} + +static void gen_sqshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_sari_i64(d, s, i); + tcg_gen_ext32u_i64(d, d); + gen_helper_neon_narrow_sat_s16(d, tcg_env, d); +} + +static void gen_sqshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_sshr_d(d, s, i); + gen_helper_neon_narrow_sat_s32(d, tcg_env, d); +} + +static void gen_uqshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_shri_i64(d, s, i); + gen_helper_neon_narrow_sat_u8(d, tcg_env, d); +} + +static void gen_uqshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_shri_i64(d, s, i); + gen_helper_neon_narrow_sat_u16(d, tcg_env, d); +} + +static void gen_uqshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_ushr_d(d, s, i); + gen_helper_neon_narrow_sat_u32(d, tcg_env, d); +} + +static void gen_sqshrun_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_sari_i64(d, s, i); + tcg_gen_ext16u_i64(d, d); + gen_helper_neon_unarrow_sat8(d, tcg_env, d); +} + +static void gen_sqshrun_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + tcg_gen_sari_i64(d, s, i); + tcg_gen_ext32u_i64(d, d); + gen_helper_neon_unarrow_sat16(d, tcg_env, d); +} + +static void gen_sqshrun_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_sshr_d(d, s, i); + gen_helper_neon_unarrow_sat32(d, tcg_env, d); +} + +static void gen_sqrshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_bhs(d, s, i); + tcg_gen_ext16u_i64(d, d); + gen_helper_neon_narrow_sat_s8(d, tcg_env, d); +} + +static void gen_sqrshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_bhs(d, s, i); + tcg_gen_ext32u_i64(d, d); + gen_helper_neon_narrow_sat_s16(d, tcg_env, d); +} + +static void gen_sqrshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_d(d, s, i); + gen_helper_neon_narrow_sat_s32(d, tcg_env, d); +} + +static void gen_uqrshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_urshr_bhs(d, s, i); + gen_helper_neon_narrow_sat_u8(d, tcg_env, d); +} + +static void gen_uqrshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_urshr_bhs(d, s, i); + gen_helper_neon_narrow_sat_u16(d, tcg_env, d); +} + +static void gen_uqrshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_urshr_d(d, s, i); + gen_helper_neon_narrow_sat_u32(d, tcg_env, d); +} + +static void gen_sqrshrun_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_bhs(d, s, i); + tcg_gen_ext16u_i64(d, d); + gen_helper_neon_unarrow_sat8(d, tcg_env, d); +} + +static void 
gen_sqrshrun_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_bhs(d, s, i); + tcg_gen_ext32u_i64(d, d); + gen_helper_neon_unarrow_sat16(d, tcg_env, d); +} + +static void gen_sqrshrun_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_srshr_d(d, s, i); + gen_helper_neon_unarrow_sat32(d, tcg_env, d); +} + +static WideShiftImmFn * const shrn_fns[] = { + tcg_gen_shri_i64, + tcg_gen_shri_i64, + gen_ushr_d, +}; +TRANS(SHRN_v, do_vec_shift_imm_narrow, a, shrn_fns, 0) + +static WideShiftImmFn * const rshrn_fns[] = { + gen_urshr_bhs, + gen_urshr_bhs, + gen_urshr_d, +}; +TRANS(RSHRN_v, do_vec_shift_imm_narrow, a, rshrn_fns, 0) + +static WideShiftImmFn * const sqshrn_fns[] = { + gen_sqshrn_b, + gen_sqshrn_h, + gen_sqshrn_s, +}; +TRANS(SQSHRN_v, do_vec_shift_imm_narrow, a, sqshrn_fns, MO_SIGN) + +static WideShiftImmFn * const uqshrn_fns[] = { + gen_uqshrn_b, + gen_uqshrn_h, + gen_uqshrn_s, +}; +TRANS(UQSHRN_v, do_vec_shift_imm_narrow, a, uqshrn_fns, 0) + +static WideShiftImmFn * const sqshrun_fns[] = { + gen_sqshrun_b, + gen_sqshrun_h, + gen_sqshrun_s, +}; +TRANS(SQSHRUN_v, do_vec_shift_imm_narrow, a, sqshrun_fns, MO_SIGN) + +static WideShiftImmFn * const sqrshrn_fns[] = { + gen_sqrshrn_b, + gen_sqrshrn_h, + gen_sqrshrn_s, +}; +TRANS(SQRSHRN_v, do_vec_shift_imm_narrow, a, sqrshrn_fns, MO_SIGN) + +static WideShiftImmFn * const uqrshrn_fns[] = { + gen_uqrshrn_b, + gen_uqrshrn_h, + gen_uqrshrn_s, +}; +TRANS(UQRSHRN_v, do_vec_shift_imm_narrow, a, uqrshrn_fns, 0) + +static WideShiftImmFn * const sqrshrun_fns[] = { + gen_sqrshrun_b, + gen_sqrshrun_h, + gen_sqrshrun_s, +}; +TRANS(SQRSHRUN_v, do_vec_shift_imm_narrow, a, sqrshrun_fns, MO_SIGN) + +/* + * Advanced SIMD Scalar Shift by Immediate + */ + +static bool do_scalar_shift_imm(DisasContext *s, arg_rri_e *a, + WideShiftImmFn *fn, bool accumulate, + MemOp sign) +{ + if (fp_access_check(s)) { + TCGv_i64 rd = tcg_temp_new_i64(); + TCGv_i64 rn = tcg_temp_new_i64(); + + read_vec_element(s, rn, a->rn, 0, a->esz | sign); + if (accumulate) { + read_vec_element(s, rd, a->rd, 0, a->esz | sign); + } + fn(rd, rn, a->imm); + write_fp_dreg(s, a->rd, rd); + } + return true; +} + +TRANS(SSHR_s, do_scalar_shift_imm, a, gen_sshr_d, false, 0) +TRANS(USHR_s, do_scalar_shift_imm, a, gen_ushr_d, false, 0) +TRANS(SSRA_s, do_scalar_shift_imm, a, gen_ssra_d, true, 0) +TRANS(USRA_s, do_scalar_shift_imm, a, gen_usra_d, true, 0) +TRANS(SRSHR_s, do_scalar_shift_imm, a, gen_srshr_d, false, 0) +TRANS(URSHR_s, do_scalar_shift_imm, a, gen_urshr_d, false, 0) +TRANS(SRSRA_s, do_scalar_shift_imm, a, gen_srsra_d, true, 0) +TRANS(URSRA_s, do_scalar_shift_imm, a, gen_ursra_d, true, 0) +TRANS(SRI_s, do_scalar_shift_imm, a, gen_sri_d, true, 0) + +TRANS(SHL_s, do_scalar_shift_imm, a, tcg_gen_shli_i64, false, 0) +TRANS(SLI_s, do_scalar_shift_imm, a, gen_sli_d, true, 0) + +static void trunc_i64_env_imm(TCGv_i64 d, TCGv_i64 s, int64_t i, + NeonGenTwoOpEnvFn *fn) +{ + TCGv_i32 t = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t, s); + fn(t, tcg_env, t, tcg_constant_i32(i)); + tcg_gen_extu_i32_i64(d, t); +} + +static void gen_sqshli_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s8); +} + +static void gen_sqshli_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s16); +} + +static void gen_sqshli_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s32); +} + +static void gen_sqshli_d(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_helper_neon_qshl_s64(d, tcg_env, s, tcg_constant_i64(i)); +} + 
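For readers new to decodetree: each TRANS() line above is glue between the generated decoder and a shared bool-returning handler. Taking QEMU's TRANS macro at face value, a line such as TRANS(SSHR_s, do_scalar_shift_imm, a, gen_sshr_d, false, 0) expands to approximately this (a sketch, not a verbatim preprocessor expansion):

    static bool trans_SSHR_s(DisasContext *s, arg_SSHR_s *a)
    {
        return do_scalar_shift_imm(s, a, gen_sshr_d, false, 0);
    }

The generated decoder extracts the instruction fields into the arg struct and calls trans_SSHR_s, so per-insn boilerplate collapses to one line per pattern.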
+static void gen_uqshli_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u8); +} + +static void gen_uqshli_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u16); +} + +static void gen_uqshli_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u32); +} + +static void gen_uqshli_d(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_helper_neon_qshl_u64(d, tcg_env, s, tcg_constant_i64(i)); +} + +static void gen_sqshlui_b(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s8); +} + +static void gen_sqshlui_h(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s16); +} + +static void gen_sqshlui_s(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s32); +} + +static void gen_sqshlui_d(TCGv_i64 d, TCGv_i64 s, int64_t i) +{ + gen_helper_neon_qshlu_s64(d, tcg_env, s, tcg_constant_i64(i)); +} + +static WideShiftImmFn * const f_scalar_sqshli[] = { + gen_sqshli_b, gen_sqshli_h, gen_sqshli_s, gen_sqshli_d +}; + +static WideShiftImmFn * const f_scalar_uqshli[] = { + gen_uqshli_b, gen_uqshli_h, gen_uqshli_s, gen_uqshli_d +}; + +static WideShiftImmFn * const f_scalar_sqshlui[] = { + gen_sqshlui_b, gen_sqshlui_h, gen_sqshlui_s, gen_sqshlui_d +}; + +/* Note that the helpers sign-extend their inputs, so don't do it here. */ +TRANS(SQSHL_si, do_scalar_shift_imm, a, f_scalar_sqshli[a->esz], false, 0) +TRANS(UQSHL_si, do_scalar_shift_imm, a, f_scalar_uqshli[a->esz], false, 0) +TRANS(SQSHLU_si, do_scalar_shift_imm, a, f_scalar_sqshlui[a->esz], false, 0) + +static bool do_scalar_shift_imm_narrow(DisasContext *s, arg_rri_e *a, + WideShiftImmFn * const fns[3], + MemOp sign, bool zext) +{ + MemOp esz = a->esz; + + tcg_debug_assert(esz >= MO_8 && esz <= MO_32); + + if (fp_access_check(s)) { + TCGv_i64 rd = tcg_temp_new_i64(); + TCGv_i64 rn = tcg_temp_new_i64(); + + read_vec_element(s, rn, a->rn, 0, (esz + 1) | sign); + fns[esz](rd, rn, a->imm); + if (zext) { + tcg_gen_ext_i64(rd, rd, esz); + } + write_fp_dreg(s, a->rd, rd); + } + return true; +} + +TRANS(SQSHRN_si, do_scalar_shift_imm_narrow, a, sqshrn_fns, MO_SIGN, true) +TRANS(SQRSHRN_si, do_scalar_shift_imm_narrow, a, sqrshrn_fns, MO_SIGN, true) +TRANS(UQSHRN_si, do_scalar_shift_imm_narrow, a, uqshrn_fns, 0, false) +TRANS(UQRSHRN_si, do_scalar_shift_imm_narrow, a, uqrshrn_fns, 0, false) +TRANS(SQSHRUN_si, do_scalar_shift_imm_narrow, a, sqshrun_fns, MO_SIGN, false) +TRANS(SQRSHRUN_si, do_scalar_shift_imm_narrow, a, sqrshrun_fns, MO_SIGN, false) + /* Shift a TCGv src by TCGv shift_amount, put result in dst. 
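do_scalar_shift_imm_narrow above reads one widened element, shifts and saturates it through fns[esz], then optionally zero-extends before the write. A standalone model of the esz == MO_8 SQSHRN case (names invented; the real path goes through gen_helper_neon_narrow_sat_s8 and also sets the QC flag, which this sketch omits):

    #include <stdint.h>

    static uint8_t sqshrn_b_model(int16_t src, int shift)
    {
        int32_t v = src >> shift;            /* arithmetic shift right */
        if (v > INT8_MAX) { v = INT8_MAX; }  /* saturate high */
        if (v < INT8_MIN) { v = INT8_MIN; }  /* saturate low  */
        return (uint8_t)v;  /* zero-extended into the 64-bit Vd, as the
                               'zext' flag arranges for signed results */
    }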
* Note that it is the caller's responsibility to ensure that the * shift amount is in range (ie 0..31 or 0..63) and provide the ARM @@ -8401,53 +9222,6 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn) } } -/* Floating point immediate - * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0 - * +---+---+---+-----------+------+---+------------+-------+------+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd | - * +---+---+---+-----------+------+---+------------+-------+------+------+ - */ -static void disas_fp_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int imm5 = extract32(insn, 5, 5); - int imm8 = extract32(insn, 13, 8); - int type = extract32(insn, 22, 2); - int mos = extract32(insn, 29, 3); - uint64_t imm; - MemOp sz; - - if (mos || imm5) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - sz = MO_32; - break; - case 1: - sz = MO_64; - break; - case 3: - sz = MO_16; - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - imm = vfp_expand_imm(sz, imm8); - write_fp_dreg(s, rd, tcg_constant_i64(imm)); -} - /* Handle floating point <=> fixed point conversions. Note that we can * also deal with fp <=> integer conversions as a special case (scale == 64) * OPTME: consider handling that special case specially or at least skipping @@ -8867,7 +9641,7 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn) switch (ctz32(extract32(insn, 12, 4))) { case 0: /* [15:12] == xxx1 */ /* Floating point immediate */ - disas_fp_imm(s, insn); + unallocated_encoding(s); /* in decodetree */ break; case 1: /* [15:12] == xx10 */ /* Floating point compare */ @@ -8890,874 +9664,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn) } } -static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right, - int pos) -{ - /* Extract 64 bits from the middle of two concatenated 64 bit - * vector register slices left:right. The extracted bits start - * at 'pos' bits into the right (least significant) side. - * We return the result in tcg_right, and guarantee not to - * trash tcg_left. - */ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - assert(pos > 0 && pos < 64); - - tcg_gen_shri_i64(tcg_right, tcg_right, pos); - tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos); - tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp); -} - -/* EXT - * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0 - * +---+---+-------------+-----+---+------+---+------+---+------+------+ - * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd | - * +---+---+-------------+-----+---+------+---+------+---+------+------+ - */ -static void disas_simd_ext(DisasContext *s, uint32_t insn) -{ - int is_q = extract32(insn, 30, 1); - int op2 = extract32(insn, 22, 2); - int imm4 = extract32(insn, 11, 4); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int pos = imm4 << 3; - TCGv_i64 tcg_resl, tcg_resh; - - if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_resh = tcg_temp_new_i64(); - tcg_resl = tcg_temp_new_i64(); - - /* Vd gets bits starting at pos bits into Vm:Vn. This is - * either extracting 128 bits from a 128:128 concatenation, or - * extracting 64 bits from a 64:64 concatenation. 
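The deleted do_ext64 above is a 64-bit funnel shift built from TCG primitives. As plain C, valid for 0 < pos < 64:

    #include <stdint.h>

    static uint64_t ext64(uint64_t left, uint64_t right, int pos)
    {
        /* take the top (64 - pos) bits of 'right' and fill the rest
         * from the bottom of 'left' */
        return (right >> pos) | (left << (64 - pos));
    }
    /* ext64(0x1111222233334444, 0xaaaabbbbccccdddd, 8)
     *     == 0x44aaaabbbbccccdd */

EXT applies this once for the 64-bit case and chains it across the four 64-bit slices of Vm:Vn for the 128-bit case.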
- */ - if (!is_q) { - read_vec_element(s, tcg_resl, rn, 0, MO_64); - if (pos != 0) { - read_vec_element(s, tcg_resh, rm, 0, MO_64); - do_ext64(s, tcg_resh, tcg_resl, pos); - } - } else { - TCGv_i64 tcg_hh; - typedef struct { - int reg; - int elt; - } EltPosns; - EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} }; - EltPosns *elt = eltposns; - - if (pos >= 64) { - elt++; - pos -= 64; - } - - read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64); - elt++; - read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64); - elt++; - if (pos != 0) { - do_ext64(s, tcg_resh, tcg_resl, pos); - tcg_hh = tcg_temp_new_i64(); - read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64); - do_ext64(s, tcg_hh, tcg_resh, pos); - } - } - - write_vec_element(s, tcg_resl, rd, 0, MO_64); - if (is_q) { - write_vec_element(s, tcg_resh, rd, 1, MO_64); - } - clear_vec_high(s, is_q, rd); -} - -/* TBL/TBX - * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0 - * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ - * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd | - * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ - */ -static void disas_simd_tb(DisasContext *s, uint32_t insn) -{ - int op2 = extract32(insn, 22, 2); - int is_q = extract32(insn, 30, 1); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int is_tbx = extract32(insn, 12, 1); - int len = (extract32(insn, 13, 2) + 1) * 16; - - if (op2 != 0) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rm), tcg_env, - is_q ? 16 : 8, vec_full_reg_size(s), - (len << 6) | (is_tbx << 5) | rn, - gen_helper_simd_tblx); -} - -/* ZIP/UZP/TRN - * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 - * +---+---+-------------+------+---+------+---+------------------+------+ - * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd | - * +---+---+-------------+------+---+------+---+------------------+------+ - */ -static void disas_simd_zip_trn(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - /* opc field bits [1:0] indicate ZIP/UZP/TRN; - * bit 2 indicates 1 vs 2 variant of the insn. - */ - int opcode = extract32(insn, 12, 2); - bool part = extract32(insn, 14, 1); - bool is_q = extract32(insn, 30, 1); - int esize = 8 << size; - int i; - int datasize = is_q ? 128 : 64; - int elements = datasize / esize; - TCGv_i64 tcg_res[2], tcg_ele; - - if (opcode == 0 || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_res[0] = tcg_temp_new_i64(); - tcg_res[1] = is_q ? 
tcg_temp_new_i64() : NULL; - tcg_ele = tcg_temp_new_i64(); - - for (i = 0; i < elements; i++) { - int o, w; - - switch (opcode) { - case 1: /* UZP1/2 */ - { - int midpoint = elements / 2; - if (i < midpoint) { - read_vec_element(s, tcg_ele, rn, 2 * i + part, size); - } else { - read_vec_element(s, tcg_ele, rm, - 2 * (i - midpoint) + part, size); - } - break; - } - case 2: /* TRN1/2 */ - if (i & 1) { - read_vec_element(s, tcg_ele, rm, (i & ~1) + part, size); - } else { - read_vec_element(s, tcg_ele, rn, (i & ~1) + part, size); - } - break; - case 3: /* ZIP1/2 */ - { - int base = part * elements / 2; - if (i & 1) { - read_vec_element(s, tcg_ele, rm, base + (i >> 1), size); - } else { - read_vec_element(s, tcg_ele, rn, base + (i >> 1), size); - } - break; - } - default: - g_assert_not_reached(); - } - - w = (i * esize) / 64; - o = (i * esize) % 64; - if (o == 0) { - tcg_gen_mov_i64(tcg_res[w], tcg_ele); - } else { - tcg_gen_shli_i64(tcg_ele, tcg_ele, o); - tcg_gen_or_i64(tcg_res[w], tcg_res[w], tcg_ele); - } - } - - for (i = 0; i <= is_q; ++i) { - write_vec_element(s, tcg_res[i], rd, i, MO_64); - } - clear_vec_high(s, is_q, rd); -} - -/* - * do_reduction_op helper - * - * This mirrors the Reduce() pseudocode in the ARM ARM. It is - * important for correct NaN propagation that we do these - * operations in exactly the order specified by the pseudocode. - * - * This is a recursive function, TCG temps should be freed by the - * calling function once it is done with the values. - */ -static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn, - int esize, int size, int vmap, TCGv_ptr fpst) -{ - if (esize == size) { - int element; - MemOp msize = esize == 16 ? MO_16 : MO_32; - TCGv_i32 tcg_elem; - - /* We should have one register left here */ - assert(ctpop8(vmap) == 1); - element = ctz32(vmap); - assert(element < 8); - - tcg_elem = tcg_temp_new_i32(); - read_vec_element_i32(s, tcg_elem, rn, element, msize); - return tcg_elem; - } else { - int bits = size / 2; - int shift = ctpop8(vmap) / 2; - int vmap_lo = (vmap >> shift) & vmap; - int vmap_hi = (vmap & ~vmap_lo); - TCGv_i32 tcg_hi, tcg_lo, tcg_res; - - tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst); - tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst); - tcg_res = tcg_temp_new_i32(); - - switch (fpopcode) { - case 0x0c: /* fmaxnmv half-precision */ - gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x0f: /* fmaxv half-precision */ - gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x1c: /* fminnmv half-precision */ - gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x1f: /* fminv half-precision */ - gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x2c: /* fmaxnmv */ - gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x2f: /* fmaxv */ - gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x3c: /* fminnmv */ - gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x3f: /* fminv */ - gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst); - break; - default: - g_assert_not_reached(); - } - return tcg_res; - } -} - -/* AdvSIMD across lanes - * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - */ -static void 
disas_simd_across_lanes(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - bool is_q = extract32(insn, 30, 1); - bool is_u = extract32(insn, 29, 1); - bool is_fp = false; - bool is_min = false; - int esize; - int elements; - int i; - TCGv_i64 tcg_res, tcg_elt; - - switch (opcode) { - case 0x1b: /* ADDV */ - if (is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x3: /* SADDLV, UADDLV */ - case 0xa: /* SMAXV, UMAXV */ - case 0x1a: /* SMINV, UMINV */ - if (size == 3 || (size == 2 && !is_q)) { - unallocated_encoding(s); - return; - } - break; - case 0xc: /* FMAXNMV, FMINNMV */ - case 0xf: /* FMAXV, FMINV */ - /* Bit 1 of size field encodes min vs max and the actual size - * depends on the encoding of the U bit. If not set (and FP16 - * enabled) then we do half-precision float instead of single - * precision. - */ - is_min = extract32(size, 1, 1); - is_fp = true; - if (!is_u && dc_isar_feature(aa64_fp16, s)) { - size = 1; - } else if (!is_u || !is_q || extract32(size, 0, 1)) { - unallocated_encoding(s); - return; - } else { - size = 2; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - esize = 8 << size; - elements = (is_q ? 128 : 64) / esize; - - tcg_res = tcg_temp_new_i64(); - tcg_elt = tcg_temp_new_i64(); - - /* These instructions operate across all lanes of a vector - * to produce a single result. We can guarantee that a 64 - * bit intermediate is sufficient: - * + for [US]ADDLV the maximum element size is 32 bits, and - * the result type is 64 bits - * + for FMAX*V, FMIN*V, ADDV the intermediate type is the - * same as the element size, which is 32 bits at most - * For the integer operations we can choose to work at 64 - * or 32 bits and truncate at the end; for simplicity - * we use 64 bits always. The floating point - * ops do require 32 bit intermediates, though. - */ - if (!is_fp) { - read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN)); - - for (i = 1; i < elements; i++) { - read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN)); - - switch (opcode) { - case 0x03: /* SADDLV / UADDLV */ - case 0x1b: /* ADDV */ - tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt); - break; - case 0x0a: /* SMAXV / UMAXV */ - if (is_u) { - tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt); - } else { - tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt); - } - break; - case 0x1a: /* SMINV / UMINV */ - if (is_u) { - tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt); - } else { - tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt); - } - break; - default: - g_assert_not_reached(); - } - - } - } else { - /* Floating point vector reduction ops which work across 32 - * bit (single) or 16 bit (half-precision) intermediates. - * Note that correct NaN propagation requires that we do these - * operations in exactly the order specified by the pseudocode. - */ - TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - int fpopcode = opcode | is_min << 4 | is_u << 5; - int vmap = (1 << elements) - 1; - TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize, - (is_q ? 
128 : 64), vmap, fpst); - tcg_gen_extu_i32_i64(tcg_res, tcg_res32); - } - - /* Now truncate the result to the width required for the final output */ - if (opcode == 0x03) { - /* SADDLV, UADDLV: result is 2*esize */ - size++; - } - - switch (size) { - case 0: - tcg_gen_ext8u_i64(tcg_res, tcg_res); - break; - case 1: - tcg_gen_ext16u_i64(tcg_res, tcg_res); - break; - case 2: - tcg_gen_ext32u_i64(tcg_res, tcg_res); - break; - case 3: - break; - default: - g_assert_not_reached(); - } - - write_fp_dreg(s, rd, tcg_res); -} - -/* AdvSIMD modified immediate - * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0 - * +---+---+----+---------------------+-----+-------+----+---+-------+------+ - * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd | - * +---+---+----+---------------------+-----+-------+----+---+-------+------+ - * - * There are a number of operations that can be carried out here: - * MOVI - move (shifted) imm into register - * MVNI - move inverted (shifted) imm into register - * ORR - bitwise OR of (shifted) imm with register - * BIC - bitwise clear of (shifted) imm with register - * With ARMv8.2 we also have: - * FMOV half-precision - */ -static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int cmode = extract32(insn, 12, 4); - int o2 = extract32(insn, 11, 1); - uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5); - bool is_neg = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - uint64_t imm = 0; - - if (o2) { - if (cmode != 0xf || is_neg) { - unallocated_encoding(s); - return; - } - /* FMOV (vector, immediate) - half-precision */ - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - imm = vfp_expand_imm(MO_16, abcdefgh); - /* now duplicate across the lanes */ - imm = dup_const(MO_16, imm); - } else { - if (cmode == 0xf && is_neg && !is_q) { - unallocated_encoding(s); - return; - } - imm = asimd_imm_const(abcdefgh, cmode, is_neg); - } - - if (!fp_access_check(s)) { - return; - } - - if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { - /* MOVI or MVNI, with MVNI negation handled above. */ - tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8, - vec_full_reg_size(s), imm); - } else { - /* ORR or BIC, with BIC negation to AND handled above. */ - if (is_neg) { - gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); - } else { - gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); - } - } -} - -/* - * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate) - * - * This code is handles the common shifting code and is used by both - * the vector and scalar code. 
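The modified-immediate forms deleted above all funnel through asimd_imm_const, which expands the 8-bit abcdefgh field according to cmode/op. One case as a model (function name and scope invented; the real helper covers every cmode/op combination): for cmode = 0xe, op = 1, each bit of abcdefgh selects a whole 0xff byte of the 64-bit immediate:

    #include <stdint.h>

    static uint64_t expand_bits_to_bytes(uint8_t abcdefgh)
    {
        uint64_t imm = 0;
        for (int b = 0; b < 8; b++) {
            if (abcdefgh & (1u << b)) {
                imm |= 0xffull << (b * 8);   /* replicate bit b across byte b */
            }
        }
        return imm;
    }
    /* expand_bits_to_bytes(0xa5) == 0xff00ff0000ff00ff */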
- */ -static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - TCGv_i64 tcg_rnd, bool accumulate, - bool is_u, int size, int shift) -{ - bool extended_result = false; - bool round = tcg_rnd != NULL; - int ext_lshift = 0; - TCGv_i64 tcg_src_hi; - - if (round && size == 3) { - extended_result = true; - ext_lshift = 64 - shift; - tcg_src_hi = tcg_temp_new_i64(); - } else if (shift == 64) { - if (!accumulate && is_u) { - /* result is zero */ - tcg_gen_movi_i64(tcg_res, 0); - return; - } - } - - /* Deal with the rounding step */ - if (round) { - if (extended_result) { - TCGv_i64 tcg_zero = tcg_constant_i64(0); - if (!is_u) { - /* take care of sign extending tcg_res */ - tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63); - tcg_gen_add2_i64(tcg_src, tcg_src_hi, - tcg_src, tcg_src_hi, - tcg_rnd, tcg_zero); - } else { - tcg_gen_add2_i64(tcg_src, tcg_src_hi, - tcg_src, tcg_zero, - tcg_rnd, tcg_zero); - } - } else { - tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd); - } - } - - /* Now do the shift right */ - if (round && extended_result) { - /* extended case, >64 bit precision required */ - if (ext_lshift == 0) { - /* special case, only high bits matter */ - tcg_gen_mov_i64(tcg_src, tcg_src_hi); - } else { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift); - tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi); - } - } else { - if (is_u) { - if (shift == 64) { - /* essentially shifting in 64 zeros */ - tcg_gen_movi_i64(tcg_src, 0); - } else { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - } - } else { - if (shift == 64) { - /* effectively extending the sign-bit */ - tcg_gen_sari_i64(tcg_src, tcg_src, 63); - } else { - tcg_gen_sari_i64(tcg_src, tcg_src, shift); - } - } - } - - if (accumulate) { - tcg_gen_add_i64(tcg_res, tcg_res, tcg_src); - } else { - tcg_gen_mov_i64(tcg_res, tcg_src); - } -} - -/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ -static void handle_scalar_simd_shri(DisasContext *s, - bool is_u, int immh, int immb, - int opcode, int rn, int rd) -{ - const int size = 3; - int immhb = immh << 3 | immb; - int shift = 2 * (8 << size) - immhb; - bool accumulate = false; - bool round = false; - bool insert = false; - TCGv_i64 tcg_rn; - TCGv_i64 tcg_rd; - TCGv_i64 tcg_round; - - if (!extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x02: /* SSRA / USRA (accumulate) */ - accumulate = true; - break; - case 0x04: /* SRSHR / URSHR (rounding) */ - round = true; - break; - case 0x06: /* SRSRA / URSRA (accum + rounding) */ - accumulate = round = true; - break; - case 0x08: /* SRI */ - insert = true; - break; - } - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - tcg_rn = read_fp_dreg(s, rn); - tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - - if (insert) { - /* shift count same as element size is valid but does nothing; - * special case to avoid potential shift by 64. 
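The extended_result path in handle_shri_with_rndacc exists because adding the rounding constant to a 64-bit value can carry out of bit 63, so a 65-bit intermediate is needed; tcg_gen_add2_i64 supplies the high word. A model using the unsigned __int128 GCC/Clang extension (illustration only, 1 <= shift <= 63):

    #include <stdint.h>

    static uint64_t urshr64(uint64_t x, int shift)
    {
        unsigned __int128 t = (unsigned __int128)x
                            + ((unsigned __int128)1 << (shift - 1));
        return (uint64_t)(t >> shift);
    }
    /* urshr64(~0ull, 1) == 0x8000000000000000: the carry out of bit 63
     * must survive the add, which is why the TCG version tracks a
     * separate high word for size == 3. */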
- */ - int esize = 8 << size; - if (shift != esize) { - tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); - } - } else { - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); - } - - write_fp_dreg(s, rd, tcg_rd); -} - -/* SHL/SLI - Scalar shift left */ -static void handle_scalar_simd_shli(DisasContext *s, bool insert, - int immh, int immb, int opcode, - int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - TCGv_i64 tcg_rn; - TCGv_i64 tcg_rd; - - if (!extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_rn = read_fp_dreg(s, rn); - tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - - if (insert) { - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); - } else { - tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); - } - - write_fp_dreg(s, rd, tcg_rd); -} - -/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with - * (signed/unsigned) narrowing */ -static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, - bool is_u_shift, bool is_u_narrow, - int immh, int immb, int opcode, - int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int esize = 8 << size; - int shift = (2 * esize) - immhb; - int elements = is_scalar ? 1 : (64 / esize); - bool round = extract32(opcode, 0, 1); - MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN); - TCGv_i64 tcg_rn, tcg_rd, tcg_round; - TCGv_i32 tcg_rd_narrowed; - TCGv_i64 tcg_final; - - static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = { - { gen_helper_neon_narrow_sat_s8, - gen_helper_neon_unarrow_sat8 }, - { gen_helper_neon_narrow_sat_s16, - gen_helper_neon_unarrow_sat16 }, - { gen_helper_neon_narrow_sat_s32, - gen_helper_neon_unarrow_sat32 }, - { NULL, NULL }, - }; - static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = { - gen_helper_neon_narrow_sat_u8, - gen_helper_neon_narrow_sat_u16, - gen_helper_neon_narrow_sat_u32, - NULL - }; - NeonGenNarrowEnvFn *narrowfn; - - int i; - - assert(size < 4); - - if (extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (is_u_shift) { - narrowfn = unsigned_narrow_fns[size]; - } else { - narrowfn = signed_narrow_fns[size][is_u_narrow ? 
1 : 0]; - } - - tcg_rn = tcg_temp_new_i64(); - tcg_rd = tcg_temp_new_i64(); - tcg_rd_narrowed = tcg_temp_new_i32(); - tcg_final = tcg_temp_new_i64(); - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, ldop); - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - false, is_u_shift, size+1, shift); - narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd); - tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); - if (i == 0) { - tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize); - } else { - tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); - } - } - - if (!is_q) { - write_vec_element(s, tcg_final, rd, 0, MO_64); - } else { - write_vec_element(s, tcg_final, rd, 1, MO_64); - } - clear_vec_high(s, is_q, rd); -} - -/* SQSHLU, UQSHL, SQSHL: saturating left shifts */ -static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q, - bool src_unsigned, bool dst_unsigned, - int immh, int immb, int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int shift = immhb - (8 << size); - int pass; - - assert(immh != 0); - assert(!(scalar && is_q)); - - if (!scalar) { - if (!is_q && extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - /* Since we use the variable-shift helpers we must - * replicate the shift count into each element of - * the tcg_shift value. - */ - switch (size) { - case 0: - shift |= shift << 8; - /* fall through */ - case 1: - shift |= shift << 16; - break; - case 2: - case 3: - break; - default: - g_assert_not_reached(); - } - } - - if (!fp_access_check(s)) { - return; - } - - if (size == 3) { - TCGv_i64 tcg_shift = tcg_constant_i64(shift); - static NeonGenTwo64OpEnvFn * const fns[2][2] = { - { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 }, - { NULL, gen_helper_neon_qshl_u64 }, - }; - NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned]; - int maxpass = is_q ? 2 : 1; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - genfn(tcg_op, tcg_env, tcg_op, tcg_shift); - write_vec_element(s, tcg_op, rd, pass, MO_64); - } - clear_vec_high(s, is_q, rd); - } else { - TCGv_i32 tcg_shift = tcg_constant_i32(shift); - static NeonGenTwoOpEnvFn * const fns[2][2][3] = { - { - { gen_helper_neon_qshl_s8, - gen_helper_neon_qshl_s16, - gen_helper_neon_qshl_s32 }, - { gen_helper_neon_qshlu_s8, - gen_helper_neon_qshlu_s16, - gen_helper_neon_qshlu_s32 } - }, { - { NULL, NULL, NULL }, - { gen_helper_neon_qshl_u8, - gen_helper_neon_qshl_u16, - gen_helper_neon_qshl_u32 } - } - }; - NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size]; - MemOp memop = scalar ? size : MO_32; - int maxpass = scalar ? 1 : is_q ? 
4 : 2; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, memop); - genfn(tcg_op, tcg_env, tcg_op, tcg_shift); - if (scalar) { - switch (size) { - case 0: - tcg_gen_ext8u_i32(tcg_op, tcg_op); - break; - case 1: - tcg_gen_ext16u_i32(tcg_op, tcg_op); - break; - case 2: - break; - default: - g_assert_not_reached(); - } - write_fp_sreg(s, rd, tcg_op); - } else { - write_vec_element_i32(s, tcg_op, rd, pass, MO_32); - } - } - - if (!scalar) { - clear_vec_high(s, is_q, rd); - } - } -} - /* Common vector code for handling integer to FP conversion */ static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn, int elements, int is_signed, @@ -10019,53 +9925,26 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) } switch (opcode) { - case 0x08: /* SRI */ - if (!is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x00: /* SSHR / USHR */ - case 0x02: /* SSRA / USRA */ - case 0x04: /* SRSHR / URSHR */ - case 0x06: /* SRSRA / URSRA */ - handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd); - break; - case 0x0a: /* SHL / SLI */ - handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd); - break; case 0x1c: /* SCVTF, UCVTF */ handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb, opcode, rn, rd); break; - case 0x10: /* SQSHRUN, SQSHRUN2 */ - case 0x11: /* SQRSHRUN, SQRSHRUN2 */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_vec_simd_sqshrn(s, true, false, false, true, - immh, immb, opcode, rn, rd); - break; - case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */ - case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */ - handle_vec_simd_sqshrn(s, true, false, is_u, is_u, - immh, immb, opcode, rn, rd); - break; - case 0xc: /* SQSHLU */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd); - break; - case 0xe: /* SQSHL, UQSHL */ - handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd); - break; case 0x1f: /* FCVTZS, FCVTZU */ handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd); break; default: + case 0x00: /* SSHR / USHR */ + case 0x02: /* SSRA / USRA */ + case 0x04: /* SRSHR / URSHR */ + case 0x06: /* SRSRA / URSRA */ + case 0x08: /* SRI */ + case 0x0a: /* SHL / SLI */ + case 0x0c: /* SQSHLU */ + case 0x0e: /* SQSHL, UQSHL */ + case 0x10: /* SQSHRUN */ + case 0x11: /* SQRSHRUN */ + case 0x12: /* SQSHRN, UQSHRN */ + case 0x13: /* SQRSHRN, UQRSHRN */ unallocated_encoding(s); break; } @@ -10380,35 +10259,35 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar, * in the source becomes a size element in the destination). */ int pass; - TCGv_i32 tcg_res[2]; + TCGv_i64 tcg_res[2]; int destelt = is_q ? 2 : 0; int passes = scalar ? 
1 : 2; if (scalar) { - tcg_res[1] = tcg_constant_i32(0); + tcg_res[1] = tcg_constant_i64(0); } for (pass = 0; pass < passes; pass++) { TCGv_i64 tcg_op = tcg_temp_new_i64(); - NeonGenNarrowFn *genfn = NULL; - NeonGenNarrowEnvFn *genenvfn = NULL; + NeonGenOne64OpFn *genfn = NULL; + NeonGenOne64OpEnvFn *genenvfn = NULL; if (scalar) { read_vec_element(s, tcg_op, rn, pass, size + 1); } else { read_vec_element(s, tcg_op, rn, pass, MO_64); } - tcg_res[pass] = tcg_temp_new_i32(); + tcg_res[pass] = tcg_temp_new_i64(); switch (opcode) { case 0x12: /* XTN, SQXTUN */ { - static NeonGenNarrowFn * const xtnfns[3] = { + static NeonGenOne64OpFn * const xtnfns[3] = { gen_helper_neon_narrow_u8, gen_helper_neon_narrow_u16, - tcg_gen_extrl_i64_i32, + tcg_gen_ext32u_i64, }; - static NeonGenNarrowEnvFn * const sqxtunfns[3] = { + static NeonGenOne64OpEnvFn * const sqxtunfns[3] = { gen_helper_neon_unarrow_sat8, gen_helper_neon_unarrow_sat16, gen_helper_neon_unarrow_sat32, @@ -10422,7 +10301,7 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar, } case 0x14: /* SQXTN, UQXTN */ { - static NeonGenNarrowEnvFn * const fns[3][2] = { + static NeonGenOne64OpEnvFn * const fns[3][2] = { { gen_helper_neon_narrow_sat_s8, gen_helper_neon_narrow_sat_u8 }, { gen_helper_neon_narrow_sat_s16, @@ -10436,7 +10315,9 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar, case 0x16: /* FCVTN, FCVTN2 */ /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */ if (size == 2) { - gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, tcg_env); + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_vfp_fcvtsd(tmp, tcg_op, tcg_env); + tcg_gen_extu_i32_i64(tcg_res[pass], tmp); } else { TCGv_i32 tcg_lo = tcg_temp_new_i32(); TCGv_i32 tcg_hi = tcg_temp_new_i32(); @@ -10446,21 +10327,29 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar, tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op); gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp); gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp); - tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16); + tcg_gen_deposit_i32(tcg_lo, tcg_lo, tcg_hi, 16, 16); + tcg_gen_extu_i32_i64(tcg_res[pass], tcg_lo); } break; case 0x36: /* BFCVTN, BFCVTN2 */ { TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst); + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_bfcvt_pair(tmp, tcg_op, fpst); + tcg_gen_extu_i32_i64(tcg_res[pass], tmp); } break; case 0x56: /* FCVTXN, FCVTXN2 */ - /* 64 bit to 32 bit float conversion - * with von Neumann rounding (round to odd) - */ - assert(size == 2); - gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, tcg_env); + { + /* + * 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + assert(size == 2); + gen_helper_fcvtx_f64_to_f32(tmp, tcg_op, tcg_env); + tcg_gen_extu_i32_i64(tcg_res[pass], tmp); + } break; default: g_assert_not_reached(); @@ -10474,7 +10363,7 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar, } for (pass = 0; pass < 2; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); + write_vec_element(s, tcg_res[pass], rd, destelt + pass, MO_32); } clear_vec_high(s, is_q, rd); } @@ -10667,185 +10556,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) } } -/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ -static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 
- clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = 2 * (8 << size) - immhb; - GVecGen2iFn *gvec_fn; - - if (extract32(immh, 3, 1) && !is_q) { - unallocated_encoding(s); - return; - } - tcg_debug_assert(size <= 3); - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x02: /* SSRA / USRA (accumulate) */ - gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra; - break; - - case 0x08: /* SRI */ - gvec_fn = gen_gvec_sri; - break; - - case 0x00: /* SSHR / USHR */ - if (is_u) { - if (shift == 8 << size) { - /* Shift count the same size as element size produces zero. */ - tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd), - is_q ? 16 : 8, vec_full_reg_size(s), 0); - return; - } - gvec_fn = tcg_gen_gvec_shri; - } else { - /* Shift count the same size as element size produces all sign. */ - if (shift == 8 << size) { - shift -= 1; - } - gvec_fn = tcg_gen_gvec_sari; - } - break; - - case 0x04: /* SRSHR / URSHR (rounding) */ - gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr; - break; - - case 0x06: /* SRSRA / URSRA (accum + rounding) */ - gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra; - break; - - default: - g_assert_not_reached(); - } - - gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size); -} - -/* SHL/SLI - Vector shift left */ -static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - - /* Range of size is limited by decode: immh is a non-zero 4 bit field */ - assert(size >= 0 && size <= 3); - - if (extract32(immh, 3, 1) && !is_q) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (insert) { - gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size); - } else { - gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); - } -} - -/* USHLL/SHLL - Vector shift left with widening */ -static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - int dsize = 64; - int esize = 8 << size; - int elements = dsize/esize; - TCGv_i64 tcg_rn = tcg_temp_new_i64(); - TCGv_i64 tcg_rd = tcg_temp_new_i64(); - int i; - - if (size >= 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - /* For the LL variants the store is larger than the load, - * so if rd == rn we would overwrite parts of our input. - * So load everything right now and use shifts in the main loop. - */ - read_vec_element(s, tcg_rn, rn, is_q ? 
1 : 0, MO_64); - - for (i = 0; i < elements; i++) { - tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize); - ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0); - tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); - write_vec_element(s, tcg_rd, rd, i, size + 1); - } - clear_vec_high(s, true, rd); -} - -/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ -static void handle_vec_simd_shrn(DisasContext *s, bool is_q, - int immh, int immb, int opcode, int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int dsize = 64; - int esize = 8 << size; - int elements = dsize/esize; - int shift = (2 * esize) - immhb; - bool round = extract32(opcode, 0, 1); - TCGv_i64 tcg_rn, tcg_rd, tcg_final; - TCGv_i64 tcg_round; - int i; - - if (extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_rn = tcg_temp_new_i64(); - tcg_rd = tcg_temp_new_i64(); - tcg_final = tcg_temp_new_i64(); - read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64); - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, size+1); - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - false, true, size+1, shift); - - tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); - } - - if (!is_q) { - write_vec_element(s, tcg_final, rd, 0, MO_64); - } else { - write_vec_element(s, tcg_final, rd, 1, MO_64); - } - - clear_vec_high(s, is_q, rd); -} - - /* AdvSIMD shift by immediate * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 * +---+---+---+-------------+------+------+--------+---+------+------+ @@ -10862,60 +10572,33 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) bool is_u = extract32(insn, 29, 1); bool is_q = extract32(insn, 30, 1); - /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. 
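The immh:immb decode used throughout these (now deleted) handlers packs both the element size and the shift amount into one field: the position of the highest set bit of immh gives the size, and for right shifts the amount is 2 * esize - immh:immb. A sketch (helper name invented; __builtin_clz is a GCC/Clang builtin and immh must be non-zero):

    static int decode_shr_amount(int immh, int immb)
    {
        int size = 31 - __builtin_clz(immh);  /* highest set bit of immh */
        int esize = 8 << size;
        return 2 * esize - ((immh << 3) | immb);
    }
    /* immh = 0b0001, immb = 0b101: esize = 8, shift = 16 - 13 = 3 */

This mirrors the size = 32 - clz32(immh) - 1 computation in the legacy code above, and explains why immh == 0 has no valid shift encoding and must be rejected.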
*/ - assert(immh != 0); + if (immh == 0) { + unallocated_encoding(s); + return; + } switch (opcode) { - case 0x08: /* SRI */ - if (!is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ + case 0x1c: /* SCVTF / UCVTF */ + handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x1f: /* FCVTZS/ FCVTZU */ + handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); + return; + default: case 0x00: /* SSHR / USHR */ case 0x02: /* SSRA / USRA (accumulate) */ case 0x04: /* SRSHR / URSHR (rounding) */ case 0x06: /* SRSRA / URSRA (accum + rounding) */ - handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; + case 0x08: /* SRI */ case 0x0a: /* SHL / SLI */ - handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; - case 0x10: /* SHRN */ + case 0x0c: /* SQSHLU */ + case 0x0e: /* SQSHL, UQSHL */ + case 0x10: /* SHRN / SQSHRUN */ case 0x11: /* RSHRN / SQRSHRUN */ - if (is_u) { - handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb, - opcode, rn, rd); - } else { - handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd); - } - break; case 0x12: /* SQSHRN / UQSHRN */ case 0x13: /* SQRSHRN / UQRSHRN */ - handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb, - opcode, rn, rd); - break; case 0x14: /* SSHLL / USHLL */ - handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; - case 0x1c: /* SCVTF / UCVTF */ - handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, - opcode, rn, rd); - break; - case 0xc: /* SQSHLU */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd); - break; - case 0xe: /* SQSHL, UQSHL */ - handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd); - break; - case 0x1f: /* FCVTZS/ FCVTZU */ - handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); - return; - default: unallocated_encoding(s); return; } @@ -11871,13 +11554,7 @@ static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn) static const AArch64DecodeTable data_proc_simd[] = { /* pattern , mask , fn */ { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, - { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, - /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */ - { 0x0f000400, 0x9ff80400, disas_simd_mod_imm }, { 0x0f000400, 0x9f800400, disas_simd_shift_imm }, - { 0x0e000000, 0xbf208c00, disas_simd_tb }, - { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, - { 0x2e000000, 0xbf208400, disas_simd_ext }, { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm }, { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 }, diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c index 13cd31a..9c8829a 100644 --- a/target/arm/tcg/translate-neon.c +++ b/target/arm/tcg/translate-neon.c @@ -1099,144 +1099,18 @@ DO_2SH(VRSHR_S, gen_gvec_srshr) DO_2SH(VRSHR_U, gen_gvec_urshr) DO_2SH(VRSRA_S, gen_gvec_srsra) DO_2SH(VRSRA_U, gen_gvec_ursra) - -static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a) -{ - /* Signed shift out of range results in all-sign-bits */ - a->shift = MIN(a->shift, (8 << a->size) - 1); - return do_vector_2sh(s, a, tcg_gen_gvec_sari); -} - -static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0); -} - -static bool 
trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a) -{ - /* Shift out of range is architecturally valid and results in zero. */ - if (a->shift >= (8 << a->size)) { - return do_vector_2sh(s, a, gen_zero_rd_2sh); - } else { - return do_vector_2sh(s, a, tcg_gen_gvec_shri); - } -} - -static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a, - NeonGenTwo64OpEnvFn *fn) -{ - /* - * 2-reg-and-shift operations, size == 3 case, where the - * function needs to be passed tcg_env. - */ - TCGv_i64 constimm; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * To avoid excessive duplication of ops we implement shift - * by immediate using the variable shift operations. - */ - constimm = tcg_constant_i64(dup_const(a->size, a->shift)); - - for (pass = 0; pass < a->q + 1; pass++) { - TCGv_i64 tmp = tcg_temp_new_i64(); - - read_neon_element64(tmp, a->vm, pass, MO_64); - fn(tmp, tcg_env, tmp, constimm); - write_neon_element64(tmp, a->vd, pass, MO_64); - } - return true; -} - -static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a, - NeonGenTwoOpEnvFn *fn) -{ - /* - * 2-reg-and-shift operations, size < 3 case, where the - * helper needs to be passed tcg_env. - */ - TCGv_i32 constimm, tmp; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * To avoid excessive duplication of ops we implement shift - * by immediate using the variable shift operations. - */ - constimm = tcg_constant_i32(dup_const(a->size, a->shift)); - tmp = tcg_temp_new_i32(); - - for (pass = 0; pass < (a->q ? 
4 : 2); pass++) {
-        read_neon_element32(tmp, a->vm, pass, MO_32);
-        fn(tmp, tcg_env, tmp, constimm);
-        write_neon_element32(tmp, a->vd, pass, MO_32);
-    }
-    return true;
-}
-
-#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
-    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
-    {                                                                   \
-        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
-    }                                                                   \
-    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
-    {                                                                   \
-        static NeonGenTwoOpEnvFn * const fns[] = {                      \
-            gen_helper_neon_##FUNC##8,                                  \
-            gen_helper_neon_##FUNC##16,                                 \
-            gen_helper_neon_##FUNC##32,                                 \
-        };                                                              \
-        assert(a->size < ARRAY_SIZE(fns));                              \
-        return do_2shift_env_32(s, a, fns[a->size]);                    \
-    }
-
-DO_2SHIFT_ENV(VQSHLU, qshlu_s)
-DO_2SHIFT_ENV(VQSHL_U, qshl_u)
-DO_2SHIFT_ENV(VQSHL_S, qshl_s)
+DO_2SH(VSHR_S, gen_gvec_sshr)
+DO_2SH(VSHR_U, gen_gvec_ushr)
+DO_2SH(VQSHLU, gen_neon_sqshlui)
+DO_2SH(VQSHL_U, gen_neon_uqshli)
+DO_2SH(VQSHL_S, gen_neon_sqshli)
 
 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                 NeonGenTwo64OpFn *shiftfn,
-                                NeonGenNarrowEnvFn *narrowfn)
+                                NeonGenOne64OpEnvFn *narrowfn)
 {
     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
-    TCGv_i64 constimm, rm1, rm2;
-    TCGv_i32 rd;
+    TCGv_i64 constimm, rm1, rm2, rd;
 
     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
         return false;
@@ -1263,7 +1137,7 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
     constimm = tcg_constant_i64(-a->shift);
     rm1 = tcg_temp_new_i64();
     rm2 = tcg_temp_new_i64();
-    rd = tcg_temp_new_i32();
+    rd = tcg_temp_new_i64();
 
     /* Load both inputs first to avoid potential overwrite if rm == rd */
     read_neon_element64(rm1, a->vm, 0, MO_64);
@@ -1271,18 +1145,18 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
 
     shiftfn(rm1, rm1, constimm);
     narrowfn(rd, tcg_env, rm1);
-    write_neon_element32(rd, a->vd, 0, MO_32);
+    write_neon_element64(rd, a->vd, 0, MO_32);
 
     shiftfn(rm2, rm2, constimm);
     narrowfn(rd, tcg_env, rm2);
-    write_neon_element32(rd, a->vd, 1, MO_32);
+    write_neon_element64(rd, a->vd, 1, MO_32);
 
     return true;
 }
 
 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                 NeonGenTwoOpFn *shiftfn,
-                                NeonGenNarrowEnvFn *narrowfn)
+                                NeonGenOne64OpEnvFn *narrowfn)
 {
     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
@@ -1337,16 +1211,16 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
 
     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
-    narrowfn(rm1, tcg_env, rtmp);
-    write_neon_element32(rm1, a->vd, 0, MO_32);
+    narrowfn(rtmp, tcg_env, rtmp);
+    write_neon_element64(rtmp, a->vd, 0, MO_32);
 
     shiftfn(rm3, rm3, constimm);
     shiftfn(rm4, rm4, constimm);
     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
-    narrowfn(rm3, tcg_env, rtmp);
-    write_neon_element32(rm3, a->vd, 1, MO_32);
+    narrowfn(rtmp, tcg_env, rtmp);
+    write_neon_element64(rtmp, a->vd, 1, MO_32);
 
     return true;
 }
@@ -1361,17 +1235,17 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
     }
 
-static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u32(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
-    tcg_gen_extrl_i64_i32(dest, src);
+    tcg_gen_ext32u_i64(dest, src);
 }
 
-static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u16(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
     gen_helper_neon_narrow_u16(dest, src);
 }
 
-static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u8(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
     gen_helper_neon_narrow_u8(dest, src);
 }
@@ -2962,10 +2836,9 @@ static bool trans_VZIP(DisasContext *s, arg_2misc *a)
 }
 
 static bool do_vmovn(DisasContext *s, arg_2misc *a,
-                     NeonGenNarrowEnvFn *narrowfn)
+                     NeonGenOne64OpEnvFn *narrowfn)
 {
-    TCGv_i64 rm;
-    TCGv_i32 rd0, rd1;
+    TCGv_i64 rm, rd0, rd1;
 
     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
         return false;
@@ -2990,22 +2863,22 @@ static bool do_vmovn(DisasContext *s, arg_2misc *a,
     }
 
     rm = tcg_temp_new_i64();
-    rd0 = tcg_temp_new_i32();
-    rd1 = tcg_temp_new_i32();
+    rd0 = tcg_temp_new_i64();
+    rd1 = tcg_temp_new_i64();
 
     read_neon_element64(rm, a->vm, 0, MO_64);
     narrowfn(rd0, tcg_env, rm);
     read_neon_element64(rm, a->vm, 1, MO_64);
     narrowfn(rd1, tcg_env, rm);
-    write_neon_element32(rd0, a->vd, 0, MO_32);
-    write_neon_element32(rd1, a->vd, 1, MO_32);
+    write_neon_element64(rd0, a->vd, 0, MO_32);
+    write_neon_element64(rd1, a->vd, 1, MO_32);
     return true;
 }
 
 #define DO_VMOVN(INSN, FUNC)                                    \
     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
     {                                                           \
-        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
+        static NeonGenOne64OpEnvFn * const narrowfn[] = {       \
             FUNC##8,                                            \
             FUNC##16,                                           \
             FUNC##32,                                           \
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index 9e2536d..49d32fa 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -6081,9 +6081,9 @@ static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
 
     if (top) {
         if (shl == halfbits) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(halfbits, halfbits)));
         } else {
             tcg_gen_sari_vec(vece, d, n, halfbits);
             tcg_gen_shli_vec(vece, d, d, shl);
@@ -6138,18 +6138,18 @@ static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
 
     if (top) {
         if (shl == halfbits) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(halfbits, halfbits)));
         } else {
             tcg_gen_shri_vec(vece, d, n, halfbits);
             tcg_gen_shli_vec(vece, d, d, shl);
         }
     } else {
         if (shl == 0) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(0, halfbits)));
         } else {
             tcg_gen_shli_vec(vece, d, n, halfbits);
             tcg_gen_shri_vec(vece, d, d, halfbits - shl);
@@ -6317,18 +6317,14 @@ static const TCGOpcode sqxtn_list[] = {
 
 static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t mask = (1ull << halfbits) - 1;
     int64_t min = -1ull << (halfbits - 1);
     int64_t max = -min - 1;
 
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, d, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, d, d, t);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_and_vec(vece, d, d, t);
+    tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
+    tcg_gen_and_vec(vece, d, d, tcg_constant_vec_matching(d, vece, mask));
 }
 
 static const GVecGen2 sqxtnb_ops[3] = {
@@ -6349,19 +6345,15 @@ TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops)
 
 static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t mask = (1ull << halfbits) - 1;
     int64_t min = -1ull << (halfbits - 1);
     int64_t max = -min - 1;
 
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }
 
 static const GVecGen2 sqxtnt_ops[3] = {
@@ -6389,12 +6381,10 @@ static const TCGOpcode uqxtn_list[] = {
 
 static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
 
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }
 
 static const GVecGen2 uqxtnb_ops[3] = {
@@ -6415,14 +6405,13 @@ TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops)
 
 static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
 
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }
 
 static const GVecGen2 uqxtnt_ops[3] = {
@@ -6450,14 +6439,11 @@ static const TCGOpcode sqxtun_list[] = {
 
 static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
 
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, d, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, d, d, t);
+    tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
 }
 
 static const GVecGen2 sqxtunb_ops[3] = {
@@ -6478,16 +6464,14 @@ TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops)
 
 static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
 
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }
 
 static const GVecGen2 sqxtunt_ops[3] = {
@@ -6551,13 +6535,11 @@ static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
 
 static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_and_vec(vece, d, n, t);
+    tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
 }
 
 static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 };
@@ -6609,13 +6591,11 @@ static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
 
 static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_shli_vec(vece, n, n, halfbits - shr);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }
 
 static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 };
@@ -6658,14 +6638,12 @@ TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops)
 
 static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    uint64_t max = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }
 
 static const TCGOpcode sqshrunb_vec_list[] = {
@@ -6690,16 +6668,15 @@ TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops)
 
 static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    uint64_t max = MAKE_64BIT_MASK(0, halfbits);
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
 
     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }
 
 static const TCGOpcode sqshrunt_vec_list[] = {
@@ -6742,18 +6719,15 @@ TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops)
 
 static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
     int64_t min = -max - 1;
+    int64_t mask = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_and_vec(vece, d, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
+    tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
 }
 
 static const TCGOpcode sqshrnb_vec_list[] = {
@@ -6778,19 +6752,16 @@ TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops)
 
 static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
     int64_t min = -max - 1;
+    int64_t mask = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }
 
 static const TCGOpcode sqshrnt_vec_list[] = {
@@ -6833,12 +6804,11 @@ TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops)
 
 static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    int64_t max = MAKE_64BIT_MASK(0, halfbits);
 
     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }
 
 static const TCGOpcode uqshrnb_vec_list[] = {
@@ -6863,14 +6833,14 @@ TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops)
 
 static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    int64_t max = MAKE_64BIT_MASK(0, halfbits);
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
 
     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_umin_vec(vece, n, n, maxv);
    tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }
 
 static const TCGOpcode uqshrnt_vec_list[] = {
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 3f0e9ce..5a2e10d 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -471,6 +471,13 @@ void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
 
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                      int64_t c, uint32_t opr_sz, uint32_t max_sz);
+
 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
@@ -514,6 +521,11 @@ void gen_sqsub_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
 
+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz);
 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
@@ -593,13 +605,13 @@ typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
 typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
 typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
 typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
-typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
 typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
 typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32);
 typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
 typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64);
+typedef void NeonGenOne64OpEnvFn(TCGv_i64, TCGv_env, TCGv_i64);
 typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
 typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
diff --git a/tests/functional/test_aarch64_sbsaref.py b/tests/functional/test_aarch64_sbsaref.py
index f31c2a6..b50e1a5 100755
--- a/tests/functional/test_aarch64_sbsaref.py
+++ b/tests/functional/test_aarch64_sbsaref.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Functional test that boots a Linux kernel and checks the console
+# Functional test that boots a kernel and checks the console
 #
 # SPDX-FileCopyrightText: 2023-2024 Linaro Ltd.
 # SPDX-FileContributor: Philippe Mathieu-Daudé <philmd@linaro.org>
@@ -110,16 +110,17 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     # This tests the whole boot chain from EFI to Userspace
     # We only boot a whole OS for the current top level CPU and GIC
     # Other test profiles should use more minimal boots
-    def boot_alpine_linux(self, cpu):
+    def boot_alpine_linux(self, cpu=None):
         self.fetch_firmware()
 
         iso_path = self.ASSET_ALPINE_ISO.fetch()
 
         self.vm.set_console()
         self.vm.add_args(
-            "-cpu", cpu,
             "-drive", f"file={iso_path},media=cdrom,format=raw",
         )
+        if cpu:
+            self.vm.add_args("-cpu", cpu)
 
         self.vm.launch()
         wait_for_console_pattern(self, "Welcome to Alpine Linux 3.17")
@@ -127,8 +128,8 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     def test_sbsaref_alpine_linux_cortex_a57(self):
         self.boot_alpine_linux("cortex-a57")
 
-    def test_sbsaref_alpine_linux_neoverse_n1(self):
-        self.boot_alpine_linux("neoverse-n1")
+    def test_sbsaref_alpine_linux_default_cpu(self):
+        self.boot_alpine_linux()
 
     def test_sbsaref_alpine_linux_max_pauth_off(self):
         self.boot_alpine_linux("max,pauth=off")
@@ -136,50 +137,53 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     def test_sbsaref_alpine_linux_max_pauth_impdef(self):
         self.boot_alpine_linux("max,pauth-impdef=on")
 
-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
     def test_sbsaref_alpine_linux_max(self):
         self.boot_alpine_linux("max")
 
-    ASSET_OPENBSD_ISO = Asset(
-        ('https://cdn.openbsd.org/pub/OpenBSD/7.3/arm64/miniroot73.img'),
-        '7fc2c75401d6f01fbfa25f4953f72ad7d7c18650056d30755c44b9c129b707e5')
+    ASSET_FREEBSD_ISO = Asset(
+        ('https://download.freebsd.org/releases/arm64/aarch64/ISO-IMAGES/'
+         '14.1/FreeBSD-14.1-RELEASE-arm64-aarch64-bootonly.iso'),
+        '44cdbae275ef1bb6dab1d5fbb59473d4f741e1c8ea8a80fd9e906b531d6ad461')
 
     # This tests the whole boot chain from EFI to Userspace
     # We only boot a whole OS for the current top level CPU and GIC
     # Other test profiles should use more minimal boots
-    def boot_openbsd73(self, cpu):
+    def boot_freebsd14(self, cpu=None):
         self.fetch_firmware()
 
-        img_path = self.ASSET_OPENBSD_ISO.fetch()
+        img_path = self.ASSET_FREEBSD_ISO.fetch()
 
         self.vm.set_console()
         self.vm.add_args(
-            "-cpu", cpu,
             "-drive", f"file={img_path},format=raw,snapshot=on",
         )
+        if cpu:
+            self.vm.add_args("-cpu", cpu)
 
         self.vm.launch()
-        wait_for_console_pattern(self,
-                                 "Welcome to the OpenBSD/arm64"
-                                 " 7.3 installation program.")
+        wait_for_console_pattern(self, 'Welcome to FreeBSD!')
 
-    def test_sbsaref_openbsd73_cortex_a57(self):
-        self.boot_openbsd73("cortex-a57")
+    def test_sbsaref_freebsd14_cortex_a57(self):
+        self.boot_freebsd14("cortex-a57")
 
-    def test_sbsaref_openbsd73_neoverse_n1(self):
-        self.boot_openbsd73("neoverse-n1")
+    def test_sbsaref_freebsd14_default_cpu(self):
+        self.boot_freebsd14()
 
-    def test_sbsaref_openbsd73_max_pauth_off(self):
-        self.boot_openbsd73("max,pauth=off")
+    def test_sbsaref_freebsd14_max_pauth_off(self):
+        self.boot_freebsd14("max,pauth=off")
 
-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
-    def test_sbsaref_openbsd73_max_pauth_impdef(self):
-        self.boot_openbsd73("max,pauth-impdef=on")
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
+    def test_sbsaref_freebsd14_max_pauth_impdef(self):
+        self.boot_freebsd14("max,pauth-impdef=on")
 
-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
-    def test_sbsaref_openbsd73_max(self):
-        self.boot_openbsd73("max")
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
+    def test_sbsaref_freebsd14_max(self):
+        self.boot_freebsd14("max")
 
 
 if __name__ == '__main__':
diff --git a/tests/qtest/stm32l4x5_usart-test.c b/tests/qtest/stm32l4x5_usart-test.c
index c175ff3..64cebda 100644
--- a/tests/qtest/stm32l4x5_usart-test.c
+++ b/tests/qtest/stm32l4x5_usart-test.c
@@ -36,6 +36,8 @@ REG32(GTPR, 0x10)
 REG32(RTOR, 0x14)
 REG32(RQR, 0x18)
 REG32(ISR, 0x1C)
+    FIELD(ISR, REACK, 22, 1)
+    FIELD(ISR, TEACK, 21, 1)
     FIELD(ISR, TXE, 7, 1)
     FIELD(ISR, RXNE, 5, 1)
     FIELD(ISR, ORE, 3, 1)
@@ -191,7 +193,7 @@ static void init_uart(QTestState *qts)
 
     /* Enable the transmitter, the receiver and the USART. */
     qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
-                 R_CR1_UE_MASK | R_CR1_RE_MASK | R_CR1_TE_MASK);
+                 cr1 | R_CR1_UE_MASK | R_CR1_RE_MASK | R_CR1_TE_MASK);
 }
 
 static void test_write_read(void)
@@ -298,6 +300,37 @@ static void test_send_str(void)
     qtest_quit(qts);
 }
 
+static void test_ack(void)
+{
+    uint32_t cr1;
+    uint32_t isr;
+    QTestState *qts = qtest_init("-M b-l475e-iot01a");
+
+    init_uart(qts);
+
+    cr1 = qtest_readl(qts, (USART1_BASE_ADDR + A_CR1));
+
+    /* Disable the transmitter and receiver. */
+    qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
+                 cr1 & ~(R_CR1_RE_MASK | R_CR1_TE_MASK));
+
+    /* Test ISR ACK for transmitter and receiver disabled */
+    isr = qtest_readl(qts, (USART1_BASE_ADDR + A_ISR));
+    g_assert_false(isr & R_ISR_TEACK_MASK);
+    g_assert_false(isr & R_ISR_REACK_MASK);
+
+    /* Enable the transmitter and receiver. */
+    qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
+                 cr1 | (R_CR1_RE_MASK | R_CR1_TE_MASK));
+
+    /* Test ISR ACK for transmitter and receiver enabled */
+    isr = qtest_readl(qts, (USART1_BASE_ADDR + A_ISR));
+    g_assert_true(isr & R_ISR_TEACK_MASK);
+    g_assert_true(isr & R_ISR_REACK_MASK);
+
+    qtest_quit(qts);
+}
+
 int main(int argc, char **argv)
 {
     int ret;
@@ -310,6 +343,7 @@ int main(int argc, char **argv)
     qtest_add_func("stm32l4x5/usart/send_char", test_send_char);
     qtest_add_func("stm32l4x5/usart/receive_str", test_receive_str);
     qtest_add_func("stm32l4x5/usart/send_str", test_send_str);
+    qtest_add_func("stm32l4x5/usart/ack", test_ack);
 
     ret = g_test_run();
     return ret;
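As a reader aid, here is a standalone C sketch of the per-element arithmetic behind the narrowing-shift conversions above (cf. gen_uqshrnb_vec: shift right, then clamp with umin against the half-width maximum). This is an illustrative assumption-laden sketch, not code from the patch: the helper name, the fixed 64-to-32-bit width, and the test values are invented for demonstration.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative only (not from the patch): unsigned saturating
     * shift-right-narrow on one element, narrowing 64 bits to 32.
     * Mirrors the sequence generated by gen_uqshrnb_vec above:
     * shift right by 'shr', then umin against the half-width
     * maximum (MAKE_64BIT_MASK(0, 32) in QEMU terms).
     */
    static uint32_t uqshrn_64_to_32(uint64_t n, unsigned shr)
    {
        uint64_t max = 0xffffffffull;   /* half-width all-ones mask */
        uint64_t v = n >> shr;
        return (uint32_t)(v < max ? v : max);   /* umin(v, max) */
    }

    int main(void)
    {
        /* 0x1ffffffff >> 1 == 0xffffffff: fits in 32 bits unchanged */
        printf("0x%" PRIx32 "\n", uqshrn_64_to_32(0x1ffffffffull, 1));
        /* 0xffffffffff >> 4 == 0xfffffffff: saturates to 0xffffffff */
        printf("0x%" PRIx32 "\n", uqshrn_64_to_32(0xffffffffffull, 4));
        return 0;
    }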