author    Fabiano Rosas <farosas@suse.de>           2023-02-17 17:11:29 -0300
committer Peter Maydell <peter.maydell@linaro.org>  2023-02-27 13:27:04 +0000
commit    f0984d4040c328d1c021ae6680479cbbe13c485b
tree      96d3b38a11fe7dc6ba19c24e47f06f48eb0cb8e5 /target/arm/tcg
parent    2059ec754f9040a6a9f62a9abfeb76a9d8655e11
target/arm: move translate modules to tcg/
Introduce the target/arm/tcg directory. Its purpose is to hold the TCG
code that is selected by CONFIG_TCG.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'target/arm/tcg')
-rw-r--r--  target/arm/tcg/a32-uncond.decode   |    74
-rw-r--r--  target/arm/tcg/a32.decode          |   557
-rw-r--r--  target/arm/tcg/m-nocp.decode       |    72
-rw-r--r--  target/arm/tcg/meson.build         |    32
-rw-r--r--  target/arm/tcg/mve.decode          |   832
-rw-r--r--  target/arm/tcg/neon-dp.decode      |   646
-rw-r--r--  target/arm/tcg/neon-ls.decode      |    52
-rw-r--r--  target/arm/tcg/neon-shared.decode  |    99
-rw-r--r--  target/arm/tcg/sme-fa64.decode     |    60
-rw-r--r--  target/arm/tcg/sme.decode          |    88
-rw-r--r--  target/arm/tcg/sve.decode          |  1702
-rw-r--r--  target/arm/tcg/t16.decode          |   281
-rw-r--r--  target/arm/tcg/t32.decode          |   753
-rw-r--r--  target/arm/tcg/translate-a64.c     | 15054
-rw-r--r--  target/arm/tcg/translate-a64.h     |   201
-rw-r--r--  target/arm/tcg/translate-m-nocp.c  |   788
-rw-r--r--  target/arm/tcg/translate-mve.c     |  2310
-rw-r--r--  target/arm/tcg/translate-neon.c    |  4064
-rw-r--r--  target/arm/tcg/translate-sme.c     |   373
-rw-r--r--  target/arm/tcg/translate-sve.c     |  7583
-rw-r--r--  target/arm/tcg/translate-vfp.c     |  3619
-rw-r--r--  target/arm/tcg/translate.c         |  9990
-rw-r--r--  target/arm/tcg/translate.h         |   644
-rw-r--r--  target/arm/tcg/vfp-uncond.decode   |    82
-rw-r--r--  target/arm/tcg/vfp.decode          |   247
25 files changed, 50203 insertions(+), 0 deletions(-)
diff --git a/target/arm/tcg/a32-uncond.decode b/target/arm/tcg/a32-uncond.decode
new file mode 100644
index 0000000..2339de2
--- /dev/null
+++ b/target/arm/tcg/a32-uncond.decode
@@ -0,0 +1,74 @@
+# A32 unconditional instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# All insns that have 0xf in insn[31:28] are decoded here.
+# All of those that have a COND field in insn[31:28] are in a32.decode
+#
+
+&empty !extern
+&i !extern imm
+&setend E
+
+# Branch with Link and Exchange
+
+%imm24h 0:s24 24:1 !function=times_2
+
+BLX_i 1111 101 . ........................ &i imm=%imm24h
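
The !function= hook above is an ordinary C extractor run on the value decodetree
pulls out of the insn. As a point of reference, a minimal sketch of times_2,
modelled on the helpers QEMU keeps in target/arm/translate.h (the DisasContext
argument is the per-instruction decode context every extractor receives):

static inline int times_2(DisasContext *s, int x)
{
    /* %imm24h concatenates imm24:H; doubling it yields the
     * halfword-aligned offset used by BLX (immediate). */
    return x * 2;
}
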
+
+# System Instructions
+
+&rfe rn w pu
+&srs mode w pu
+&cps mode imod M A I F
+
+RFE 1111 100 pu:2 0 w:1 1 rn:4 0000 1010 0000 0000 &rfe
+SRS 1111 100 pu:2 1 w:1 0 1101 0000 0101 000 mode:5 &srs
+CPS 1111 0001 0000 imod:2 M:1 0 0000 000 A:1 I:1 F:1 0 mode:5 \
+ &cps
+
+# Clear-Exclusive, Barriers
+
+# QEMU does not require the option field for the barriers.
+CLREX 1111 0101 0111 1111 1111 0000 0001 1111
+DSB 1111 0101 0111 1111 1111 0000 0100 ----
+DMB 1111 0101 0111 1111 1111 0000 0101 ----
+ISB 1111 0101 0111 1111 1111 0000 0110 ----
+SB 1111 0101 0111 1111 1111 0000 0111 0000
+
+# Set Endianness
+SETEND 1111 0001 0000 0001 0000 00 E:1 0 0000 0000 &setend
+
+# Preload instructions
+
+PLD 1111 0101 -101 ---- 1111 ---- ---- ---- # (imm, lit) 5te
+PLDW 1111 0101 -001 ---- 1111 ---- ---- ---- # (imm, lit) 7mp
+PLI 1111 0100 -101 ---- 1111 ---- ---- ---- # (imm, lit) 7
+
+PLD 1111 0111 -101 ---- 1111 ----- -- 0 ---- # (register) 5te
+PLDW 1111 0111 -001 ---- 1111 ----- -- 0 ---- # (register) 7mp
+PLI 1111 0110 -101 ---- 1111 ----- -- 0 ---- # (register) 7
+
+# Unallocated memory hints
+#
+# Since these are v7MP nops, and PLDW is v7MP and implemented as nop,
+# (ab)use the PLDW helper.
+
+PLDW 1111 0100 -001 ---- ---- ---- ---- ----
+PLDW 1111 0110 -001 ---- ---- ---- ---0 ----
diff --git a/target/arm/tcg/a32.decode b/target/arm/tcg/a32.decode
new file mode 100644
index 0000000..f2ca480
--- /dev/null
+++ b/target/arm/tcg/a32.decode
@@ -0,0 +1,557 @@
+# A32 conditional instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# All of the insns that have a COND field in insn[31:28] are here.
+# All insns that have 0xf in insn[31:28] are in a32-uncond.decode.
+#
+
+&empty
+&s_rrr_shi s rd rn rm shim shty
+&s_rrr_shr s rn rd rm rs shty
+&s_rri_rot s rn rd imm rot
+&s_rrrr s rd rn rm ra
+&rrrr rd rn rm ra
+&rrr_rot rd rn rm rot
+&rrr rd rn rm
+&rr rd rm
+&ri rd imm
+&r rm
+&i imm
+&msr_reg rn r mask
+&mrs_reg rd r
+&msr_bank rn r sysm
+&mrs_bank rd r sysm
+&ldst_rr p w u rn rt rm shimm shtype
+&ldst_ri p w u rn rt imm
+&ldst_block rn i b u w list
+&strex rn rd rt rt2 imm
+&ldrex rn rt rt2 imm
+&bfx rd rn lsb widthm1
+&bfi rd rn lsb msb
+&sat rd rn satimm imm sh
+&pkh rd rn rm imm tb
+&mcr cp opc1 crn crm opc2 rt
+&mcrr cp opc1 crm rt rt2
+
+# Data-processing (register)
+
+@s_rrr_shi ---- ... .... s:1 rn:4 rd:4 shim:5 shty:2 . rm:4 \
+ &s_rrr_shi
+@s_rxr_shi ---- ... .... s:1 .... rd:4 shim:5 shty:2 . rm:4 \
+ &s_rrr_shi rn=0
+@S_xrr_shi ---- ... .... . rn:4 .... shim:5 shty:2 . rm:4 \
+ &s_rrr_shi s=1 rd=0
+
+AND_rrri .... 000 0000 . .... .... ..... .. 0 .... @s_rrr_shi
+EOR_rrri .... 000 0001 . .... .... ..... .. 0 .... @s_rrr_shi
+SUB_rrri .... 000 0010 . .... .... ..... .. 0 .... @s_rrr_shi
+RSB_rrri .... 000 0011 . .... .... ..... .. 0 .... @s_rrr_shi
+ADD_rrri .... 000 0100 . .... .... ..... .. 0 .... @s_rrr_shi
+ADC_rrri .... 000 0101 . .... .... ..... .. 0 .... @s_rrr_shi
+SBC_rrri .... 000 0110 . .... .... ..... .. 0 .... @s_rrr_shi
+RSC_rrri .... 000 0111 . .... .... ..... .. 0 .... @s_rrr_shi
+TST_xrri .... 000 1000 1 .... 0000 ..... .. 0 .... @S_xrr_shi
+TEQ_xrri .... 000 1001 1 .... 0000 ..... .. 0 .... @S_xrr_shi
+CMP_xrri .... 000 1010 1 .... 0000 ..... .. 0 .... @S_xrr_shi
+CMN_xrri .... 000 1011 1 .... 0000 ..... .. 0 .... @S_xrr_shi
+ORR_rrri .... 000 1100 . .... .... ..... .. 0 .... @s_rrr_shi
+MOV_rxri .... 000 1101 . 0000 .... ..... .. 0 .... @s_rxr_shi
+BIC_rrri .... 000 1110 . .... .... ..... .. 0 .... @s_rrr_shi
+MVN_rxri .... 000 1111 . 0000 .... ..... .. 0 .... @s_rxr_shi
+
+%imm16 16:4 0:12
+@mov16 ---- .... .... .... rd:4 ............ &ri imm=%imm16
+
+MOVW .... 0011 0000 .... .... ............ @mov16
+MOVT .... 0011 0100 .... .... ............ @mov16
+
+# Data-processing (register-shifted register)
+
+@s_rrr_shr ---- ... .... s:1 rn:4 rd:4 rs:4 . shty:2 . rm:4 \
+ &s_rrr_shr
+@s_rxr_shr ---- ... .... s:1 .... rd:4 rs:4 . shty:2 . rm:4 \
+ &s_rrr_shr rn=0
+@S_xrr_shr ---- ... .... . rn:4 .... rs:4 . shty:2 . rm:4 \
+ &s_rrr_shr rd=0 s=1
+
+AND_rrrr .... 000 0000 . .... .... .... 0 .. 1 .... @s_rrr_shr
+EOR_rrrr .... 000 0001 . .... .... .... 0 .. 1 .... @s_rrr_shr
+SUB_rrrr .... 000 0010 . .... .... .... 0 .. 1 .... @s_rrr_shr
+RSB_rrrr .... 000 0011 . .... .... .... 0 .. 1 .... @s_rrr_shr
+ADD_rrrr .... 000 0100 . .... .... .... 0 .. 1 .... @s_rrr_shr
+ADC_rrrr .... 000 0101 . .... .... .... 0 .. 1 .... @s_rrr_shr
+SBC_rrrr .... 000 0110 . .... .... .... 0 .. 1 .... @s_rrr_shr
+RSC_rrrr .... 000 0111 . .... .... .... 0 .. 1 .... @s_rrr_shr
+TST_xrrr .... 000 1000 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr
+TEQ_xrrr .... 000 1001 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr
+CMP_xrrr .... 000 1010 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr
+CMN_xrrr .... 000 1011 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr
+ORR_rrrr .... 000 1100 . .... .... .... 0 .. 1 .... @s_rrr_shr
+MOV_rxrr .... 000 1101 . 0000 .... .... 0 .. 1 .... @s_rxr_shr
+BIC_rrrr .... 000 1110 . .... .... .... 0 .. 1 .... @s_rrr_shr
+MVN_rxrr .... 000 1111 . 0000 .... .... 0 .. 1 .... @s_rxr_shr
+
+# Data-processing (immediate)
+
+%a32extrot 8:4 !function=times_2
+
+@s_rri_rot ---- ... .... s:1 rn:4 rd:4 .... imm:8 \
+ &s_rri_rot rot=%a32extrot
+@s_rxi_rot ---- ... .... s:1 .... rd:4 .... imm:8 \
+ &s_rri_rot rot=%a32extrot rn=0
+@S_xri_rot ---- ... .... . rn:4 .... .... imm:8 \
+ &s_rri_rot rot=%a32extrot rd=0 s=1
+
+AND_rri .... 001 0000 . .... .... ............ @s_rri_rot
+EOR_rri .... 001 0001 . .... .... ............ @s_rri_rot
+SUB_rri .... 001 0010 . .... .... ............ @s_rri_rot
+RSB_rri .... 001 0011 . .... .... ............ @s_rri_rot
+ADD_rri .... 001 0100 . .... .... ............ @s_rri_rot
+ADC_rri .... 001 0101 . .... .... ............ @s_rri_rot
+SBC_rri .... 001 0110 . .... .... ............ @s_rri_rot
+RSC_rri .... 001 0111 . .... .... ............ @s_rri_rot
+TST_xri .... 001 1000 1 .... 0000 ............ @S_xri_rot
+TEQ_xri .... 001 1001 1 .... 0000 ............ @S_xri_rot
+CMP_xri .... 001 1010 1 .... 0000 ............ @S_xri_rot
+CMN_xri .... 001 1011 1 .... 0000 ............ @S_xri_rot
+ORR_rri .... 001 1100 . .... .... ............ @s_rri_rot
+MOV_rxi .... 001 1101 . 0000 .... ............ @s_rxi_rot
+BIC_rri .... 001 1110 . .... .... ............ @s_rri_rot
+MVN_rxi .... 001 1111 . 0000 .... ............ @s_rxi_rot
+
+# Multiply and multiply accumulate
+
+@s_rdamn ---- .... ... s:1 rd:4 ra:4 rm:4 .... rn:4 &s_rrrr
+@s_rd0mn ---- .... ... s:1 rd:4 .... rm:4 .... rn:4 &s_rrrr ra=0
+@rdamn ---- .... ... . rd:4 ra:4 rm:4 .... rn:4 &rrrr
+@rd0mn ---- .... ... . rd:4 .... rm:4 .... rn:4 &rrrr ra=0
+
+MUL .... 0000 000 . .... 0000 .... 1001 .... @s_rd0mn
+MLA .... 0000 001 . .... .... .... 1001 .... @s_rdamn
+UMAAL .... 0000 010 0 .... .... .... 1001 .... @rdamn
+MLS .... 0000 011 0 .... .... .... 1001 .... @rdamn
+UMULL .... 0000 100 . .... .... .... 1001 .... @s_rdamn
+UMLAL .... 0000 101 . .... .... .... 1001 .... @s_rdamn
+SMULL .... 0000 110 . .... .... .... 1001 .... @s_rdamn
+SMLAL .... 0000 111 . .... .... .... 1001 .... @s_rdamn
+
+# Saturating addition and subtraction
+
+@rndm ---- .... .... rn:4 rd:4 .... .... rm:4 &rrr
+
+QADD .... 0001 0000 .... .... 0000 0101 .... @rndm
+QSUB .... 0001 0010 .... .... 0000 0101 .... @rndm
+QDADD .... 0001 0100 .... .... 0000 0101 .... @rndm
+QDSUB .... 0001 0110 .... .... 0000 0101 .... @rndm
+
+# Halfword multiply and multiply accumulate
+
+SMLABB .... 0001 0000 .... .... .... 1000 .... @rdamn
+SMLABT .... 0001 0000 .... .... .... 1100 .... @rdamn
+SMLATB .... 0001 0000 .... .... .... 1010 .... @rdamn
+SMLATT .... 0001 0000 .... .... .... 1110 .... @rdamn
+SMLAWB .... 0001 0010 .... .... .... 1000 .... @rdamn
+SMULWB .... 0001 0010 .... 0000 .... 1010 .... @rd0mn
+SMLAWT .... 0001 0010 .... .... .... 1100 .... @rdamn
+SMULWT .... 0001 0010 .... 0000 .... 1110 .... @rd0mn
+SMLALBB .... 0001 0100 .... .... .... 1000 .... @rdamn
+SMLALBT .... 0001 0100 .... .... .... 1100 .... @rdamn
+SMLALTB .... 0001 0100 .... .... .... 1010 .... @rdamn
+SMLALTT .... 0001 0100 .... .... .... 1110 .... @rdamn
+SMULBB .... 0001 0110 .... 0000 .... 1000 .... @rd0mn
+SMULBT .... 0001 0110 .... 0000 .... 1100 .... @rd0mn
+SMULTB .... 0001 0110 .... 0000 .... 1010 .... @rd0mn
+SMULTT .... 0001 0110 .... 0000 .... 1110 .... @rd0mn
+
+# MSR (immediate) and hints
+
+&msr_i r mask rot imm
+@msr_i ---- .... .... mask:4 .... rot:4 imm:8 &msr_i
+
+{
+ {
+ [
+ YIELD ---- 0011 0010 0000 1111 ---- 0000 0001
+ WFE ---- 0011 0010 0000 1111 ---- 0000 0010
+ WFI ---- 0011 0010 0000 1111 ---- 0000 0011
+
+ # TODO: Implement SEV, SEVL; may help SMP performance.
+ # SEV ---- 0011 0010 0000 1111 ---- 0000 0100
+ # SEVL ---- 0011 0010 0000 1111 ---- 0000 0101
+
+ ESB ---- 0011 0010 0000 1111 ---- 0001 0000
+ ]
+
+ # The canonical nop ends in 00000000, but the whole of the
+ # rest of the space executes as nop if otherwise unsupported.
+ NOP ---- 0011 0010 0000 1111 ---- ---- ----
+ }
+ # Note mask = 0 is covered by NOP
+ MSR_imm .... 0011 0010 .... 1111 .... .... .... @msr_i r=0
+}
+MSR_imm .... 0011 0110 .... 1111 .... .... .... @msr_i r=1
+
+# Cyclic Redundancy Check
+
+CRC32B .... 0001 0000 .... .... 0000 0100 .... @rndm
+CRC32H .... 0001 0010 .... .... 0000 0100 .... @rndm
+CRC32W .... 0001 0100 .... .... 0000 0100 .... @rndm
+CRC32CB .... 0001 0000 .... .... 0010 0100 .... @rndm
+CRC32CH .... 0001 0010 .... .... 0010 0100 .... @rndm
+CRC32CW .... 0001 0100 .... .... 0010 0100 .... @rndm
+
+# Miscellaneous instructions
+
+%sysm 8:1 16:4
+%imm16_8_0 8:12 0:4
+
+@rm ---- .... .... .... .... .... .... rm:4 &r
+@rdm ---- .... .... .... rd:4 .... .... rm:4 &rr
+@i16 ---- .... .... .... .... .... .... .... &i imm=%imm16_8_0
+
+MRS_bank ---- 0001 0 r:1 00 .... rd:4 001. 0000 0000 &mrs_bank %sysm
+MSR_bank ---- 0001 0 r:1 10 .... 1111 001. 0000 rn:4 &msr_bank %sysm
+
+MRS_reg ---- 0001 0 r:1 00 1111 rd:4 0000 0000 0000 &mrs_reg
+MSR_reg ---- 0001 0 r:1 10 mask:4 1111 0000 0000 rn:4 &msr_reg
+
+BX .... 0001 0010 1111 1111 1111 0001 .... @rm
+BXJ .... 0001 0010 1111 1111 1111 0010 .... @rm
+BLX_r .... 0001 0010 1111 1111 1111 0011 .... @rm
+
+CLZ .... 0001 0110 1111 .... 1111 0001 .... @rdm
+
+ERET ---- 0001 0110 0000 0000 0000 0110 1110
+
+HLT .... 0001 0000 .... .... .... 0111 .... @i16
+BKPT .... 0001 0010 .... .... .... 0111 .... @i16
+HVC .... 0001 0100 .... .... .... 0111 .... @i16
+SMC ---- 0001 0110 0000 0000 0000 0111 imm:4 &i
+
+# Load/Store Dual, Half, Signed Byte (register)
+
+@ldst_rr_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... rm:4 \
+ &ldst_rr p=1 shimm=0 shtype=0
+@ldst_rr_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... rm:4 \
+ &ldst_rr p=0 w=0 shimm=0 shtype=0
+
+STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_pw0
+STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p1w
+
+LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_pw0
+LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_p1w
+
+STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_pw0
+STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_p1w
+
+LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_pw0
+LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p1w
+
+LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_pw0
+LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p1w
+
+LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_pw0
+LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p1w
+
+# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding,
+# and act as normal post-indexed (P=0, W=0).
+@ldst_rr_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... rm:4 \
+ &ldst_rr p=0 w=0 shimm=0 shtype=0
+
+STRHT_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p0w1
+LDRHT_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p0w1
+LDRSBT_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p0w1
+LDRSHT_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p0w1
+
+# Load/Store word and unsigned byte (register)
+
+@ldst_rs_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \
+ &ldst_rr p=1
+@ldst_rs_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \
+ &ldst_rr p=0 w=0
+
+STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_pw0
+STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p1w
+STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_pw0
+STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p1w
+
+LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_pw0
+LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p1w
+LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_pw0
+LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p1w
+
+@ldst_rs_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \
+ &ldst_rr p=0 w=0
+
+STRT_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p0w1
+STRBT_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p0w1
+LDRT_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p0w1
+LDRBT_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p0w1
+
+# Load/Store Dual, Half, Signed Byte (immediate)
+
+%imm8s_8_0 8:4 0:4
+@ldst_ri8_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... .... \
+ &ldst_ri imm=%imm8s_8_0 p=1
+@ldst_ri8_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... .... \
+ &ldst_ri imm=%imm8s_8_0 p=0 w=0
+
+STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_pw0
+STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p1w
+
+LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_pw0
+LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_p1w
+
+STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_pw0
+STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_p1w
+
+LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_pw0
+LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p1w
+
+LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_pw0
+LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p1w
+
+LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_pw0
+LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p1w
+
+# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding,
+# and act as normal post-indexed (P=0, W=0).
+@ldst_ri8_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... .... \
+ &ldst_ri imm=%imm8s_8_0 p=0 w=0
+
+STRHT_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p0w1
+LDRHT_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p0w1
+LDRSBT_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p0w1
+LDRSHT_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p0w1
+
+# Load/Store word and unsigned byte (immediate)
+
+@ldst_ri12_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 imm:12 &ldst_ri p=1
+@ldst_ri12_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0
+
+STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p1w
+STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_pw0
+STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p1w
+STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_pw0
+
+LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p1w
+LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_pw0
+LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p1w
+LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_pw0
+
+@ldst_ri12_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0
+
+STRT_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p0w1
+STRBT_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p0w1
+LDRT_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p0w1
+LDRBT_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p0w1
+
+# Synchronization primitives
+
+@swp ---- .... .... rn:4 rt:4 .... .... rt2:4
+
+SWP .... 0001 0000 .... .... 0000 1001 .... @swp
+SWPB .... 0001 0100 .... .... 0000 1001 .... @swp
+
+# Load/Store Exclusive and Load-Acquire/Store-Release
+#
+# Note rt2 for STREXD/LDREXD is set by the helper after checking rt is even.
+
+@strex ---- .... .... rn:4 rd:4 .... .... rt:4 \
+ &strex imm=0 rt2=15
+@ldrex ---- .... .... rn:4 rt:4 .... .... .... \
+ &ldrex imm=0 rt2=15
+@stl ---- .... .... rn:4 .... .... .... rt:4 \
+ &ldrex imm=0 rt2=15
+
+STREX .... 0001 1000 .... .... 1111 1001 .... @strex
+STREXD_a32 .... 0001 1010 .... .... 1111 1001 .... @strex
+STREXB .... 0001 1100 .... .... 1111 1001 .... @strex
+STREXH .... 0001 1110 .... .... 1111 1001 .... @strex
+
+STLEX .... 0001 1000 .... .... 1110 1001 .... @strex
+STLEXD_a32 .... 0001 1010 .... .... 1110 1001 .... @strex
+STLEXB .... 0001 1100 .... .... 1110 1001 .... @strex
+STLEXH .... 0001 1110 .... .... 1110 1001 .... @strex
+
+STL .... 0001 1000 .... 1111 1100 1001 .... @stl
+STLB .... 0001 1100 .... 1111 1100 1001 .... @stl
+STLH .... 0001 1110 .... 1111 1100 1001 .... @stl
+
+LDREX .... 0001 1001 .... .... 1111 1001 1111 @ldrex
+LDREXD_a32 .... 0001 1011 .... .... 1111 1001 1111 @ldrex
+LDREXB .... 0001 1101 .... .... 1111 1001 1111 @ldrex
+LDREXH .... 0001 1111 .... .... 1111 1001 1111 @ldrex
+
+LDAEX .... 0001 1001 .... .... 1110 1001 1111 @ldrex
+LDAEXD_a32 .... 0001 1011 .... .... 1110 1001 1111 @ldrex
+LDAEXB .... 0001 1101 .... .... 1110 1001 1111 @ldrex
+LDAEXH .... 0001 1111 .... .... 1110 1001 1111 @ldrex
+
+LDA .... 0001 1001 .... .... 1100 1001 1111 @ldrex
+LDAB .... 0001 1101 .... .... 1100 1001 1111 @ldrex
+LDAH .... 0001 1111 .... .... 1100 1001 1111 @ldrex
+
+# Media instructions
+
+# usad8 is usada8 w/ ra=15
+USADA8 ---- 0111 1000 rd:4 ra:4 rm:4 0001 rn:4
+
+# ubfx and sbfx
+@bfx ---- .... ... widthm1:5 rd:4 lsb:5 ... rn:4 &bfx
+
+SBFX .... 0111 101 ..... .... ..... 101 .... @bfx
+UBFX .... 0111 111 ..... .... ..... 101 .... @bfx
+
+# bfc is bfi w/ rn=15
+BFCI ---- 0111 110 msb:5 rd:4 lsb:5 001 rn:4 &bfi
+
+# While we could get UDEF by not including this, add the pattern for
+# documentation and to conflict with any other typos in this file.
+UDF 1110 0111 1111 ---- ---- ---- 1111 ----
+
+# Parallel addition and subtraction
+
+SADD16 .... 0110 0001 .... .... 1111 0001 .... @rndm
+SASX .... 0110 0001 .... .... 1111 0011 .... @rndm
+SSAX .... 0110 0001 .... .... 1111 0101 .... @rndm
+SSUB16 .... 0110 0001 .... .... 1111 0111 .... @rndm
+SADD8 .... 0110 0001 .... .... 1111 1001 .... @rndm
+SSUB8 .... 0110 0001 .... .... 1111 1111 .... @rndm
+
+QADD16 .... 0110 0010 .... .... 1111 0001 .... @rndm
+QASX .... 0110 0010 .... .... 1111 0011 .... @rndm
+QSAX .... 0110 0010 .... .... 1111 0101 .... @rndm
+QSUB16 .... 0110 0010 .... .... 1111 0111 .... @rndm
+QADD8 .... 0110 0010 .... .... 1111 1001 .... @rndm
+QSUB8 .... 0110 0010 .... .... 1111 1111 .... @rndm
+
+SHADD16 .... 0110 0011 .... .... 1111 0001 .... @rndm
+SHASX .... 0110 0011 .... .... 1111 0011 .... @rndm
+SHSAX .... 0110 0011 .... .... 1111 0101 .... @rndm
+SHSUB16 .... 0110 0011 .... .... 1111 0111 .... @rndm
+SHADD8 .... 0110 0011 .... .... 1111 1001 .... @rndm
+SHSUB8 .... 0110 0011 .... .... 1111 1111 .... @rndm
+
+UADD16 .... 0110 0101 .... .... 1111 0001 .... @rndm
+UASX .... 0110 0101 .... .... 1111 0011 .... @rndm
+USAX .... 0110 0101 .... .... 1111 0101 .... @rndm
+USUB16 .... 0110 0101 .... .... 1111 0111 .... @rndm
+UADD8 .... 0110 0101 .... .... 1111 1001 .... @rndm
+USUB8 .... 0110 0101 .... .... 1111 1111 .... @rndm
+
+UQADD16 .... 0110 0110 .... .... 1111 0001 .... @rndm
+UQASX .... 0110 0110 .... .... 1111 0011 .... @rndm
+UQSAX .... 0110 0110 .... .... 1111 0101 .... @rndm
+UQSUB16 .... 0110 0110 .... .... 1111 0111 .... @rndm
+UQADD8 .... 0110 0110 .... .... 1111 1001 .... @rndm
+UQSUB8 .... 0110 0110 .... .... 1111 1111 .... @rndm
+
+UHADD16 .... 0110 0111 .... .... 1111 0001 .... @rndm
+UHASX .... 0110 0111 .... .... 1111 0011 .... @rndm
+UHSAX .... 0110 0111 .... .... 1111 0101 .... @rndm
+UHSUB16 .... 0110 0111 .... .... 1111 0111 .... @rndm
+UHADD8 .... 0110 0111 .... .... 1111 1001 .... @rndm
+UHSUB8 .... 0110 0111 .... .... 1111 1111 .... @rndm
+
+# Packing, unpacking, saturation, and reversal
+
+PKH ---- 0110 1000 rn:4 rd:4 imm:5 tb:1 01 rm:4 &pkh
+
+@sat ---- .... ... satimm:5 rd:4 imm:5 sh:1 .. rn:4 &sat
+@sat16 ---- .... .... satimm:4 rd:4 .... .... rn:4 \
+ &sat imm=0 sh=0
+
+SSAT .... 0110 101. .... .... .... ..01 .... @sat
+USAT .... 0110 111. .... .... .... ..01 .... @sat
+
+SSAT16 .... 0110 1010 .... .... 1111 0011 .... @sat16
+USAT16 .... 0110 1110 .... .... 1111 0011 .... @sat16
+
+@rrr_rot ---- .... .... rn:4 rd:4 rot:2 ...... rm:4 &rrr_rot
+
+SXTAB16 .... 0110 1000 .... .... ..00 0111 .... @rrr_rot
+SXTAB .... 0110 1010 .... .... ..00 0111 .... @rrr_rot
+SXTAH .... 0110 1011 .... .... ..00 0111 .... @rrr_rot
+UXTAB16 .... 0110 1100 .... .... ..00 0111 .... @rrr_rot
+UXTAB .... 0110 1110 .... .... ..00 0111 .... @rrr_rot
+UXTAH .... 0110 1111 .... .... ..00 0111 .... @rrr_rot
+
+SEL .... 0110 1000 .... .... 1111 1011 .... @rndm
+REV .... 0110 1011 1111 .... 1111 0011 .... @rdm
+REV16 .... 0110 1011 1111 .... 1111 1011 .... @rdm
+REVSH .... 0110 1111 1111 .... 1111 1011 .... @rdm
+RBIT .... 0110 1111 1111 .... 1111 0011 .... @rdm
+
+# Signed multiply, signed and unsigned divide
+
+@rdmn ---- .... .... rd:4 .... rm:4 .... rn:4 &rrr
+
+SMLAD .... 0111 0000 .... .... .... 0001 .... @rdamn
+SMLADX .... 0111 0000 .... .... .... 0011 .... @rdamn
+SMLSD .... 0111 0000 .... .... .... 0101 .... @rdamn
+SMLSDX .... 0111 0000 .... .... .... 0111 .... @rdamn
+
+SDIV .... 0111 0001 .... 1111 .... 0001 .... @rdmn
+UDIV .... 0111 0011 .... 1111 .... 0001 .... @rdmn
+
+SMLALD .... 0111 0100 .... .... .... 0001 .... @rdamn
+SMLALDX .... 0111 0100 .... .... .... 0011 .... @rdamn
+SMLSLD .... 0111 0100 .... .... .... 0101 .... @rdamn
+SMLSLDX .... 0111 0100 .... .... .... 0111 .... @rdamn
+
+SMMLA .... 0111 0101 .... .... .... 0001 .... @rdamn
+SMMLAR .... 0111 0101 .... .... .... 0011 .... @rdamn
+SMMLS .... 0111 0101 .... .... .... 1101 .... @rdamn
+SMMLSR .... 0111 0101 .... .... .... 1111 .... @rdamn
+
+# Block data transfer
+
+STM ---- 100 b:1 i:1 u:1 w:1 0 rn:4 list:16 &ldst_block
+LDM_a32 ---- 100 b:1 i:1 u:1 w:1 1 rn:4 list:16 &ldst_block
+
+# Branch, branch with link
+
+%imm26 0:s24 !function=times_4
+@branch ---- .... ........................ &i imm=%imm26
+
+B .... 1010 ........................ @branch
+BL .... 1011 ........................ @branch
+
+# Coprocessor instructions
+
+# We decode MCR, MRC, MRRC and MCRR only, because for QEMU the
+# other coprocessor instructions always UNDEF.
+# The trans_ functions for these will ignore cp values 8..13 for v7 or
+# earlier, and 0..13 for v8 and later, because those areas of the
+# encoding space may be used for other things, such as VFP or Neon.
+
+@mcr ---- .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4 &mcr
+@mcrr ---- .... .... rt2:4 rt:4 cp:4 opc1:4 crm:4 &mcrr
+
+MCRR .... 1100 0100 .... .... .... .... .... @mcrr
+MRRC .... 1100 0101 .... .... .... .... .... @mcrr
+
+MCR .... 1110 ... 0 .... .... .... ... 1 .... @mcr
+MRC .... 1110 ... 1 .... .... .... ... 1 .... @mcr
+
+# Supervisor call
+
+SVC ---- 1111 imm:24 &i
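
As the header comments of the two A32 files describe, the conditional and
unconditional halves of the encoding space are handled by separate generated
decoders. A hypothetical dispatch wrapper (a sketch, not the literal
translate.c code) showing how that split falls out:

static void dispatch_a32(DisasContext *s, uint32_t insn)
{
    if (extract32(insn, 28, 4) == 0xf) {
        /* 0xf in insn[31:28]: patterns from a32-uncond.decode */
        if (disas_a32_uncond(s, insn)) {
            return;
        }
    } else {
        /* COND field present: patterns from a32.decode */
        if (disas_a32(s, insn)) {
            return;
        }
    }
    unallocated_encoding(s);
}
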
diff --git a/target/arm/tcg/m-nocp.decode b/target/arm/tcg/m-nocp.decode
new file mode 100644
index 0000000..b65c801
--- /dev/null
+++ b/target/arm/tcg/m-nocp.decode
@@ -0,0 +1,72 @@
+# M-profile UserFault.NOCP exception handling
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# For M-profile, the architecture specifies that NOCP UsageFaults
+# should take precedence over UNDEF faults over the whole wide
+# range of coprocessor-space encodings, with the exception of
+# VLLDM and VLSTM. (Compare v8.1M IsCPInstruction() pseudocode and
+# v8M Arm ARM rule R_QLGM.) This isn't mandatory for v8.0M but we choose
+# to behave the same as v8.1M.
+# This decode is handled before any others (and in particular before
+# decoding FP instructions which are in the coprocessor space).
+# If the coprocessor is not present or disabled then we will generate
+# the NOCP exception; otherwise we let the insn through to the main decode.
+
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
+
+&nocp cp
+
+# M-profile VLDR/VSTR to sysreg
+%vldr_sysreg 22:1 13:3
+%imm7_0x4 0:7 !function=times_4
+
+&vldr_sysreg rn reg imm a w p
+@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \
+ reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg
+
+{
+ # Special cases which do not take an early NOCP: VLLDM and VLSTM
+ VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000
+ # VSCCLRM (new in v8.1M) is similar:
+ VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3
+ VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2
+
+ # FP system register accesses: these are a special case because accesses
+ # to FPCXT_NS succeed even if the FPU is disabled. We therefore need
+ # to handle them before the big NOCP blocks. Note that within these
+ # insns NOCP still has higher priority than UNDEFs; this is implemented
+ # by their returning 'false' for UNDEF so as to fall through into the
+ # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding()
+ # for the UNDEFs there that must take precedence over NOCP.)
+
+ VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
+
+ # P=0 W=0 is SEE "Related encodings", so split into two patterns
+ VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1
+ VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+ VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1
+ VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+
+ NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp
+ NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp
+ # From v8.1M onwards this range will also NOCP:
+ NOCP_8_1 111- 1111 ---- ---- ---- ---- ---- ---- &nocp cp=10
+}
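
A rough sketch of the ordering the header comment describes, with the NOCP
decode running before the FP/coprocessor decoders on M-profile (a hypothetical
wrapper; the real call sites live in translate.c and translate-vfp.c):

static void decode_m_coproc(DisasContext *s, uint32_t insn)
{
    /* VLLDM/VLSTM, FP sysreg accesses and the NOCP catch-alls first... */
    if (disas_m_nocp(s, insn)) {
        return;
    }
    /* ...and only then the ordinary FP decode. */
    if (disas_vfp(s, insn) || disas_vfp_uncond(s, insn)) {
        return;
    }
    unallocated_encoding(s);
}
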
diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build
new file mode 100644
index 0000000..044561b
--- /dev/null
+++ b/target/arm/tcg/meson.build
@@ -0,0 +1,32 @@
+gen = [
+ decodetree.process('sve.decode', extra_args: '--decode=disas_sve'),
+ decodetree.process('sme.decode', extra_args: '--decode=disas_sme'),
+ decodetree.process('sme-fa64.decode', extra_args: '--static-decode=disas_sme_fa64'),
+ decodetree.process('neon-shared.decode', extra_args: '--decode=disas_neon_shared'),
+ decodetree.process('neon-dp.decode', extra_args: '--decode=disas_neon_dp'),
+ decodetree.process('neon-ls.decode', extra_args: '--decode=disas_neon_ls'),
+ decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'),
+ decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'),
+ decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'),
+ decodetree.process('mve.decode', extra_args: '--decode=disas_mve'),
+ decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'),
+ decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'),
+ decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'),
+ decodetree.process('t16.decode', extra_args: ['-w', '16', '--static-decode=disas_t16']),
+]
+
+arm_ss.add(gen)
+
+arm_ss.add(files(
+ 'translate.c',
+ 'translate-m-nocp.c',
+ 'translate-mve.c',
+ 'translate-neon.c',
+ 'translate-vfp.c',
+))
+
+arm_ss.add(when: 'TARGET_AARCH64', if_true: files(
+ 'translate-a64.c',
+ 'translate-sve.c',
+ 'translate-sme.c',
+))
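
The two decodetree modes used above differ only in linkage: --static-decode
emits a static function that is #included into the single translation unit
that uses it, while --decode emits an externally callable one. A sketch of how
this is consumed, assuming generated file names of the form decode-<base>.c.inc:

/* In translate.c: the static A32/T16/T32 decoders are pulled in directly. */
#include "decode-a32.c.inc"
#include "decode-a32-uncond.c.inc"
#include "decode-t32.c.inc"
#include "decode-t16.c.inc"

/* By contrast, translate-vfp.c includes decode-vfp.c.inc and translate.c
 * just declares and calls the non-static entry point it provides: */
bool disas_vfp(DisasContext *s, uint32_t insn);
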
diff --git a/target/arm/tcg/mve.decode b/target/arm/tcg/mve.decode
new file mode 100644
index 0000000..14a4f39
--- /dev/null
+++ b/target/arm/tcg/mve.decode
@@ -0,0 +1,832 @@
+# M-profile MVE instruction descriptions
+#
+# Copyright (c) 2021 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+%qd 22:1 13:3
+%qm 5:1 1:3
+%qn 7:1 17:3
+
+# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit
+%size_28 28:1 !function=plus_1
+
+# 2 operand fp insns have size in bit 20: 1 for 16 bit, 0 for 32 bit,
+# like Neon FP insns.
+%2op_fp_size 20:1 !function=neon_3same_fp_size
+# VCADD is an exception, where bit 20 is 0 for 16 bit and 1 for 32 bit
+%2op_fp_size_rev 20:1 !function=plus_1
+# FP scalars have size in bit 28, 1 for 16 bit, 0 for 32 bit
+%2op_fp_scalar_size 28:1 !function=neon_3same_fp_size
+
+# 1imm format immediate
+%imm_28_16_0 28:1 16:3 0:4
+
+&vldr_vstr rn qd imm p a w size l u
+&1op qd qm size
+&2op qd qm qn size
+&2scalar qd qn rm size
+&1imm qd imm cmode op
+&2shift qd qm shift size
+&vidup qd rn size imm
+&viwdup qd rn rm size imm
+&vcmp qm qn size mask
+&vcmp_scalar qn rm size mask
+&shl_scalar qda rm size
+&vmaxv qm rda size
+&vabav qn qm rda size
+&vldst_sg qd qm rn size msize os
+&vldst_sg_imm qd qm a w imm
+&vldst_il qd rn size pat w
+
+# scatter-gather memory size is in bits 6:4
+%sg_msize 6:1 4:1
+
+@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
+# Note that both Rn and Qd are 3 bits only (no D bit)
+@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr
+
+@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \
+ qd=%qd qm=%qm msize=%sg_msize
+
+# Qm is in the fields usually labeled Qn
+@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \
+ qd=%qd qm=%qn
+
+# Deinterleaving load/interleaving store
+@vldst_il .... .... .. w:1 . rn:4 .... ... size:2 pat:2 ..... &vldst_il \
+ qd=%qd
+
+@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm
+@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0
+@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn
+@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0
+@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \
+ size=%size_28
+@1imm .... .... .... .... .... cmode:4 .. op:1 . .... &1imm qd=%qd imm=%imm_28_16_0
+
+# The _rev suffix indicates that Vn and Vm are reversed. This is
+# the case for shifts. In the Arm ARM these insns are documented
+# with the Vm and Vn fields in their usual places, but in the
+# assembly the operands are listed "backwards", ie in the order
+# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose
+# to consider Vm and Vn as being in different fields in the insn.
+# This gives us consistency with A64 and Neon.
+@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm
+
+@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
+@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
+
+@2_shl_b .... .... .. 001 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0
+@2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
+@2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2
+
+@2_shll_b .... .... ... 01 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0
+@2_shll_h .... .... ... 1 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
+# VSHLL encoding T2 where shift == esize
+@2_shll_esize_b .... .... .... 00 .. .... .... .... .... &2shift \
+ qd=%qd qm=%qm size=0 shift=8
+@2_shll_esize_h .... .... .... 01 .. .... .... .... .... &2shift \
+ qd=%qd qm=%qm size=1 shift=16
+
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%rshift_i5 16:5 !function=rsub_32
+%rshift_i4 16:4 !function=rsub_16
+%rshift_i3 16:3 !function=rsub_8
+
+@2_shr_b .... .... .. 001 ... .... .... .... .... &2shift qd=%qd qm=%qm \
+ size=0 shift=%rshift_i3
+@2_shr_h .... .... .. 01 .... .... .... .... .... &2shift qd=%qd qm=%qm \
+ size=1 shift=%rshift_i4
+@2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \
+ size=2 shift=%rshift_i5
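
For reference, a minimal sketch of the rsub_* extractors named above, matching
the helpers assumed to live in target/arm/translate.h: they recover the shift
amount from the N - shift encoding.

static inline int rsub_32(DisasContext *s, int x) { return 32 - x; }
static inline int rsub_16(DisasContext *s, int x) { return 16 - x; }
static inline int rsub_8(DisasContext *s, int x)  { return 8 - x; }
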
+
+@shl_scalar .... .... .... size:2 .. .... .... .... rm:4 &shl_scalar qda=%qd
+
+# Vector comparison; 4-bit Qm but 3-bit Qn
+%mask_22_13 22:1 13:3
+@vcmp .... .... .. size:2 qn:3 . .... .... .... .... &vcmp qm=%qm mask=%mask_22_13
+@vcmp_scalar .... .... .. size:2 qn:3 . .... .... .... rm:4 &vcmp_scalar \
+ mask=%mask_22_13
+
+@vcmp_fp .... .... .... qn:3 . .... .... .... .... &vcmp \
+ qm=%qm size=%2op_fp_scalar_size mask=%mask_22_13
+
+# Bit 28 is a 2op_fp_scalar_size bit, but we do not decode it in this
+# format to avoid complicated overlapping-instruction-groups
+@vcmp_fp_scalar .... .... .... qn:3 . .... .... .... rm:4 &vcmp_scalar \
+ mask=%mask_22_13
+
+@vmaxv .... .... .... size:2 .. rda:4 .... .... .... &vmaxv qm=%qm
+
+@2op_fp .... .... .... .... .... .... .... .... &2op \
+ qd=%qd qn=%qn qm=%qm size=%2op_fp_size
+
+@2op_fp_size_rev .... .... .... .... .... .... .... .... &2op \
+ qd=%qd qn=%qn qm=%qm size=%2op_fp_size_rev
+
+# 2-operand, but Qd and Qn share a field. Size is in bit 28, but we
+# don't decode it in this format
+@vmaxnma .... .... .... .... .... .... .... .... &2op \
+ qd=%qd qn=%qd qm=%qm
+
+# Here also we don't decode the bit 28 size in the format to avoid
+# awkward nested overlap groups
+@vmaxnmv .... .... .... .... rda:4 .... .... .... &vmaxv qm=%qm
+
+@2op_fp_scalar .... .... .... .... .... .... .... rm:4 &2scalar \
+ qd=%qd qn=%qn size=%2op_fp_scalar_size
+
+# Vector loads and stores
+
+# Widening loads and narrowing stores:
+# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding'
+# This means we need to expand out to multiple patterns for P, W, SZ.
+# For stores the U bit must be 0 but we catch that in the trans_ function.
+# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from
+# signed halfword element in register", etc.
+VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \
+ p=0 w=1 size=1
+VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \
+ p=1 size=1
+VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \
+ p=0 w=1 size=2
+VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \
+ p=1 size=2
+VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \
+ p=0 w=1 size=2
+VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \
+ p=1 size=2
+
+# Non-widening loads/stores (P=0 W=0 is 'related encoding')
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \
+ size=0 p=0 w=1
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \
+ size=1 p=0 w=1
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \
+ size=2 p=0 w=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \
+ size=0 p=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \
+ size=1 p=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \
+ size=2 p=1
+
+# gather loads/scatter stores
+VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
+VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
+VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg
+
+VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm
+VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm
+VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm
+VSTRD_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1111 .... .... @vldst_sg_imm
+
+# deinterleaving loads/interleaving stores
+VLD2 1111 1100 1 .. 1 .... ... 1 111 .. .. 00000 @vldst_il
+VLD4 1111 1100 1 .. 1 .... ... 1 111 .. .. 00001 @vldst_il
+VST2 1111 1100 1 .. 0 .... ... 1 111 .. .. 00000 @vldst_il
+VST4 1111 1100 1 .. 0 .... ... 1 111 .. .. 00001 @vldst_il
+
+# Moves between 2 32-bit vector lanes and 2 general purpose registers
+VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd
+VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd
+
+# Vector 2-op
+VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+
+VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
+VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
+VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
+
+# The VSHLL T2 encoding is not a @2op pattern, but is here because it
+# overlaps what would be size=0b11 VMULH/VRMULH
+{
+ VCVTB_SH 111 0 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz
+
+ VMAXNMA 111 0 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=2
+
+ VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
+ VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+ VQMOVUNB 111 0 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op
+ VQMOVN_BS 111 0 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op
+
+ VMAXA 111 0 1110 0 . 11 .. 11 ... 0 1110 1 0 . 0 ... 1 @1op
+
+ VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+ VCVTB_HS 111 1 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz
+
+ VMAXNMA 111 1 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=1
+
+ VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
+ VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+ VMOVNB 111 1 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op
+ VQMOVN_BU 111 1 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op
+
+ VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+ VCVTT_SH 111 0 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz
+
+ VMINNMA 111 0 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=2
+ VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
+ VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+ VQMOVUNT 111 0 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op
+ VQMOVN_TS 111 0 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op
+
+ VMINA 111 0 1110 0 . 11 .. 11 ... 1 1110 1 0 . 0 ... 1 @1op
+
+ VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+ VCVTT_HS 111 1 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz
+
+ VMINNMA 111 1 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=1
+ VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
+ VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+ VMOVNT 111 1 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op
+ VQMOVN_TU 111 1 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op
+
+ VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+}
+
+VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
+VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
+VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op
+VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op
+
+VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op
+VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op
+
+VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op
+VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op
+VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
+VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
+
+{
+ VMULLP_B 111 . 1110 0 . 11 ... 1 ... 0 1110 . 0 . 0 ... 0 @2op_sz28
+ VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+ VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+}
+{
+ VMULLP_T 111 . 1110 0 . 11 ... 1 ... 1 1110 . 0 . 0 ... 0 @2op_sz28
+ VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+ VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+}
+
+VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
+VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
+
+VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op
+VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op
+VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op
+VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op
+
+VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev
+VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev
+
+VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev
+VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev
+
+VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev
+VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev
+
+VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
+VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
+
+{
+ VCMUL0 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 0 @2op_sz28
+ VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+ VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+}
+
+{
+ VCMUL180 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 0 @2op_sz28
+ VQDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+ VQDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+}
+
+{
+ VCMUL90 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 1 @2op_sz28
+ VQRDMLADH 111 0 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+ VQRDMLSDH 111 1 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+ VCMUL270 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 1 @2op_sz28
+ VQRDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+ VQRDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+}
+
+VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28
+VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28
+
+VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+
+{
+ VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz
+ VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz
+ VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op
+ VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op
+}
+
+{
+ VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz
+ VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz
+ VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op
+ VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op
+}
+
+# Vector miscellaneous
+
+VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op
+VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op
+
+VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op
+VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op
+VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op
+
+VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz
+
+VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op
+VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op
+VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op
+VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op
+
+VQABS 1111 1111 1 . 11 .. 00 ... 0 0111 01 . 0 ... 0 @1op
+VQNEG 1111 1111 1 . 11 .. 00 ... 0 0111 11 . 0 ... 0 @1op
+
+&vdup qd rt size
+# Qd is in the fields usually named Qn
+@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup
+
+# B and E bits encode size, which we decode here to the usual size values
+VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0
+VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1
+VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2
+
+# Incrementing and decrementing dup
+
+# VIDUP, VDDUP format immediate: 1 << (immh:imml)
+%imm_vidup 7:1 0:1 !function=vidup_imm
+
+# VIDUP, VDDUP registers: Rm bits [3:1] from insn, bit 0 is 1;
+# Rn bits [3:1] from insn, bit 0 is 0
+%vidup_rm 1:3 !function=times_2_plus_1
+%vidup_rn 17:3 !function=times_2
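
A sketch of the extractors this relies on (assumed to match the helpers in
target/arm/translate.h): the immediate is a power of two, and the register
encodings carry bits [3:1] in the insn with the low bit implied.

static inline int vidup_imm(DisasContext *s, int x)
{
    return 1 << x;          /* immh:imml -> 1, 2, 4 or 8 */
}

static inline int times_2_plus_1(DisasContext *s, int x)
{
    return x * 2 + 1;       /* Rm: bits [3:1] from the insn, bit 0 forced to 1 */
}
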
+
+@vidup .... .... . . size:2 .... .... .... .... .... \
+ qd=%qd imm=%imm_vidup rn=%vidup_rn &vidup
+@viwdup .... .... . . size:2 .... .... .... .... .... \
+ qd=%qd imm=%imm_vidup rm=%vidup_rm rn=%vidup_rn &viwdup
+{
+ VIDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 111 . @vidup
+ VIWDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 ... . @viwdup
+}
+{
+ VCMPGT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=2
+ VCMPLE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=2
+ VDDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 111 . @vidup
+ VDWDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 ... . @viwdup
+}
+
+# multiply-add long dual accumulate
+# rdahi: bits [3:1] from insn, bit 0 is 1
+# rdalo: bits [3:1] from insn, bit 0 is 0
+%rdahi 20:3 !function=times_2_plus_1
+%rdalo 13:3 !function=times_2
+# size bit is 0 for 16 bit, 1 for 32 bit
+%size_16 16:1 !function=plus_1
+
+&vmlaldav rdahi rdalo size qn qm x a
+&vmladav rda size qn qm x a
+
+@vmlaldav .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \
+ qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav
+@vmlaldav_nosz .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \
+ qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav
+@vmladav .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \
+ qn=%qn rda=%rdalo size=%size_16 &vmladav
+@vmladav_nosz .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \
+ qn=%qn rda=%rdalo size=0 &vmladav
+
+{
+ VMLADAV_S 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav
+ VMLALDAV_S 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav
+}
+{
+ VMLADAV_U 1111 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav
+ VMLALDAV_U 1111 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav
+}
+
+{
+ VMLSDAV 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 1 @vmladav
+ VMLSLDAV 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 1 @vmlaldav
+}
+
+{
+ VMLSDAV 1111 1110 1111 ... 0 ... . 1110 . 0 . 0 ... 1 @vmladav_nosz
+ VRMLSLDAVH 1111 1110 1 ... ... 0 ... . 1110 . 0 . 0 ... 1 @vmlaldav_nosz
+}
+
+VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz
+VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz
+
+{
+ [
+ VMAXNMAV 1110 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2
+ VMINNMAV 1110 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2
+ VMAXNMV 1110 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2
+ VMINNMV 1110 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2
+ ]
+ [
+ VMAXV_S 1110 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv
+ VMINV_S 1110 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv
+ VMAXAV 1110 1110 1110 .. 00 .... 1111 0 0 . 0 ... 0 @vmaxv
+ VMINAV 1110 1110 1110 .. 00 .... 1111 1 0 . 0 ... 0 @vmaxv
+ ]
+ VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz
+ VRMLALDAVH_S 1110 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz
+}
+
+{
+ [
+ VMAXNMAV 1111 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1
+ VMINNMAV 1111 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1
+ VMAXNMV 1111 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1
+ VMINNMV 1111 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1
+ ]
+ [
+ VMAXV_U 1111 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv
+ VMINV_U 1111 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv
+ ]
+ VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz
+ VRMLALDAVH_U 1111 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz
+}
+
+# Scalar operations
+
+{
+ VCMPEQ_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=2
+ VCMPNE_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=2
+ VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar
+}
+
+{
+ VCMPLT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=2
+ VCMPGE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=2
+ VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar
+}
+
+{
+ VSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar
+ VRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar
+ VQSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar
+ VQRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar
+ VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+}
+
+{
+ VSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar
+ VRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar
+ VQSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar
+ VQRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar
+ VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+}
+
+{
+ VADD_fp_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 100 .... @2op_fp_scalar
+ VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
+ VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
+}
+
+{
+ VSUB_fp_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 100 .... @2op_fp_scalar
+ VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
+ VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
+}
+
+{
+ VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar
+ VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar
+ VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \
+ size=%size_28
+}
+
+{
+ VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar
+ VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar
+ VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \
+ size=%size_28
+}
+
+{
+ VMUL_fp_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 110 .... @2op_fp_scalar
+ VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
+ VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
+}
+
+{
+ VFMA_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 100 .... @2op_fp_scalar
+ # The U bit (28) is don't-care because it does not affect the result
+ VMLA 111 - 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar
+}
+
+{
+ VFMAS_scalar 111 . 1110 0 . 11 ... 1 ... 1 1110 . 100 .... @2op_fp_scalar
+ # The U bit (28) is don't-care because it does not affect the result
+ VMLAS 111 - 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
+}
+
+VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar
+VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar
+VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar
+VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar
+
+# Vector add across vector
+{
+ VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+ VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \
+ rdahi=%rdahi rdalo=%rdalo
+}
+
+@vabav .... .... .. size:2 .... rda:4 .... .... .... &vabav qn=%qn qm=%qm
+
+VABAV_S 111 0 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav
+VABAV_U 111 1 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav
+
+# Logical immediate operations (1 reg and modified-immediate)
+
+# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but
+# not in a way we can conveniently represent in decodetree without
+# a lot of repetition:
+# VORR: op=0, (cmode & 1) && cmode < 12
+# VBIC: op=1, (cmode & 1) && cmode < 12
+# VMOV: everything else
+# So we have a single decode line and check the cmode/op in the
+# trans function.
+Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm
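
The cmode/op check deferred to the trans_ function can be written as a small
classifier; a simplified standalone sketch of the rule stated above (not the
actual translate-mve.c code):

typedef enum { IMM_VORR, IMM_VBIC, IMM_VMOV_VMVN } Imm1RKind;

static Imm1RKind classify_1imm(int cmode, int op)
{
    if ((cmode & 1) && cmode < 12) {
        return op ? IMM_VBIC : IMM_VORR;
    }
    return IMM_VMOV_VMVN;   /* everything else */
}
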
+
+# Shifts by immediate
+
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_w
+
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w
+
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w
+
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
+
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
+
+# VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file
+# Note that VMOVL is encoded as "VSHLL with a zero shift count"; we
+# implement it that way rather than special-casing it in the decode.
+VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
+
+# Shift-and-insert
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_b
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_h
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w
+
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w
+
+# Narrowing shifts (which only support b and h sizes)
+VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b
+VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h
+VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b
+VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h
+
+VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b
+VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h
+VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b
+VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h
+
+VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h
+
+VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b
+VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h
+VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
+VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
+
+VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h
+
+VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b
+VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h
+VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
+VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
+
+VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd
+
+# Comparisons. We expand out the conditions which are split across
+# encodings T1, T2, T3 and the fc bits. These include VPT, which is
+# effectively "VCMP then VPST". A plain "VCMP" has a mask field of zero.
+{
+ VCMPEQ_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp_fp
+ VCMPEQ 111 1 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp
+}
+
+{
+ VCMPNE_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp_fp
+ VCMPNE 111 1 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp
+}
+
+{
+ VCMPGE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp_fp
+ VCMPGE 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp
+}
+
+{
+ VCMPLT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp_fp
+ VCMPLT 111 1 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp
+}
+
+{
+ VCMPGT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp_fp
+ VCMPGT 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp
+}
+
+{
+ VCMPLE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp_fp
+ VCMPLE 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp
+}
+
+{
+ VPSEL 1111 1110 0 . 11 ... 1 ... 0 1111 . 0 . 0 ... 1 @2op_nosz
+ VCMPCS 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 1 @vcmp
+ VCMPHI 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 1 @vcmp
+}
+
+{
+ VPNOT 1111 1110 0 0 11 000 1 000 0 1111 0100 1101
+ VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
+ VCMPEQ_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=1
+ VCMPEQ_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0100 .... @vcmp_scalar
+}
+
+{
+ VCMPNE_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=1
+ VCMPNE_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1100 .... @vcmp_scalar
+}
+
+{
+ VCMPGT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=1
+ VCMPGT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0110 .... @vcmp_scalar
+}
+
+{
+ VCMPLE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=1
+ VCMPLE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1110 .... @vcmp_scalar
+}
+
+{
+ VCMPGE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=1
+ VCMPGE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0100 .... @vcmp_scalar
+}
+{
+ VCMPLT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=1
+ VCMPLT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1100 .... @vcmp_scalar
+}
+
+VCMPCS_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 1 0 .... @vcmp_scalar
+VCMPHI_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 1 0 .... @vcmp_scalar
+
+# 2-operand FP
+VADD_fp 1110 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp
+VSUB_fp 1110 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp
+VMUL_fp 1111 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 1 ... 0 @2op_fp
+VABD_fp 1111 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp
+
+VMAXNM 1111 1111 0 . 0 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp
+VMINNM 1111 1111 0 . 1 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp
+
+VCADD90_fp 1111 1100 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCADD270_fp 1111 1101 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+
+VFMA 1110 1111 0 . 0 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp
+VFMS 1110 1111 0 . 1 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp
+
+VCMLA0 1111 110 00 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA90 1111 110 01 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA180 1111 110 10 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA270 1111 110 11 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+
+# floating-point <-> fixed-point conversions. Naming convention:
+# VCVT_<from><to>, S = signed int, U = unsigned int, H = halfprec, F = singleprec
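+# (e.g. VCVT_SH_fixed below converts signed fixed-point to half-precision)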
+@vcvt .... .... .. 1 ..... .... .. 1 . .... .... &2shift \
+ qd=%qd qm=%qm shift=%rshift_i5 size=2
+@vcvt_f16 .... .... .. 11 .... .... .. 0 . .... .... &2shift \
+ qd=%qd qm=%qm shift=%rshift_i4 size=1
+
+VCVT_SH_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16
+VCVT_UH_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16
+
+VCVT_HS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16
+VCVT_HU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16
+
+VCVT_SF_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt
+VCVT_UF_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt
+
+VCVT_FS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt
+VCVT_FU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt
+
+# VCVT between floating point and integer (halfprec and single);
+# VCVT_<from><to>, S = signed int, U = unsigned int, F = float
+VCVT_SF 1111 1111 1 . 11 .. 11 ... 0 011 00 1 . 0 ... 0 @1op
+VCVT_UF 1111 1111 1 . 11 .. 11 ... 0 011 01 1 . 0 ... 0 @1op
+VCVT_FS 1111 1111 1 . 11 .. 11 ... 0 011 10 1 . 0 ... 0 @1op
+VCVT_FU 1111 1111 1 . 11 .. 11 ... 0 011 11 1 . 0 ... 0 @1op
+
+# VCVT from floating point to integer with specified rounding mode
+VCVTAS 1111 1111 1 . 11 .. 11 ... 000 00 0 1 . 0 ... 0 @1op
+VCVTAU 1111 1111 1 . 11 .. 11 ... 000 00 1 1 . 0 ... 0 @1op
+VCVTNS 1111 1111 1 . 11 .. 11 ... 000 01 0 1 . 0 ... 0 @1op
+VCVTNU 1111 1111 1 . 11 .. 11 ... 000 01 1 1 . 0 ... 0 @1op
+VCVTPS 1111 1111 1 . 11 .. 11 ... 000 10 0 1 . 0 ... 0 @1op
+VCVTPU 1111 1111 1 . 11 .. 11 ... 000 10 1 1 . 0 ... 0 @1op
+VCVTMS 1111 1111 1 . 11 .. 11 ... 000 11 0 1 . 0 ... 0 @1op
+VCVTMU 1111 1111 1 . 11 .. 11 ... 000 11 1 1 . 0 ... 0 @1op
+
+VRINTN 1111 1111 1 . 11 .. 10 ... 001 000 1 . 0 ... 0 @1op
+VRINTX 1111 1111 1 . 11 .. 10 ... 001 001 1 . 0 ... 0 @1op
+VRINTA 1111 1111 1 . 11 .. 10 ... 001 010 1 . 0 ... 0 @1op
+VRINTZ 1111 1111 1 . 11 .. 10 ... 001 011 1 . 0 ... 0 @1op
+VRINTM 1111 1111 1 . 11 .. 10 ... 001 101 1 . 0 ... 0 @1op
+VRINTP 1111 1111 1 . 11 .. 10 ... 001 111 1 . 0 ... 0 @1op
diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode
new file mode 100644
index 0000000..fd3a01b
--- /dev/null
+++ b/target/arm/tcg/neon-dp.decode
@@ -0,0 +1,646 @@
+# AArch32 Neon data-processing instruction descriptions
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# VFP/Neon register fields; same as vfp.decode
+%vm_dp 5:1 0:4
+%vn_dp 7:1 16:4
+%vd_dp 22:1 12:4
+
+# Encodings for Neon data processing instructions where the T32 encoding
+# is a simple transformation of the A32 encoding.
+# More specifically, this file covers instructions where the A32 encoding is
+# 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq
+# and the T32 encoding is
+# 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq
+# This file works on the A32 encoding only; calling code for T32 has to
+# transform the insn into the A32 version first.
+
+######################################################################
+# 3-reg-same grouping:
+# 1111 001 U 0 D sz:2 Vn:4 Vd:4 opc:4 N Q M op Vm:4
+######################################################################
+
+&3same vm vn vd q size
+
+@3same .... ... . . . size:2 .... .... .... . q:1 . . .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+@3same_q0 .... ... . . . size:2 .... .... .... . 0 . . .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0
+
+# For FP insns the high bit of 'size' is used as part of opcode decode,
+# and the 'size' bit is 0 for 32-bit float and 1 for 16-bit float.
+# This converts this encoding to the same MO_8/16/32/64 values that the
+# integer neon insns use.
+%3same_fp_size 20:1 !function=neon_3same_fp_size
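+# (i.e. a 'size' bit of 0 becomes MO_32 and a 'size' bit of 1 becomes MO_16)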
+
+@3same_fp .... ... . . . . . .... .... .... . q:1 . . .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%3same_fp_size
+@3same_fp_q0 .... ... . . . . . .... .... .... . 0 . . .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 size=%3same_fp_size
+
+VHADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 0 .... @3same
+VHADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 0 .... @3same
+VQADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 1 .... @3same
+VQADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 1 .... @3same
+
+VRHADD_S_3s 1111 001 0 0 . .. .... .... 0001 . . . 0 .... @3same
+VRHADD_U_3s 1111 001 1 0 . .. .... .... 0001 . . . 0 .... @3same
+
+@3same_logic .... ... . . . .. .... .... .... . q:1 .. .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0
+
+VAND_3s 1111 001 0 0 . 00 .... .... 0001 ... 1 .... @3same_logic
+VBIC_3s 1111 001 0 0 . 01 .... .... 0001 ... 1 .... @3same_logic
+VORR_3s 1111 001 0 0 . 10 .... .... 0001 ... 1 .... @3same_logic
+VORN_3s 1111 001 0 0 . 11 .... .... 0001 ... 1 .... @3same_logic
+VEOR_3s 1111 001 1 0 . 00 .... .... 0001 ... 1 .... @3same_logic
+VBSL_3s 1111 001 1 0 . 01 .... .... 0001 ... 1 .... @3same_logic
+VBIT_3s 1111 001 1 0 . 10 .... .... 0001 ... 1 .... @3same_logic
+VBIF_3s 1111 001 1 0 . 11 .... .... 0001 ... 1 .... @3same_logic
+
+VHSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 0 .... @3same
+VHSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 0 .... @3same
+
+VQSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 1 .... @3same
+VQSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 1 .... @3same
+
+VCGT_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 0 .... @3same
+VCGT_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 0 .... @3same
+VCGE_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 1 .... @3same
+VCGE_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 1 .... @3same
+
+# The _rev suffix indicates that Vn and Vm are reversed. This is
+# the case for shifts. In the Arm ARM these insns are documented
+# with the Vm and Vn fields in their usual places, but in the
+# assembly the operands are listed "backwards", ie in the order
+# Dd, Dm, Dn where other insns use Dd, Dn, Dm. For QEMU we choose
+# to consider Vm and Vn as being in different fields in the insn,
+# which allows us to avoid special-casing shifts in the trans_
+# function code. We would otherwise need to manually swap the operands
+# over to call Neon helper functions that are shared with AArch64,
+# which does not have this odd reversed-operand situation.
+@3same_rev .... ... . . . size:2 .... .... .... . q:1 . . .... \
+ &3same vn=%vm_dp vm=%vn_dp vd=%vd_dp
+
+VSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 0 .... @3same_rev
+VSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 0 .... @3same_rev
+
+# Insns operating on 64-bit elements (size!=0b11 handled elsewhere)
+# The _rev suffix indicates that Vn and Vm are reversed (as explained
+# by the comment for the @3same_rev format).
+@3same_64_rev .... ... . . . 11 .... .... .... . q:1 . . .... \
+ &3same vm=%vn_dp vn=%vm_dp vd=%vd_dp size=3
+
+{
+ VQSHL_S64_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev
+ VQSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_rev
+}
+{
+ VQSHL_U64_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev
+ VQSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_rev
+}
+{
+ VRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev
+ VRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_rev
+}
+{
+ VRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev
+ VRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_rev
+}
+{
+ VQRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev
+ VQRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_rev
+}
+{
+ VQRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev
+ VQRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_rev
+}
+
+VMAX_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 0 .... @3same
+VMAX_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 0 .... @3same
+VMIN_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 1 .... @3same
+VMIN_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 1 .... @3same
+
+VABD_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 0 .... @3same
+VABD_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 0 .... @3same
+
+VABA_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 1 .... @3same
+VABA_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 1 .... @3same
+
+VADD_3s 1111 001 0 0 . .. .... .... 1000 . . . 0 .... @3same
+VSUB_3s 1111 001 1 0 . .. .... .... 1000 . . . 0 .... @3same
+
+VTST_3s 1111 001 0 0 . .. .... .... 1000 . . . 1 .... @3same
+VCEQ_3s 1111 001 1 0 . .. .... .... 1000 . . . 1 .... @3same
+
+VMLA_3s 1111 001 0 0 . .. .... .... 1001 . . . 0 .... @3same
+VMLS_3s 1111 001 1 0 . .. .... .... 1001 . . . 0 .... @3same
+
+VMUL_3s 1111 001 0 0 . .. .... .... 1001 . . . 1 .... @3same
+VMUL_p_3s 1111 001 1 0 . .. .... .... 1001 . . . 1 .... @3same
+
+VPMAX_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 0 .... @3same_q0
+VPMAX_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 0 .... @3same_q0
+
+VPMIN_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 1 .... @3same_q0
+VPMIN_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 1 .... @3same_q0
+
+VQDMULH_3s 1111 001 0 0 . .. .... .... 1011 . . . 0 .... @3same
+VQRDMULH_3s 1111 001 1 0 . .. .... .... 1011 . . . 0 .... @3same
+
+VPADD_3s 1111 001 0 0 . .. .... .... 1011 . . . 1 .... @3same_q0
+
+VQRDMLAH_3s 1111 001 1 0 . .. .... .... 1011 ... 1 .... @3same
+
+@3same_crypto .... .... .... .... .... .... .... .... \
+ &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 q=1
+
+SHA1C_3s 1111 001 0 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1P_3s 1111 001 0 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1M_3s 1111 001 0 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1SU0_3s 1111 001 0 0 . 11 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256H_3s 1111 001 1 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256H2_3s 1111 001 1 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256SU1_3s 1111 001 1 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto
+
+VFMA_fp_3s 1111 001 0 0 . 0 . .... .... 1100 ... 1 .... @3same_fp
+VFMS_fp_3s 1111 001 0 0 . 1 . .... .... 1100 ... 1 .... @3same_fp
+
+VQRDMLSH_3s 1111 001 1 0 . .. .... .... 1100 ... 1 .... @3same
+
+VADD_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp
+VSUB_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
+VPADD_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0
+VABD_fp_3s 1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
+VMLA_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
+VMLS_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp
+VMUL_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
+VCEQ_fp_3s 1111 001 0 0 . 0 . .... .... 1110 ... 0 .... @3same_fp
+VCGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 0 .... @3same_fp
+VACGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 1 .... @3same_fp
+VCGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 0 .... @3same_fp
+VACGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 1 .... @3same_fp
+VMAX_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 0 .... @3same_fp
+VMIN_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 0 .... @3same_fp
+VPMAX_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0
+VPMIN_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0
+VRECPS_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 1 .... @3same_fp
+VRSQRTS_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
+VMAXNM_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 1 .... @3same_fp
+VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
+
+######################################################################
+# 2-reg-and-shift grouping:
+# 1111 001 U 1 D immH:3 immL:3 Vd:4 opc:4 L Q M 1 Vm:4
+######################################################################
+&2reg_shift vm vd q shift size
+
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%neon_rshift_i6 16:6 !function=rsub_64
+%neon_rshift_i5 16:5 !function=rsub_32
+%neon_rshift_i4 16:4 !function=rsub_16
+%neon_rshift_i3 16:3 !function=rsub_8
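+# (e.g. for 32-bit elements rsub_32 gives 32 - x, so an encoded value of 31
+#  is a shift of 1 and an encoded value of 0 is a shift of 32)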
+
+@2reg_shr_d .... ... . . . ...... .... .... 1 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=3 shift=%neon_rshift_i6
+@2reg_shr_s .... ... . . . 1 ..... .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5
+@2reg_shr_h .... ... . . . 01 .... .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4
+@2reg_shr_b .... ... . . . 001 ... .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i3
+
+@2reg_shl_d .... ... . . . shift:6 .... .... 1 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=3
+@2reg_shl_s .... ... . . . 1 shift:5 .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2
+@2reg_shl_h .... ... . . . 01 shift:4 .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1
+@2reg_shl_b .... ... . . . 001 shift:3 .... .... 0 q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=0
+
+# Narrowing right shifts: here the Q bit is part of the opcode decode
+@2reg_shrn_d .... ... . . . 1 ..... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=3 q=0 \
+ shift=%neon_rshift_i5
+@2reg_shrn_s .... ... . . . 01 .... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 \
+ shift=%neon_rshift_i4
+@2reg_shrn_h .... ... . . . 001 ... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \
+ shift=%neon_rshift_i3
+
+# Long left shifts: again Q is part of opcode decode
+@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0
+@2reg_shll_h .... ... . . . 01 shift:4 .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0
+@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0
+
+@2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5
+@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4
+
+VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
+VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
+VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h
+VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b
+
+VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
+VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
+VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h
+VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b
+
+VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d
+VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s
+VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h
+VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b
+
+VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d
+VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s
+VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h
+VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b
+
+VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d
+VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s
+VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h
+VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b
+
+VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d
+VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s
+VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h
+VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b
+
+VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d
+VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s
+VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h
+VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b
+
+VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d
+VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s
+VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h
+VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b
+
+VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_d
+VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_s
+VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_h
+VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_b
+
+VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d
+VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
+VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
+VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
+
+VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d
+VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
+VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
+VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
+
+VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b
+
+VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
+
+VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
+
+VSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d
+VSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s
+VSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h
+
+VRSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d
+VRSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s
+VRSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h
+
+VQSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d
+VQSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s
+VQSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h
+
+VQRSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d
+VQRSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s
+VQRSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h
+
+# VQSHRN with signed input
+VQSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d
+VQSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s
+VQSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h
+
+# VQRSHRN with signed input
+VQRSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d
+VQRSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s
+VQRSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h
+
+# VQSHRN with unsigned input
+VQSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d
+VQSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s
+VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h
+
+# VQRSHRN with unsigned input
+VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d
+VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s
+VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h
+
+VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
+VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
+VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
+
+VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
+VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
+VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
+
+# VCVT fixed<->float conversions
+VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+
+VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
+VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
+VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
+VCVT_FU_2sh 1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
+
+######################################################################
+# 1-reg-and-modified-immediate grouping:
+# 1111 001 i 1 D 000 imm:3 Vd:4 cmode:4 0 Q op 1 Vm:4
+######################################################################
+
+&1reg_imm vd q imm cmode op
+
+%asimd_imm_value 24:1 16:3 0:4
+
+@1reg_imm .... ... . . . ... ... .... .... . q:1 . . .... \
+ &1reg_imm imm=%asimd_imm_value vd=%vd_dp
+
+# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but
+# not in a way we can conveniently represent in decodetree without
+# a lot of repetition:
+# VORR: op=0, (cmode & 1) && cmode < 12
+# VBIC: op=1, (cmode & 1) && cmode < 12
+# VMOV: everything else
+# So we have a single decode line and check the cmode/op in the
+# trans function.
+Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
+
+######################################################################
+# Within the "two registers, or three registers of different lengths"
+# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode
+# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar;
+# or they are a size field for the three-reg-different-lengths and
+# two-reg-and-scalar insn groups (where size cannot be 0b11). This
+# is slightly awkward for decodetree: we handle it with this
+# non-exclusive group which contains within it two exclusive groups:
+# one for the size=0b11 patterns, and one for the size-not-0b11
+# patterns. This allows us to check that none of the insns within
+# each subgroup accidentally overlap each other. Note that all the
+# trans functions for the size-not-0b11 patterns must check and
+# return false for size==3.
+######################################################################
+{
+ [
+ ##################################################################
+ # Miscellaneous size=0b11 insns
+ ##################################################################
+ VEXT 1111 001 0 1 . 11 .... .... imm:4 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+ VTBL 1111 001 1 1 . 11 .... .... 10 len:2 . op:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+ VDUP_scalar 1111 001 1 1 . 11 index:3 1 .... 11 000 q:1 . 0 .... \
+ vm=%vm_dp vd=%vd_dp size=0
+ VDUP_scalar 1111 001 1 1 . 11 index:2 10 .... 11 000 q:1 . 0 .... \
+ vm=%vm_dp vd=%vd_dp size=1
+ VDUP_scalar 1111 001 1 1 . 11 index:1 100 .... 11 000 q:1 . 0 .... \
+ vm=%vm_dp vd=%vd_dp size=2
+
+ ##################################################################
+ # 2-reg-misc grouping:
+ # 1111 001 11 D 11 size:2 opc1:2 Vd:4 0 opc2:4 q:1 M 0 Vm:4
+ ##################################################################
+
+ &2misc vd vm q size
+
+ @2misc .... ... .. . .. size:2 .. .... . .... q:1 . . .... \
+ &2misc vm=%vm_dp vd=%vd_dp
+ @2misc_q0 .... ... .. . .. size:2 .. .... . .... . . . .... \
+ &2misc vm=%vm_dp vd=%vd_dp q=0
+ @2misc_q1 .... ... .. . .. size:2 .. .... . .... . . . .... \
+ &2misc vm=%vm_dp vd=%vd_dp q=1
+
+ VREV64 1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc
+ VREV32 1111 001 11 . 11 .. 00 .... 0 0001 . . 0 .... @2misc
+ VREV16 1111 001 11 . 11 .. 00 .... 0 0010 . . 0 .... @2misc
+
+ VPADDL_S 1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc
+ VPADDL_U 1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc
+
+ AESE 1111 001 11 . 11 .. 00 .... 0 0110 0 . 0 .... @2misc_q1
+ AESD 1111 001 11 . 11 .. 00 .... 0 0110 1 . 0 .... @2misc_q1
+ AESMC 1111 001 11 . 11 .. 00 .... 0 0111 0 . 0 .... @2misc_q1
+ AESIMC 1111 001 11 . 11 .. 00 .... 0 0111 1 . 0 .... @2misc_q1
+
+ VCLS 1111 001 11 . 11 .. 00 .... 0 1000 . . 0 .... @2misc
+ VCLZ 1111 001 11 . 11 .. 00 .... 0 1001 . . 0 .... @2misc
+ VCNT 1111 001 11 . 11 .. 00 .... 0 1010 . . 0 .... @2misc
+
+ VMVN 1111 001 11 . 11 .. 00 .... 0 1011 . . 0 .... @2misc
+
+ VPADAL_S 1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc
+ VPADAL_U 1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc
+
+ VQABS 1111 001 11 . 11 .. 00 .... 0 1110 . . 0 .... @2misc
+ VQNEG 1111 001 11 . 11 .. 00 .... 0 1111 . . 0 .... @2misc
+
+ VCGT0 1111 001 11 . 11 .. 01 .... 0 0000 . . 0 .... @2misc
+ VCGE0 1111 001 11 . 11 .. 01 .... 0 0001 . . 0 .... @2misc
+ VCEQ0 1111 001 11 . 11 .. 01 .... 0 0010 . . 0 .... @2misc
+ VCLE0 1111 001 11 . 11 .. 01 .... 0 0011 . . 0 .... @2misc
+ VCLT0 1111 001 11 . 11 .. 01 .... 0 0100 . . 0 .... @2misc
+
+ SHA1H 1111 001 11 . 11 .. 01 .... 0 0101 1 . 0 .... @2misc_q1
+
+ VABS 1111 001 11 . 11 .. 01 .... 0 0110 . . 0 .... @2misc
+ VNEG 1111 001 11 . 11 .. 01 .... 0 0111 . . 0 .... @2misc
+
+ VCGT0_F 1111 001 11 . 11 .. 01 .... 0 1000 . . 0 .... @2misc
+ VCGE0_F 1111 001 11 . 11 .. 01 .... 0 1001 . . 0 .... @2misc
+ VCEQ0_F 1111 001 11 . 11 .. 01 .... 0 1010 . . 0 .... @2misc
+ VCLE0_F 1111 001 11 . 11 .. 01 .... 0 1011 . . 0 .... @2misc
+ VCLT0_F 1111 001 11 . 11 .. 01 .... 0 1100 . . 0 .... @2misc
+
+ VABS_F 1111 001 11 . 11 .. 01 .... 0 1110 . . 0 .... @2misc
+ VNEG_F 1111 001 11 . 11 .. 01 .... 0 1111 . . 0 .... @2misc
+
+ VSWP 1111 001 11 . 11 .. 10 .... 0 0000 . . 0 .... @2misc
+ VTRN 1111 001 11 . 11 .. 10 .... 0 0001 . . 0 .... @2misc
+ VUZP 1111 001 11 . 11 .. 10 .... 0 0010 . . 0 .... @2misc
+ VZIP 1111 001 11 . 11 .. 10 .... 0 0011 . . 0 .... @2misc
+
+ VMOVN 1111 001 11 . 11 .. 10 .... 0 0100 0 . 0 .... @2misc_q0
+ # VQMOVUN: unsigned result (source is always signed)
+ VQMOVUN 1111 001 11 . 11 .. 10 .... 0 0100 1 . 0 .... @2misc_q0
+ # VQMOVN: signed result, source may be signed (_S) or unsigned (_U)
+ VQMOVN_S 1111 001 11 . 11 .. 10 .... 0 0101 0 . 0 .... @2misc_q0
+ VQMOVN_U 1111 001 11 . 11 .. 10 .... 0 0101 1 . 0 .... @2misc_q0
+
+ VSHLL 1111 001 11 . 11 .. 10 .... 0 0110 0 . 0 .... @2misc_q0
+
+ SHA1SU1 1111 001 11 . 11 .. 10 .... 0 0111 0 . 0 .... @2misc_q1
+ SHA256SU0 1111 001 11 . 11 .. 10 .... 0 0111 1 . 0 .... @2misc_q1
+
+ VRINTN 1111 001 11 . 11 .. 10 .... 0 1000 . . 0 .... @2misc
+ VRINTX 1111 001 11 . 11 .. 10 .... 0 1001 . . 0 .... @2misc
+ VRINTA 1111 001 11 . 11 .. 10 .... 0 1010 . . 0 .... @2misc
+ VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc
+
+ VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0
+ VCVT_B16_F32 1111 001 11 . 11 .. 10 .... 0 1100 1 . 0 .... @2misc_q0
+
+ VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc
+
+ VCVT_F32_F16 1111 001 11 . 11 .. 10 .... 0 1110 0 . 0 .... @2misc_q0
+
+ VRINTP 1111 001 11 . 11 .. 10 .... 0 1111 . . 0 .... @2misc
+
+ VCVTAS 1111 001 11 . 11 .. 11 .... 0 0000 . . 0 .... @2misc
+ VCVTAU 1111 001 11 . 11 .. 11 .... 0 0001 . . 0 .... @2misc
+ VCVTNS 1111 001 11 . 11 .. 11 .... 0 0010 . . 0 .... @2misc
+ VCVTNU 1111 001 11 . 11 .. 11 .... 0 0011 . . 0 .... @2misc
+ VCVTPS 1111 001 11 . 11 .. 11 .... 0 0100 . . 0 .... @2misc
+ VCVTPU 1111 001 11 . 11 .. 11 .... 0 0101 . . 0 .... @2misc
+ VCVTMS 1111 001 11 . 11 .. 11 .... 0 0110 . . 0 .... @2misc
+ VCVTMU 1111 001 11 . 11 .. 11 .... 0 0111 . . 0 .... @2misc
+
+ VRECPE 1111 001 11 . 11 .. 11 .... 0 1000 . . 0 .... @2misc
+ VRSQRTE 1111 001 11 . 11 .. 11 .... 0 1001 . . 0 .... @2misc
+ VRECPE_F 1111 001 11 . 11 .. 11 .... 0 1010 . . 0 .... @2misc
+ VRSQRTE_F 1111 001 11 . 11 .. 11 .... 0 1011 . . 0 .... @2misc
+ VCVT_FS 1111 001 11 . 11 .. 11 .... 0 1100 . . 0 .... @2misc
+ VCVT_FU 1111 001 11 . 11 .. 11 .... 0 1101 . . 0 .... @2misc
+ VCVT_SF 1111 001 11 . 11 .. 11 .... 0 1110 . . 0 .... @2misc
+ VCVT_UF 1111 001 11 . 11 .. 11 .... 0 1111 . . 0 .... @2misc
+ ]
+
+ # Subgroup for size != 0b11
+ [
+ ##################################################################
+ # 3-reg-different-length grouping:
+ # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4
+ ##################################################################
+
+ &3diff vm vn vd size
+
+ @3diff .... ... . . . size:2 .... .... .... . . . . .... \
+ &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+ VADDL_S_3d 1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff
+ VADDL_U_3d 1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff
+
+ VADDW_S_3d 1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff
+ VADDW_U_3d 1111 001 1 1 . .. .... .... 0001 . 0 . 0 .... @3diff
+
+ VSUBL_S_3d 1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff
+ VSUBL_U_3d 1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff
+
+ VSUBW_S_3d 1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff
+ VSUBW_U_3d 1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff
+
+ VADDHN_3d 1111 001 0 1 . .. .... .... 0100 . 0 . 0 .... @3diff
+ VRADDHN_3d 1111 001 1 1 . .. .... .... 0100 . 0 . 0 .... @3diff
+
+ VABAL_S_3d 1111 001 0 1 . .. .... .... 0101 . 0 . 0 .... @3diff
+ VABAL_U_3d 1111 001 1 1 . .. .... .... 0101 . 0 . 0 .... @3diff
+
+ VSUBHN_3d 1111 001 0 1 . .. .... .... 0110 . 0 . 0 .... @3diff
+ VRSUBHN_3d 1111 001 1 1 . .. .... .... 0110 . 0 . 0 .... @3diff
+
+ VABDL_S_3d 1111 001 0 1 . .. .... .... 0111 . 0 . 0 .... @3diff
+ VABDL_U_3d 1111 001 1 1 . .. .... .... 0111 . 0 . 0 .... @3diff
+
+ VMLAL_S_3d 1111 001 0 1 . .. .... .... 1000 . 0 . 0 .... @3diff
+ VMLAL_U_3d 1111 001 1 1 . .. .... .... 1000 . 0 . 0 .... @3diff
+
+ VQDMLAL_3d 1111 001 0 1 . .. .... .... 1001 . 0 . 0 .... @3diff
+
+ VMLSL_S_3d 1111 001 0 1 . .. .... .... 1010 . 0 . 0 .... @3diff
+ VMLSL_U_3d 1111 001 1 1 . .. .... .... 1010 . 0 . 0 .... @3diff
+
+ VQDMLSL_3d 1111 001 0 1 . .. .... .... 1011 . 0 . 0 .... @3diff
+
+ VMULL_S_3d 1111 001 0 1 . .. .... .... 1100 . 0 . 0 .... @3diff
+ VMULL_U_3d 1111 001 1 1 . .. .... .... 1100 . 0 . 0 .... @3diff
+
+ VQDMULL_3d 1111 001 0 1 . .. .... .... 1101 . 0 . 0 .... @3diff
+
+ VMULL_P_3d 1111 001 0 1 . .. .... .... 1110 . 0 . 0 .... @3diff
+
+ ##################################################################
+ # 2-regs-plus-scalar grouping:
+ # 1111 001 Q 1 D sz!=11 Vn:4 Vd:4 opc:4 N 1 M 0 Vm:4
+ ##################################################################
+ &2scalar vm vn vd size q
+
+ @2scalar .... ... q:1 . . size:2 .... .... .... . . . . .... \
+ &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp
+ # For the 'long' ops the Q bit is part of insn decode
+ @2scalar_q0 .... ... . . . size:2 .... .... .... . . . . .... \
+ &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0
+
+ VMLA_2sc 1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar
+ VMLA_F_2sc 1111 001 . 1 . .. .... .... 0001 . 1 . 0 .... @2scalar
+
+ VMLAL_S_2sc 1111 001 0 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0
+ VMLAL_U_2sc 1111 001 1 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0
+
+ VQDMLAL_2sc 1111 001 0 1 . .. .... .... 0011 . 1 . 0 .... @2scalar_q0
+
+ VMLS_2sc 1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar
+ VMLS_F_2sc 1111 001 . 1 . .. .... .... 0101 . 1 . 0 .... @2scalar
+
+ VMLSL_S_2sc 1111 001 0 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0
+ VMLSL_U_2sc 1111 001 1 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0
+
+ VQDMLSL_2sc 1111 001 0 1 . .. .... .... 0111 . 1 . 0 .... @2scalar_q0
+
+ VMUL_2sc 1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar
+ VMUL_F_2sc 1111 001 . 1 . .. .... .... 1001 . 1 . 0 .... @2scalar
+
+ VMULL_S_2sc 1111 001 0 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0
+ VMULL_U_2sc 1111 001 1 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0
+
+ VQDMULL_2sc 1111 001 0 1 . .. .... .... 1011 . 1 . 0 .... @2scalar_q0
+
+ VQDMULH_2sc 1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar
+ VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar
+
+ VQRDMLAH_2sc 1111 001 . 1 . .. .... .... 1110 . 1 . 0 .... @2scalar
+ VQRDMLSH_2sc 1111 001 . 1 . .. .... .... 1111 . 1 . 0 .... @2scalar
+ ]
+}
diff --git a/target/arm/tcg/neon-ls.decode b/target/arm/tcg/neon-ls.decode
new file mode 100644
index 0000000..c5f364c
--- /dev/null
+++ b/target/arm/tcg/neon-ls.decode
@@ -0,0 +1,52 @@
+# AArch32 Neon load/store instruction descriptions
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+# Encodings for Neon load/store instructions where the T32 encoding
+# is a simple transformation of the A32 encoding.
+# More specifically, this file covers instructions where the A32 encoding is
+# 0b1111_0100_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx
+# and the T32 encoding is
+# 0b1111_1001_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx
+# This file works on the A32 encoding only; calling code for T32 has to
+# transform the insn into the A32 version first.
+
+%vd_dp 22:1 12:4
+
+# Neon load/store multiple structures
+
+VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \
+ vd=%vd_dp
+
+# Neon load single element to all lanes
+
+VLD_all_lanes 1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \
+ vd=%vd_dp
+
+# Neon load/store single structure to one lane
+%imm1_5_p1 5:1 !function=plus_1
+%imm1_6_p1 6:1 !function=plus_1
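+# (plus_1 turns the single encoded bit into a register stride of 1 or 2)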
+
+VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 00 n:2 reg_idx:3 align:1 rm:4 \
+ vd=%vd_dp size=0 stride=1
+VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 01 n:2 reg_idx:2 . align:1 rm:4 \
+ vd=%vd_dp size=1 stride=%imm1_5_p1
+VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 10 n:2 reg_idx:1 . align:2 rm:4 \
+ vd=%vd_dp size=2 stride=%imm1_6_p1
diff --git a/target/arm/tcg/neon-shared.decode b/target/arm/tcg/neon-shared.decode
new file mode 100644
index 0000000..8e6bd0b
--- /dev/null
+++ b/target/arm/tcg/neon-shared.decode
@@ -0,0 +1,99 @@
+# AArch32 Neon instruction descriptions
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+# Encodings for Neon instructions whose encoding is the same for
+# both A32 and T32.
+
+# More specifically, this covers:
+# 2reg scalar ext: 0b1111_1110_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx
+# 3same ext: 0b1111_110x_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx
+
+# VFP/Neon register fields; same as vfp.decode
+%vm_dp 5:1 0:4
+%vm_sp 0:4 5:1
+%vn_dp 7:1 16:4
+%vn_sp 16:4 7:1
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
+
+# For VCMLA/VCADD insns, convert the single-bit size field
+# which is 0 for fp16 and 1 for fp32 into a MO_* constant.
+# (Note that this is the reverse of the sense of the 1-bit size
+# field in the 3same_fp Neon insns.)
+%vcadd_size 20:1 !function=plus_1
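+# (i.e. plus_1 maps size bit 0 to MO_16 and size bit 1 to MO_32)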
+
+VCMLA 1111 110 rot:2 . 1 . .... .... 1000 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size
+
+VCADD 1111 110 rot:1 1 . 0 . .... .... 1000 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size
+
+VSDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 1 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUSDOT 1111 110 01 . 10 .... .... 1101 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VDOT_b16 1111 110 00 . 00 .... .... 1101 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+# VFM[AS]L
+VFML 1111 110 0 s:1 . 10 .... .... 1000 . 0 . 1 .... \
+ vm=%vm_sp vn=%vn_sp vd=%vd_dp q=0
+VFML 1111 110 0 s:1 . 10 .... .... 1000 . 1 . 1 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp q=1
+
+VSMMLA 1111 1100 0.10 .... .... 1100 .1.0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUMMLA 1111 1100 0.10 .... .... 1100 .1.1 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+VFMA_b16 1111 110 0 0.11 .... .... 1000 . q:1 . 1 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . q:1 index:1 0 vm:4 \
+ vn=%vn_dp vd=%vd_dp size=1
+VCMLA_scalar 1111 1110 1 . rot:2 .... .... 1000 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp size=2 index=0
+
+VSDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 0 vm:4 \
+ vn=%vn_dp vd=%vd_dp
+VUDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 1 vm:4 \
+ vn=%vn_dp vd=%vd_dp
+VUSDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \
+ vn=%vn_dp vd=%vd_dp
+VSUDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 1 vm:4 \
+ vn=%vn_dp vd=%vd_dp
+VDOT_b16_scal 1111 1110 0 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \
+ vn=%vn_dp vd=%vd_dp
+
+%vfml_scalar_q0_rm 0:3 5:1
+%vfml_scalar_q1_index 5:1 3:1
+VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 0 . 1 index:1 ... \
+ rm=%vfml_scalar_q0_rm vn=%vn_sp vd=%vd_dp q=0
+VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 1 . 1 . rm:3 \
+ index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp q=1
+VFMA_b16_scal 1111 1110 0.11 .... .... 1000 . q:1 . 1 . vm:3 \
+ index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp
diff --git a/target/arm/tcg/sme-fa64.decode b/target/arm/tcg/sme-fa64.decode
new file mode 100644
index 0000000..47708cc
--- /dev/null
+++ b/target/arm/tcg/sme-fa64.decode
@@ -0,0 +1,60 @@
+# AArch64 SME allowed instruction decoding
+#
+# Copyright (c) 2022 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+# These patterns are taken from Appendix E1.1 of DDI0616 A.a,
+# Arm Architecture Reference Manual Supplement,
+# The Scalable Matrix Extension (SME), for Armv9-A
+
+{
+ [
+ OK 0-00 1110 0000 0001 0010 11-- ---- ---- # SMOV W|Xd,Vn.B[0]
+ OK 0-00 1110 0000 0010 0010 11-- ---- ---- # SMOV W|Xd,Vn.H[0]
+ OK 0100 1110 0000 0100 0010 11-- ---- ---- # SMOV Xd,Vn.S[0]
+ OK 0000 1110 0000 0001 0011 11-- ---- ---- # UMOV Wd,Vn.B[0]
+ OK 0000 1110 0000 0010 0011 11-- ---- ---- # UMOV Wd,Vn.H[0]
+ OK 0000 1110 0000 0100 0011 11-- ---- ---- # UMOV Wd,Vn.S[0]
+ OK 0100 1110 0000 1000 0011 11-- ---- ---- # UMOV Xd,Vn.D[0]
+ ]
+ FAIL 0--0 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD vector operations
+}
+
+{
+ [
+ OK 0101 1110 --1- ---- 11-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar)
+ OK 0101 1110 -10- ---- 00-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar, FP16)
+ OK 01-1 1110 1-10 0001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar)
+ OK 01-1 1110 1111 1001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar, FP16)
+ ]
+ FAIL 01-1 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD single-element operations
+}
+
+FAIL 0-00 110- ---- ---- ---- ---- ---- ---- # Advanced SIMD structure load/store
+FAIL 1100 1110 ---- ---- ---- ---- ---- ---- # Advanced SIMD cryptography extensions
+FAIL 0001 1110 0111 1110 0000 00-- ---- ---- # FJCVTZS
+
+# These are the "avoidance of doubt" final table of Illegal Advanced SIMD instructions
+# We don't actually need to include these, as the default is OK.
+# -001 111- ---- ---- ---- ---- ---- ---- # Scalar floating-point operations
+# --10 110- ---- ---- ---- ---- ---- ---- # Load/store pair of FP registers
+# --01 1100 ---- ---- ---- ---- ---- ---- # Load FP register (PC-relative literal)
+# --11 1100 --0- ---- ---- ---- ---- ---- # Load/store FP register (unscaled imm)
+# --11 1100 --1- ---- ---- ---- ---- --10 # Load/store FP register (register offset)
+# --11 1101 ---- ---- ---- ---- ---- ---- # Load/store FP register (scaled imm)
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
new file mode 100644
index 0000000..628804e
--- /dev/null
+++ b/target/arm/tcg/sme.decode
@@ -0,0 +1,88 @@
+# AArch64 SME instruction descriptions
+#
+# Copyright (c) 2022 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+### SME Misc
+
+ZERO 11000000 00 001 00000000000 imm:8
+
+### SME Move into/from Array
+
+%mova_rs 13:2 !function=plus_12
+&mova esz rs pg zr za_imm v:bool to_vec:bool
+
+MOVA 11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \
+ &mova to_vec=0 rs=%mova_rs
+MOVA 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \
+ &mova to_vec=0 rs=%mova_rs esz=4
+
+MOVA 11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \
+ &mova to_vec=1 rs=%mova_rs
+MOVA 11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \
+ &mova to_vec=1 rs=%mova_rs esz=4
+
+### SME Memory
+
+&ldst esz rs pg rn rm za_imm v:bool st:bool
+
+LDST1 1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
+ &ldst rs=%mova_rs
+LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
+ &ldst esz=4 rs=%mova_rs
+
+&ldstr rv rn imm
+@ldstr ....... ... . ...... .. ... rn:5 . imm:4 \
+ &ldstr rv=%mova_rs
+
+LDR 1110000 100 0 000000 .. 000 ..... 0 .... @ldstr
+STR 1110000 100 1 000000 .. 000 ..... 0 .... @ldstr
+
+### SME Add Vector to Array
+
+&adda zad zn pm pn
+@adda_32 ........ .. ..... . pm:3 pn:3 zn:5 ... zad:2 &adda
+@adda_64 ........ .. ..... . pm:3 pn:3 zn:5 .. zad:3 &adda
+
+ADDHA_s 11000000 10 01000 0 ... ... ..... 000 .. @adda_32
+ADDVA_s 11000000 10 01000 1 ... ... ..... 000 .. @adda_32
+ADDHA_d 11000000 11 01000 0 ... ... ..... 00 ... @adda_64
+ADDVA_d 11000000 11 01000 1 ... ... ..... 00 ... @adda_64
+
+### SME Outer Product
+
+&op zad zn zm pm pn sub:bool
+@op_32 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2 &op
+@op_64 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3 &op
+
+FMOPA_s 10000000 100 ..... ... ... ..... . 00 .. @op_32
+FMOPA_d 10000000 110 ..... ... ... ..... . 0 ... @op_64
+
+BFMOPA 10000001 100 ..... ... ... ..... . 00 .. @op_32
+FMOPA_h 10000001 101 ..... ... ... ..... . 00 .. @op_32
+
+SMOPA_s 1010000 0 10 0 ..... ... ... ..... . 00 .. @op_32
+SUMOPA_s 1010000 0 10 1 ..... ... ... ..... . 00 .. @op_32
+USMOPA_s 1010000 1 10 0 ..... ... ... ..... . 00 .. @op_32
+UMOPA_s 1010000 1 10 1 ..... ... ... ..... . 00 .. @op_32
+
+SMOPA_d 1010000 0 11 0 ..... ... ... ..... . 0 ... @op_64
+SUMOPA_d 1010000 0 11 1 ..... ... ... ..... . 0 ... @op_64
+USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64
+UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64
diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode
new file mode 100644
index 0000000..14b3a69
--- /dev/null
+++ b/target/arm/tcg/sve.decode
@@ -0,0 +1,1702 @@
+# AArch64 SVE instruction descriptions
+#
+# Copyright (c) 2017 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+###########################################################################
+# Named fields. These are primarily for disjoint fields.
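+# Each entry concatenates the listed pos:len bit fields, first field most
+# significant; e.g. %imm6_22_5 combines the bit at 22 with the 5 bits at 5
+# to form a 6-bit immediate.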
+
+%imm4_16_p1 16:4 !function=plus_1
+%imm6_22_5 22:1 5:5
+%imm7_22_16 22:2 16:5
+%imm8_16_10 16:5 10:3
+%imm9_16_10 16:s6 10:3
+%size_23 23:2
+%dtype_23_13 23:2 13:2
+%index3_22_19 22:1 19:2
+%index3_19_11 19:2 11:1
+%index2_20_11 20:1 11:1
+
+# A combination of tsz:imm3 -- extract esize.
+%tszimm_esz 22:2 5:5 !function=tszimm_esz
+# A combination of tsz:imm3 -- extract (2 * esize) - (tsz:imm3)
+%tszimm_shr 22:2 5:5 !function=tszimm_shr
+# A combination of tsz:imm3 -- extract (tsz:imm3) - esize
+%tszimm_shl 22:2 5:5 !function=tszimm_shl
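+# (e.g. for byte elements tsz is 0b0001, so tsz:imm3 runs from 8 to 15 and
+#  the shr/shl forms yield shifts of 1-8 and 0-7 respectively)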
+
+# Similarly for the tszh/tszl pair at 22/16 for zzi
+%tszimm16_esz 22:2 16:5 !function=tszimm_esz
+%tszimm16_shr 22:2 16:5 !function=tszimm_shr
+%tszimm16_shl 22:2 16:5 !function=tszimm_shl
+
+# Signed 8-bit immediate, optionally shifted left by 8.
+%sh8_i8s 5:9 !function=expand_imm_sh8s
+# Unsigned 8-bit immediate, optionally shifted left by 8.
+%sh8_i8u 5:9 !function=expand_imm_sh8u
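+# (the top bit of the 9-bit field is the shift flag; the low 8 bits are the
+#  immediate, shifted left by 8 when the flag is set)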
+
+# Unsigned load of msz into esz=2, represented as a dtype.
+%msz_dtype 23:2 !function=msz_dtype
+
+# Either a copy of rd (at bit 0), or a different source
+# as propagated via the MOVPRFX instruction.
+%reg_movprfx 0:5
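+# (MOVPRFX copies another register into Zd first, so a destructive operation
+#  effectively gains a separate source operand)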
+
+###########################################################################
+# Named attribute sets. These are used to make nice(er) names
+# when creating helpers common to those for the individual
+# instruction patterns.
+
+&rr_esz rd rn esz
+&rri rd rn imm
+&rr_dbm rd rn dbm
+&rrri rd rn rm imm
+&rri_esz rd rn imm esz
+&rrri_esz rd rn rm imm esz
+&rrr_esz rd rn rm esz
+&rrx_esz rd rn rm index esz
+&rpr_esz rd pg rn esz
+&rpr_s rd pg rn s
+&rprr_s rd pg rn rm s
+&rprr_esz rd pg rn rm esz
+&rrrr_esz rd ra rn rm esz
+&rrxr_esz rd rn rm ra index esz
+&rprrr_esz rd pg rn rm ra esz
+&rpri_esz rd pg rn imm esz
+&ptrue rd esz pat s
+&incdec_cnt rd pat esz imm d u
+&incdec2_cnt rd rn pat esz imm d u
+&incdec_pred rd pg esz d u
+&incdec2_pred rd rn pg esz d u
+&rprr_load rd pg rn rm dtype nreg
+&rpri_load rd pg rn imm dtype nreg
+&rprr_store rd pg rn rm msz esz nreg
+&rpri_store rd pg rn imm msz esz nreg
+&rprr_gather_load rd pg rn rm esz msz u ff xs scale
+&rpri_gather_load rd pg rn imm esz msz u ff
+&rprr_scatter_store rd pg rn rm esz msz xs scale
+&rpri_scatter_store rd pg rn imm esz msz
+
+###########################################################################
+# Named instruction formats. These are generally used to
+# reduce the amount of duplication between instruction patterns.
+
+# Two operand with unused vector element size
+@pd_pn_e0 ........ ........ ....... rn:4 . rd:4 &rr_esz esz=0
+
+# Two operand
+@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz
+@rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz
+
+# Two operand with governing predicate, flags setting
+@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s
+@pd_pg_pn_s0 ........ . . ...... .. pg:4 . rn:4 . rd:4 &rpr_s s=0
+
+# Three operand with unused vector element size
+@rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0
+
+# Three predicate operand, with governing predicate, flag setting
+@pd_pg_pn_pm_s ........ . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4 &rprr_s
+
+# Three operand, vector element size
+@rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz
+@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz
+@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \
+ &rrr_esz rn=%reg_movprfx
+@rdn_rm_e0 ........ .. ...... ...... rm:5 rd:5 \
+ &rrr_esz rn=%reg_movprfx esz=0
+@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \
+ &rri_esz rn=%reg_movprfx imm=%sh8_i8u
+@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \
+ &rri_esz rn=%reg_movprfx
+@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \
+ &rri_esz rn=%reg_movprfx
+
+# Four operand, vector element size
+@rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \
+ &rrrr_esz ra=%reg_movprfx
+
+# Four operand with unused vector element size
+@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \
+ &rrrr_esz esz=0 ra=%reg_movprfx
+@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \
+ &rrrr_esz esz=0 rn=%reg_movprfx
+
+# Three operand with "memory" size, aka immediate left shift
+@rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri
+
+# Two register operand, with governing predicate, vector element size
+@rdn_pg_rm ........ esz:2 ... ... ... pg:3 rm:5 rd:5 \
+ &rprr_esz rn=%reg_movprfx
+@rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \
+ &rprr_esz rm=%reg_movprfx
+@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz
+@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz
+
+# Three register operand, with governing predicate, vector element size
+@rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \
+ &rprrr_esz ra=%reg_movprfx
+@rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \
+ &rprrr_esz rn=%reg_movprfx
+@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \
+ &rprrr_esz rn=%reg_movprfx
+@rd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 &rprr_esz
+
+# One register operand, with governing predicate, vector element size
+@rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz
+@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz
+@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz
+
+# One register operand, with governing predicate, no vector element size
+@rd_pg_rn_e0 ........ .. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0
+
+# Two register operands with a 6-bit signed immediate.
+@rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri
+
+# Two register operand, one immediate operand, with predicate,
+# element size encoded as TSZHL.
+@rdn_pg_tszimm_shl ........ .. ... ... ... pg:3 ..... rd:5 \
+ &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shl
+@rdn_pg_tszimm_shr ........ .. ... ... ... pg:3 ..... rd:5 \
+ &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shr
+
+# Similarly without predicate.
+@rd_rn_tszimm_shl ........ .. ... ... ...... rn:5 rd:5 \
+ &rri_esz esz=%tszimm16_esz imm=%tszimm16_shl
+@rd_rn_tszimm_shr ........ .. ... ... ...... rn:5 rd:5 \
+ &rri_esz esz=%tszimm16_esz imm=%tszimm16_shr
+
+# Two register operand, one immediate operand, with 4-bit predicate.
+# User must fill in imm.
+@rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \
+ &rpri_esz rn=%reg_movprfx
+
+# Two register operand, one one-bit floating-point operand.
+@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \
+ &rpri_esz rn=%reg_movprfx
+
+# Two register operand, one encoded bitmask.
+@rdn_dbm ........ .. .... dbm:13 rd:5 \
+ &rr_dbm rn=%reg_movprfx
+
+# Predicate output, vector and immediate input,
+# controlling predicate, element size.
+@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz
+@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz
+
+# Basic Load/Store with 9-bit immediate offset
+@pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \
+ &rri imm=%imm9_16_10
+@rd_rn_i9 ........ ........ ...... rn:5 rd:5 \
+ &rri imm=%imm9_16_10
+
+# One register, pattern, and uint4+1.
+# User must fill in U and D.
+@incdec_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \
+ &incdec_cnt imm=%imm4_16_p1
+@incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \
+ &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx
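+# Sketch, assuming plus_1 simply returns x + 1: the 4-bit field at 16
+# therefore encodes a multiplier of 1..16, so e.g. "INCD X0, ALL, MUL #16"
+# decodes with pat=31 (ALL) and imm=16.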
+
+# One register, predicate.
+# User must fill in U and D.
+@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred
+@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \
+ &incdec2_pred rn=%reg_movprfx
+
+# Loads; user must fill in NREG.
+@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load
+@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load
+
+@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_load dtype=%msz_dtype
+@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \
+ &rpri_load dtype=%msz_dtype
+
+# Gather Loads.
+@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rpri_gather_load
+
+# Stores; user must fill in ESZ, MSZ, NREG as needed.
+@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store
+@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store
+@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_store nreg=0
+@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_scatter_store
+@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \
+ &rpri_scatter_store
+
+# Two registers and a scalar by N-bit index
+@rrx_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \
+ &rrx_esz index=%index3_22_19
+@rrx_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 &rrx_esz
+@rrx_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 &rrx_esz
+
+# Two registers and a scalar by N-bit index, alternate
+@rrx_3a ........ .. . .. rm:3 ...... rn:5 rd:5 \
+ &rrx_esz index=%index3_19_11
+@rrx_2a ........ .. . . rm:4 ...... rn:5 rd:5 \
+ &rrx_esz index=%index2_20_11
+
+# Three registers and a scalar by N-bit index
+@rrxr_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \
+ &rrxr_esz ra=%reg_movprfx index=%index3_22_19
+@rrxr_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 \
+ &rrxr_esz ra=%reg_movprfx
+@rrxr_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 \
+ &rrxr_esz ra=%reg_movprfx
+
+# Three registers and a scalar by N-bit index, alternate
+@rrxr_3a ........ .. ... rm:3 ...... rn:5 rd:5 \
+ &rrxr_esz ra=%reg_movprfx index=%index3_19_11
+@rrxr_2a ........ .. .. rm:4 ...... rn:5 rd:5 \
+ &rrxr_esz ra=%reg_movprfx index=%index2_20_11
+
+###########################################################################
+# Instruction patterns. Grouped according to the SVE encoding index (encodingindex.xhtml).
+
+### SVE Integer Arithmetic - Binary Predicated Group
+
+# SVE bitwise logical vector operations (predicated)
+ORR_zpzz 00000100 .. 011 000 000 ... ..... ..... @rdn_pg_rm
+EOR_zpzz 00000100 .. 011 001 000 ... ..... ..... @rdn_pg_rm
+AND_zpzz 00000100 .. 011 010 000 ... ..... ..... @rdn_pg_rm
+BIC_zpzz 00000100 .. 011 011 000 ... ..... ..... @rdn_pg_rm
+
+# SVE integer add/subtract vectors (predicated)
+ADD_zpzz 00000100 .. 000 000 000 ... ..... ..... @rdn_pg_rm
+SUB_zpzz 00000100 .. 000 001 000 ... ..... ..... @rdn_pg_rm
+SUB_zpzz 00000100 .. 000 011 000 ... ..... ..... @rdm_pg_rn # SUBR
+
+# SVE integer min/max/difference (predicated)
+SMAX_zpzz 00000100 .. 001 000 000 ... ..... ..... @rdn_pg_rm
+UMAX_zpzz 00000100 .. 001 001 000 ... ..... ..... @rdn_pg_rm
+SMIN_zpzz 00000100 .. 001 010 000 ... ..... ..... @rdn_pg_rm
+UMIN_zpzz 00000100 .. 001 011 000 ... ..... ..... @rdn_pg_rm
+SABD_zpzz 00000100 .. 001 100 000 ... ..... ..... @rdn_pg_rm
+UABD_zpzz 00000100 .. 001 101 000 ... ..... ..... @rdn_pg_rm
+
+# SVE integer multiply/divide (predicated)
+MUL_zpzz 00000100 .. 010 000 000 ... ..... ..... @rdn_pg_rm
+SMULH_zpzz 00000100 .. 010 010 000 ... ..... ..... @rdn_pg_rm
+UMULH_zpzz 00000100 .. 010 011 000 ... ..... ..... @rdn_pg_rm
+# Note that divide requires size >= 2; below 2 is unallocated.
+SDIV_zpzz 00000100 .. 010 100 000 ... ..... ..... @rdn_pg_rm
+UDIV_zpzz 00000100 .. 010 101 000 ... ..... ..... @rdn_pg_rm
+SDIV_zpzz 00000100 .. 010 110 000 ... ..... ..... @rdm_pg_rn # SDIVR
+UDIV_zpzz 00000100 .. 010 111 000 ... ..... ..... @rdm_pg_rn # UDIVR
+
+### SVE Integer Reduction Group
+
+# SVE bitwise logical reduction (predicated)
+ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn
+EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn
+ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn
+
+# SVE constructive prefix (predicated)
+MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn
+MOVPRFX_m 00000100 .. 010 001 001 ... ..... ..... @rd_pg_rn
+
+# SVE integer add reduction (predicated)
+# Note that saddv requires size != 3.
+UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn
+SADDV 00000100 .. 000 000 001 ... ..... ..... @rd_pg_rn
+
+# SVE integer min/max reduction (predicated)
+SMAXV 00000100 .. 001 000 001 ... ..... ..... @rd_pg_rn
+UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn
+SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn
+UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn
+
+### SVE Shift by Immediate - Predicated Group
+
+# SVE bitwise shift by immediate (predicated)
+ASR_zpzi 00000100 .. 000 000 100 ... .. ... ..... @rdn_pg_tszimm_shr
+LSR_zpzi 00000100 .. 000 001 100 ... .. ... ..... @rdn_pg_tszimm_shr
+LSL_zpzi 00000100 .. 000 011 100 ... .. ... ..... @rdn_pg_tszimm_shl
+ASRD 00000100 .. 000 100 100 ... .. ... ..... @rdn_pg_tszimm_shr
+SQSHL_zpzi 00000100 .. 000 110 100 ... .. ... ..... @rdn_pg_tszimm_shl
+UQSHL_zpzi 00000100 .. 000 111 100 ... .. ... ..... @rdn_pg_tszimm_shl
+SRSHR 00000100 .. 001 100 100 ... .. ... ..... @rdn_pg_tszimm_shr
+URSHR 00000100 .. 001 101 100 ... .. ... ..... @rdn_pg_tszimm_shr
+SQSHLU 00000100 .. 001 111 100 ... .. ... ..... @rdn_pg_tszimm_shl
+
+# SVE bitwise shift by vector (predicated)
+ASR_zpzz 00000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm
+LSR_zpzz 00000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm
+LSL_zpzz 00000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm
+ASR_zpzz 00000100 .. 010 100 100 ... ..... ..... @rdm_pg_rn # ASRR
+LSR_zpzz 00000100 .. 010 101 100 ... ..... ..... @rdm_pg_rn # LSRR
+LSL_zpzz 00000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # LSLR
+
+# SVE bitwise shift by wide elements (predicated)
+# Note these require size != 3.
+ASR_zpzw 00000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm
+LSR_zpzw 00000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm
+LSL_zpzw 00000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm
+
+### SVE Integer Arithmetic - Unary Predicated Group
+
+# SVE unary bit operations (predicated)
+# Note esz != 0 for FABS and FNEG.
+CLS 00000100 .. 011 000 101 ... ..... ..... @rd_pg_rn
+CLZ 00000100 .. 011 001 101 ... ..... ..... @rd_pg_rn
+CNT_zpz 00000100 .. 011 010 101 ... ..... ..... @rd_pg_rn
+CNOT 00000100 .. 011 011 101 ... ..... ..... @rd_pg_rn
+NOT_zpz 00000100 .. 011 110 101 ... ..... ..... @rd_pg_rn
+FABS 00000100 .. 011 100 101 ... ..... ..... @rd_pg_rn
+FNEG 00000100 .. 011 101 101 ... ..... ..... @rd_pg_rn
+
+# SVE integer unary operations (predicated)
+# Note esz > original size for extensions.
+ABS 00000100 .. 010 110 101 ... ..... ..... @rd_pg_rn
+NEG 00000100 .. 010 111 101 ... ..... ..... @rd_pg_rn
+SXTB 00000100 .. 010 000 101 ... ..... ..... @rd_pg_rn
+UXTB 00000100 .. 010 001 101 ... ..... ..... @rd_pg_rn
+SXTH 00000100 .. 010 010 101 ... ..... ..... @rd_pg_rn
+UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn
+SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn
+UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn
+
+### SVE Floating Point Compare - Vectors Group
+
+# SVE floating-point compare vectors
+FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm
+FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm
+FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm
+FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm
+FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm
+FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm
+FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm
+
+### SVE Integer Multiply-Add Group
+
+# SVE integer multiply-add writing addend (predicated)
+MLA 00000100 .. 0 ..... 010 ... ..... ..... @rda_pg_rn_rm
+MLS 00000100 .. 0 ..... 011 ... ..... ..... @rda_pg_rn_rm
+
+# SVE integer multiply-add writing multiplicand (predicated)
+MLA 00000100 .. 0 ..... 110 ... ..... ..... @rdn_pg_ra_rm # MAD
+MLS 00000100 .. 0 ..... 111 ... ..... ..... @rdn_pg_ra_rm # MSB
+
+### SVE Integer Arithmetic - Unpredicated Group
+
+# SVE integer add/subtract vectors (unpredicated)
+ADD_zzz 00000100 .. 1 ..... 000 000 ..... ..... @rd_rn_rm
+SUB_zzz 00000100 .. 1 ..... 000 001 ..... ..... @rd_rn_rm
+SQADD_zzz 00000100 .. 1 ..... 000 100 ..... ..... @rd_rn_rm
+UQADD_zzz 00000100 .. 1 ..... 000 101 ..... ..... @rd_rn_rm
+SQSUB_zzz 00000100 .. 1 ..... 000 110 ..... ..... @rd_rn_rm
+UQSUB_zzz 00000100 .. 1 ..... 000 111 ..... ..... @rd_rn_rm
+
+### SVE Logical - Unpredicated Group
+
+# SVE bitwise logical operations (unpredicated)
+AND_zzz 00000100 00 1 ..... 001 100 ..... ..... @rd_rn_rm_e0
+ORR_zzz 00000100 01 1 ..... 001 100 ..... ..... @rd_rn_rm_e0
+EOR_zzz 00000100 10 1 ..... 001 100 ..... ..... @rd_rn_rm_e0
+BIC_zzz 00000100 11 1 ..... 001 100 ..... ..... @rd_rn_rm_e0
+
+XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \
+ rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr
+
+# SVE2 bitwise ternary operations
+EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0
+BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
+BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0
+BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
+BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
+NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
+
+### SVE Index Generation Group
+
+# SVE index generation (immediate start, immediate increment)
+INDEX_ii 00000100 esz:2 1 imm2:s5 010000 imm1:s5 rd:5
+
+# SVE index generation (immediate start, register increment)
+INDEX_ir 00000100 esz:2 1 rm:5 010010 imm:s5 rd:5
+
+# SVE index generation (register start, immediate increment)
+INDEX_ri 00000100 esz:2 1 imm:s5 010001 rn:5 rd:5
+
+# SVE index generation (register start, register increment)
+INDEX_rr 00000100 .. 1 ..... 010011 ..... ..... @rd_rn_rm
+
+### SVE / Streaming SVE Stack Allocation Group
+
+# SVE stack frame adjustment
+ADDVL 00000100 001 ..... 01010 ...... ..... @rd_rn_i6
+ADDSVL 00000100 001 ..... 01011 ...... ..... @rd_rn_i6
+ADDPL 00000100 011 ..... 01010 ...... ..... @rd_rn_i6
+ADDSPL 00000100 011 ..... 01011 ...... ..... @rd_rn_i6
+
+# SVE stack frame size
+RDVL 00000100 101 11111 01010 imm:s6 rd:5
+RDSVL 00000100 101 11111 01011 imm:s6 rd:5
+
+### SVE Bitwise Shift - Unpredicated Group
+
+# SVE bitwise shift by immediate (unpredicated)
+ASR_zzi 00000100 .. 1 ..... 1001 00 ..... ..... @rd_rn_tszimm_shr
+LSR_zzi 00000100 .. 1 ..... 1001 01 ..... ..... @rd_rn_tszimm_shr
+LSL_zzi 00000100 .. 1 ..... 1001 11 ..... ..... @rd_rn_tszimm_shl
+
+# SVE bitwise shift by wide elements (unpredicated)
+# Note esz != 3
+ASR_zzw 00000100 .. 1 ..... 1000 00 ..... ..... @rd_rn_rm
+LSR_zzw 00000100 .. 1 ..... 1000 01 ..... ..... @rd_rn_rm
+LSL_zzw 00000100 .. 1 ..... 1000 11 ..... ..... @rd_rn_rm
+
+### SVE Compute Vector Address Group
+
+# SVE vector address generation
+ADR_s32 00000100 00 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm
+ADR_u32 00000100 01 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm
+ADR_p32 00000100 10 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm
+ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm
+
+### SVE Integer Misc - Unpredicated Group
+
+# SVE constructive prefix (unpredicated)
+MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5
+
+# SVE floating-point exponential accelerator
+# Note esz != 0
+FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn
+
+# SVE floating-point trig select coefficient
+# Note esz != 0
+FTSSEL 00000100 .. 1 ..... 101100 ..... ..... @rd_rn_rm
+
+### SVE Element Count Group
+
+# SVE element count
+CNT_r 00000100 .. 10 .... 1110 0 0 ..... ..... @incdec_cnt d=0 u=1
+
+# SVE inc/dec register by element count
+INCDEC_r 00000100 .. 11 .... 1110 0 d:1 ..... ..... @incdec_cnt u=1
+
+# SVE saturating inc/dec register by element count
+SINCDEC_r_32 00000100 .. 10 .... 1111 d:1 u:1 ..... ..... @incdec_cnt
+SINCDEC_r_64 00000100 .. 11 .... 1111 d:1 u:1 ..... ..... @incdec_cnt
+
+# SVE inc/dec vector by element count
+# Note this requires esz != 0.
+INCDEC_v 00000100 .. 1 1 .... 1100 0 d:1 ..... ..... @incdec2_cnt u=1
+
+# SVE saturating inc/dec vector by element count
+# Note these require esz != 0.
+SINCDEC_v 00000100 .. 1 0 .... 1100 d:1 u:1 ..... ..... @incdec2_cnt
+
+### SVE Bitwise Immediate Group
+
+# SVE bitwise logical with immediate (unpredicated)
+ORR_zzi 00000101 00 0000 ............. ..... @rdn_dbm
+EOR_zzi 00000101 01 0000 ............. ..... @rdn_dbm
+AND_zzi 00000101 10 0000 ............. ..... @rdn_dbm
+
+# SVE broadcast bitmask immediate
+DUPM 00000101 11 0000 dbm:13 rd:5
+
+### SVE Integer Wide Immediate - Predicated Group
+
+# SVE copy floating-point immediate (predicated)
+FCPY 00000101 .. 01 .... 110 imm:8 ..... @rdn_pg4
+
+# SVE copy integer immediate (predicated)
+{
+ INVALID 00000101 00 01 ---- 01 1 -------- -----
+ CPY_m_i 00000101 .. 01 .... 01 . ........ ..... @rdn_pg4 imm=%sh8_i8s
+}
+{
+ INVALID 00000101 00 01 ---- 00 1 -------- -----
+ CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s
+}
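+# In the two overlapping groups above, the INVALID pattern catches the
+# unallocated esz=0 case with the shift bit set (an 8-bit immediate
+# cannot be shifted left by 8 into byte elements); every other encoding
+# falls through to CPY_m_i / CPY_z_i.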
+
+### SVE Permute - Extract Group
+
+# SVE extract vector (destructive)
+EXT 00000101 001 ..... 000 ... rm:5 rd:5 \
+ &rrri rn=%reg_movprfx imm=%imm8_16_10
+
+# SVE2 extract vector (constructive)
+EXT_sve2 00000101 011 ..... 000 ... rn:5 rd:5 \
+ &rri imm=%imm8_16_10
+
+### SVE Permute - Unpredicated Group
+
+# SVE broadcast general register
+DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn
+
+# SVE broadcast indexed element
+DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \
+ &rri imm=%imm7_22_16
+
+# SVE insert SIMD&FP scalar register
+INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm
+
+# SVE insert general register
+INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm
+
+# SVE reverse vector elements
+REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn
+
+# SVE vector table lookup
+TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm
+
+# SVE unpack vector elements
+UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5
+
+# SVE2 Table Lookup (three sources)
+
+TBL_sve2 00000101 .. 1 ..... 001010 ..... ..... @rd_rn_rm
+TBX 00000101 .. 1 ..... 001011 ..... ..... @rd_rn_rm
+
+### SVE Permute - Predicates Group
+
+# SVE permute predicate elements
+ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm
+ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm
+UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm
+UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm
+TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm
+TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm
+
+# SVE reverse predicate elements
+REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... @pd_pn
+
+# SVE unpack predicate elements
+PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0
+PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0
+
+### SVE Permute - Interleaving Group
+
+# SVE permute vector elements
+ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm
+ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm
+UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm
+UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm
+TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm
+TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm
+
+# SVE2 permute vector segments
+ZIP1_q 00000101 10 1 ..... 000 000 ..... ..... @rd_rn_rm_e0
+ZIP2_q 00000101 10 1 ..... 000 001 ..... ..... @rd_rn_rm_e0
+UZP1_q 00000101 10 1 ..... 000 010 ..... ..... @rd_rn_rm_e0
+UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0
+TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0
+TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0
+
+### SVE Permute - Predicated Group
+
+# SVE compress active elements
+# Note esz >= 2
+COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn
+
+# SVE conditionally broadcast element to vector
+CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm
+CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm
+
+# SVE conditionally copy element to SIMD&FP scalar
+CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn
+CLASTB_v 00000101 .. 10101 1 100 ... ..... ..... @rd_pg_rn
+
+# SVE conditionally copy element to general register
+CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn
+CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... @rd_pg_rn
+
+# SVE copy element to SIMD&FP scalar register
+LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn
+LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn
+
+# SVE copy element to general register
+LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn
+LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn
+
+# SVE copy element from SIMD&FP scalar register
+CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn
+
+# SVE copy element from general register to vector (predicated)
+CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn
+
+# SVE reverse within elements
+# Note esz >= operation size
+REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn
+REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn
+REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn
+RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn
+REVD 00000101 00 1011 10 100 ... ..... ..... @rd_pg_rn_e0
+
+# SVE vector splice (predicated, destructive)
+SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm
+
+# SVE2 vector splice (predicated, constructive)
+SPLICE_sve2 00000101 .. 101 101 100 ... ..... ..... @rd_pg_rn
+
+### SVE Select Vectors Group
+
+# SVE select vector elements (predicated)
+SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm
+
+### SVE Integer Compare - Vectors Group
+
+# SVE integer compare vectors
+CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm
+CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm
+CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
+CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
+CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm
+CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm
+
+# SVE integer compare with wide elements
+# Note these require esz != 3.
+CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_rm
+CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm
+CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm
+CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm
+CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm
+CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm
+CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm
+CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm
+CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm
+CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm
+
+### SVE Integer Compare - Unsigned Immediate Group
+
+# SVE integer compare with unsigned immediate
+CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7
+CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7
+CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7
+CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7
+
+### SVE Integer Compare - Signed Immediate Group
+
+# SVE integer compare with signed immediate
+CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5
+CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5
+CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5
+CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5
+CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5
+CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5
+
+### SVE Predicate Logical Operations Group
+
+# SVE predicate logical operations
+AND_pppp 00100101 0. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s
+BIC_pppp 00100101 0. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s
+EOR_pppp 00100101 0. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s
+SEL_pppp 00100101 0. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s
+ORR_pppp 00100101 1. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s
+ORN_pppp 00100101 1. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s
+NOR_pppp 00100101 1. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s
+NAND_pppp 00100101 1. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s
+
+### SVE Predicate Misc Group
+
+# SVE predicate test
+PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000
+
+# SVE predicate initialize
+PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4
+
+# SVE initialize FFR
+SETFFR 00100101 0010 1100 1001 0000 0000 0000
+
+# SVE zero predicate register
+PFALSE 00100101 0001 1000 1110 0100 0000 rd:4
+
+# SVE predicate read from FFR (predicated)
+RDFFR_p 00100101 0 s:1 0110001111000 pg:4 0 rd:4
+
+# SVE predicate read from FFR (unpredicated)
+RDFFR 00100101 0001 1001 1111 0000 0000 rd:4
+
+# SVE FFR write from predicate (WRFFR)
+WRFFR 00100101 0010 1000 1001 000 rn:4 00000
+
+# SVE predicate first active
+PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0
+
+# SVE predicate next active
+PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn
+
+### SVE Partition Break Group
+
+# SVE propagate break from previous partition
+BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... @pd_pg_pn_pm_s
+BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s
+
+# SVE partition break condition
+BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s
+BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s
+BRKA_m 00100101 00 01000001 .... 0 .... 1 .... @pd_pg_pn_s0
+BRKB_m 00100101 10 01000001 .... 0 .... 1 .... @pd_pg_pn_s0
+
+# SVE propagate break to next partition
+BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s
+
+### SVE Predicate Count Group
+
+# SVE predicate count
+CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn
+
+# SVE inc/dec register by predicate count
+INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1
+
+# SVE inc/dec vector by predicate count
+INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1
+
+# SVE saturating inc/dec register by predicate count
+SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred
+SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred
+
+# SVE saturating inc/dec vector by predicate count
+SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred
+
+### SVE Integer Compare - Scalars Group
+
+# SVE conditionally terminate scalars
+CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000
+
+# SVE integer compare scalar count and limit
+WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4
+
+# SVE2 pointer conflict compare
+WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4
+
+### SVE Integer Wide Immediate - Unpredicated Group
+
+# SVE broadcast floating-point immediate (unpredicated)
+FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5
+
+# SVE broadcast integer immediate (unpredicated)
+{
+ INVALID 00100101 00 111 00 011 1 -------- -----
+ DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s
+}
+
+# SVE integer add/subtract immediate (unpredicated)
+{
+ INVALID 00100101 00 100 000 11 1 -------- -----
+ ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 001 11 1 -------- -----
+ SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 011 11 1 -------- -----
+ SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 100 11 1 -------- -----
+ SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 101 11 1 -------- -----
+ UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 110 11 1 -------- -----
+ SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u
+}
+{
+ INVALID 00100101 00 100 111 11 1 -------- -----
+ UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u
+}
+
+# SVE integer min/max immediate (unpredicated)
+SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s
+UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u
+SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s
+UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u
+
+# SVE integer multiply immediate (unpredicated)
+MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s
+
+# SVE integer dot product (unpredicated)
+DOT_zzzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 \
+ ra=%reg_movprfx
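+# Here sz selects between the 32-bit form (.S accumulator, .B sources)
+# and the 64-bit form (.D accumulator, .H sources), while u selects
+# UDOT over SDOT.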
+
+# SVE2 complex dot product (vectors)
+CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx
+
+#### SVE Multiply - Indexed
+
+# SVE integer dot product (indexed)
+SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2
+SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3
+UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2
+UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3
+
+# SVE2 integer multiply-add (indexed)
+MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1
+MLA_zzxz_s 01000100 10 1 ..... 000010 ..... ..... @rrxr_2 esz=2
+MLA_zzxz_d 01000100 11 1 ..... 000010 ..... ..... @rrxr_1 esz=3
+MLS_zzxz_h 01000100 0. 1 ..... 000011 ..... ..... @rrxr_3 esz=1
+MLS_zzxz_s 01000100 10 1 ..... 000011 ..... ..... @rrxr_2 esz=2
+MLS_zzxz_d 01000100 11 1 ..... 000011 ..... ..... @rrxr_1 esz=3
+
+# SVE2 saturating multiply-add high (indexed)
+SQRDMLAH_zzxz_h 01000100 0. 1 ..... 000100 ..... ..... @rrxr_3 esz=1
+SQRDMLAH_zzxz_s 01000100 10 1 ..... 000100 ..... ..... @rrxr_2 esz=2
+SQRDMLAH_zzxz_d 01000100 11 1 ..... 000100 ..... ..... @rrxr_1 esz=3
+SQRDMLSH_zzxz_h 01000100 0. 1 ..... 000101 ..... ..... @rrxr_3 esz=1
+SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2
+SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3
+
+# SVE mixed sign dot product (indexed)
+USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2
+SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2
+
+# SVE2 saturating multiply-add (indexed)
+SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2
+SQDMLALB_zzxw_d 01000100 11 1 ..... 0010.0 ..... ..... @rrxr_2a esz=3
+SQDMLALT_zzxw_s 01000100 10 1 ..... 0010.1 ..... ..... @rrxr_3a esz=2
+SQDMLALT_zzxw_d 01000100 11 1 ..... 0010.1 ..... ..... @rrxr_2a esz=3
+SQDMLSLB_zzxw_s 01000100 10 1 ..... 0011.0 ..... ..... @rrxr_3a esz=2
+SQDMLSLB_zzxw_d 01000100 11 1 ..... 0011.0 ..... ..... @rrxr_2a esz=3
+SQDMLSLT_zzxw_s 01000100 10 1 ..... 0011.1 ..... ..... @rrxr_3a esz=2
+SQDMLSLT_zzxw_d 01000100 11 1 ..... 0011.1 ..... ..... @rrxr_2a esz=3
+
+# SVE2 complex integer dot product (indexed)
+CDOT_zzxw_s 01000100 10 1 index:2 rm:3 0100 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+CDOT_zzxw_d 01000100 11 1 index:1 rm:4 0100 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+
+# SVE2 complex integer multiply-add (indexed)
+CMLA_zzxz_h 01000100 10 1 index:2 rm:3 0110 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+CMLA_zzxz_s 01000100 11 1 index:1 rm:4 0110 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+
+# SVE2 complex saturating integer multiply-add (indexed)
+SQRDCMLAH_zzxz_h 01000100 10 1 index:2 rm:3 0111 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+SQRDCMLAH_zzxz_s 01000100 11 1 index:1 rm:4 0111 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx
+
+# SVE2 multiply-add long (indexed)
+SMLALB_zzxw_s 01000100 10 1 ..... 1000.0 ..... ..... @rrxr_3a esz=2
+SMLALB_zzxw_d 01000100 11 1 ..... 1000.0 ..... ..... @rrxr_2a esz=3
+SMLALT_zzxw_s 01000100 10 1 ..... 1000.1 ..... ..... @rrxr_3a esz=2
+SMLALT_zzxw_d 01000100 11 1 ..... 1000.1 ..... ..... @rrxr_2a esz=3
+UMLALB_zzxw_s 01000100 10 1 ..... 1001.0 ..... ..... @rrxr_3a esz=2
+UMLALB_zzxw_d 01000100 11 1 ..... 1001.0 ..... ..... @rrxr_2a esz=3
+UMLALT_zzxw_s 01000100 10 1 ..... 1001.1 ..... ..... @rrxr_3a esz=2
+UMLALT_zzxw_d 01000100 11 1 ..... 1001.1 ..... ..... @rrxr_2a esz=3
+SMLSLB_zzxw_s 01000100 10 1 ..... 1010.0 ..... ..... @rrxr_3a esz=2
+SMLSLB_zzxw_d 01000100 11 1 ..... 1010.0 ..... ..... @rrxr_2a esz=3
+SMLSLT_zzxw_s 01000100 10 1 ..... 1010.1 ..... ..... @rrxr_3a esz=2
+SMLSLT_zzxw_d 01000100 11 1 ..... 1010.1 ..... ..... @rrxr_2a esz=3
+UMLSLB_zzxw_s 01000100 10 1 ..... 1011.0 ..... ..... @rrxr_3a esz=2
+UMLSLB_zzxw_d 01000100 11 1 ..... 1011.0 ..... ..... @rrxr_2a esz=3
+UMLSLT_zzxw_s 01000100 10 1 ..... 1011.1 ..... ..... @rrxr_3a esz=2
+UMLSLT_zzxw_d 01000100 11 1 ..... 1011.1 ..... ..... @rrxr_2a esz=3
+
+# SVE2 integer multiply long (indexed)
+SMULLB_zzx_s 01000100 10 1 ..... 1100.0 ..... ..... @rrx_3a esz=2
+SMULLB_zzx_d 01000100 11 1 ..... 1100.0 ..... ..... @rrx_2a esz=3
+SMULLT_zzx_s 01000100 10 1 ..... 1100.1 ..... ..... @rrx_3a esz=2
+SMULLT_zzx_d 01000100 11 1 ..... 1100.1 ..... ..... @rrx_2a esz=3
+UMULLB_zzx_s 01000100 10 1 ..... 1101.0 ..... ..... @rrx_3a esz=2
+UMULLB_zzx_d 01000100 11 1 ..... 1101.0 ..... ..... @rrx_2a esz=3
+UMULLT_zzx_s 01000100 10 1 ..... 1101.1 ..... ..... @rrx_3a esz=2
+UMULLT_zzx_d 01000100 11 1 ..... 1101.1 ..... ..... @rrx_2a esz=3
+
+# SVE2 saturating multiply (indexed)
+SQDMULLB_zzx_s 01000100 10 1 ..... 1110.0 ..... ..... @rrx_3a esz=2
+SQDMULLB_zzx_d 01000100 11 1 ..... 1110.0 ..... ..... @rrx_2a esz=3
+SQDMULLT_zzx_s 01000100 10 1 ..... 1110.1 ..... ..... @rrx_3a esz=2
+SQDMULLT_zzx_d 01000100 11 1 ..... 1110.1 ..... ..... @rrx_2a esz=3
+
+# SVE2 saturating multiply high (indexed)
+SQDMULH_zzx_h 01000100 0. 1 ..... 111100 ..... ..... @rrx_3 esz=1
+SQDMULH_zzx_s 01000100 10 1 ..... 111100 ..... ..... @rrx_2 esz=2
+SQDMULH_zzx_d 01000100 11 1 ..... 111100 ..... ..... @rrx_1 esz=3
+SQRDMULH_zzx_h 01000100 0. 1 ..... 111101 ..... ..... @rrx_3 esz=1
+SQRDMULH_zzx_s 01000100 10 1 ..... 111101 ..... ..... @rrx_2 esz=2
+SQRDMULH_zzx_d 01000100 11 1 ..... 111101 ..... ..... @rrx_1 esz=3
+
+# SVE2 integer multiply (indexed)
+MUL_zzx_h 01000100 0. 1 ..... 111110 ..... ..... @rrx_3 esz=1
+MUL_zzx_s 01000100 10 1 ..... 111110 ..... ..... @rrx_2 esz=2
+MUL_zzx_d 01000100 11 1 ..... 111110 ..... ..... @rrx_1 esz=3
+
+# SVE floating-point complex add (predicated)
+FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \
+ rn=%reg_movprfx
+
+# SVE floating-point complex multiply-add (predicated)
+FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \
+ ra=%reg_movprfx
+
+# SVE floating-point complex multiply-add (indexed)
+FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx esz=1
+FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx esz=2
+
+### SVE FP Multiply-Add Indexed Group
+
+# SVE floating-point multiply-add (indexed)
+FMLA_zzxz 01100100 0. 1 ..... 000000 ..... ..... @rrxr_3 esz=1
+FMLA_zzxz 01100100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2
+FMLA_zzxz 01100100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3
+FMLS_zzxz 01100100 0. 1 ..... 000001 ..... ..... @rrxr_3 esz=1
+FMLS_zzxz 01100100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2
+FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3
+
+### SVE FP Multiply Indexed Group
+
+# SVE floating-point multiply (indexed)
+FMUL_zzx 01100100 0. 1 ..... 001000 ..... ..... @rrx_3 esz=1
+FMUL_zzx 01100100 10 1 ..... 001000 ..... ..... @rrx_2 esz=2
+FMUL_zzx 01100100 11 1 ..... 001000 ..... ..... @rrx_1 esz=3
+
+### SVE FP Fast Reduction Group
+
+FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn
+FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn
+FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn
+FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn
+FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn
+
+## SVE Floating Point Unary Operations - Unpredicated Group
+
+FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn
+FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn
+
+### SVE FP Compare with Zero Group
+
+FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn
+FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn
+FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn
+FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn
+FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn
+FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn
+
+### SVE FP Accumulating Reduction Group
+
+# SVE floating-point serial reduction (predicated)
+FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm
+
+### SVE Floating Point Arithmetic - Unpredicated Group
+
+# SVE floating-point arithmetic (unpredicated)
+FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... @rd_rn_rm
+FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm
+FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm
+FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm
+FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm
+FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm
+
+### SVE FP Arithmetic Predicated Group
+
+# SVE floating-point arithmetic (predicated)
+FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm
+FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR
+FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm
+FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm
+FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm
+FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm
+FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm
+FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm
+FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm
+FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR
+FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm
+
+# SVE floating-point arithmetic with immediate (predicated)
+FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1
+FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1
+FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1
+FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1
+FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1
+FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1
+FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1
+FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1
+
+# SVE floating-point trig multiply-add coefficient
+FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx
+
+### SVE FP Multiply-Add Group
+
+# SVE floating-point multiply-accumulate writing addend
+FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm
+FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm
+FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm
+FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm
+
+# SVE floating-point multiply-accumulate writing multiplicand
+# Alter the operand extraction order and reuse the helpers from above.
+# FMAD, FMSB, FNMAD, FNMSB
+FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra
+FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra
+FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra
+FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra
+
+### SVE FP Unary Operations Predicated Group
+
+# SVE floating-point convert precision
+FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+BFCVT 01100101 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
+
+# SVE floating-point convert to integer
+FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hd 01100101 01 011 11 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0
+
+# SVE floating-point round to integral value
+FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn
+FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn
+FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn
+FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn
+FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn
+FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn
+FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn
+
+# SVE floating-point unary operations
+FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn
+FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn
+
+# SVE integer convert to floating-point
+SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0
+
+UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0
+
+### SVE Memory - 32-bit Gather and Unsized Contiguous Group
+
+# SVE load predicate register
+LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9
+
+# SVE load vector register
+LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9
+
+# SVE load and broadcast element
+LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \
+ &rpri_load dtype=%dtype_23_13 nreg=0
+
+# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets)
+# SVE 32-bit gather load (scalar plus 32-bit scaled offsets)
+LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u esz=2 msz=0 scale=0
+LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=2 msz=1
+LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \
+ @rprr_g_load_xs_sc esz=2 msz=2 u=1
+
+# SVE 32-bit gather load (vector plus immediate)
+LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \
+ @rpri_g_load esz=2
+
+### SVE Memory Contiguous Load Group
+
+# SVE contiguous load (scalar plus scalar)
+LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0
+
+# SVE contiguous first-fault load (scalar plus scalar)
+LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0
+
+# SVE contiguous load (scalar plus immediate)
+LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0
+
+# SVE contiguous non-fault load (scalar plus immediate)
+LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0
+
+# SVE contiguous non-temporal load (scalar plus scalar)
+# LDNT1B, LDNT1H, LDNT1W, LDNT1D
+# SVE load multiple structures (scalar plus scalar)
+# LD2B, LD2H, LD2W, LD2D; etc.
+LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz
+
+# SVE contiguous non-temporal load (scalar plus immediate)
+# LDNT1B, LDNT1H, LDNT1W, LDNT1D
+# SVE load multiple structures (scalar plus immediate)
+# LD2B, LD2H, LD2W, LD2D; etc.
+LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz
+
+# SVE load and broadcast quadword (scalar plus scalar)
+LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \
+ @rprr_load_msz nreg=0
+LD1RO_zprr 1010010 .. 01 ..... 000 ... ..... ..... \
+ @rprr_load_msz nreg=0
+
+# SVE load and broadcast quadword (scalar plus immediate)
+# LD1RQB, LD1RQH, LD1RQS, LD1RQD
+LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \
+ @rpri_load_msz nreg=0
+LD1RO_zpri 1010010 .. 01 0.... 001 ... ..... ..... \
+ @rpri_load_msz nreg=0
+
+# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
+PRF_ns 1000010 00 -1 ----- 0-- --- ----- 0 ----
+
+# SVE 32-bit gather prefetch (vector plus immediate)
+PRF_ns 1000010 -- 00 ----- 111 --- ----- 0 ----
+
+# SVE contiguous prefetch (scalar plus immediate)
+PRF 1000010 11 1- ----- 0-- --- ----- 0 ----
+
+# SVE contiguous prefetch (scalar plus scalar)
+PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ----
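+# Note: the PRF* patterns here (and in the 64-bit gather group below)
+# only need to validate the encodings; within QEMU the prefetch itself
+# is expected to be a no-op beyond the usual SVE access check.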
+
+### SVE Memory 64-bit Gather Group
+
+# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets)
+# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets)
+LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u esz=3 msz=0 scale=0
+LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=3 msz=1
+LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=3 msz=2
+LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \
+ @rprr_g_load_xs_sc esz=3 msz=3 u=1
+
+# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets)
+# SVE 64-bit gather load (scalar plus 64-bit scaled offsets)
+LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \
+ @rprr_g_load_u esz=3 msz=0 scale=0
+LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \
+ @rprr_g_load_u_sc esz=3 msz=1
+LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \
+ @rprr_g_load_u_sc esz=3 msz=2
+LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \
+ @rprr_g_load_sc esz=3 msz=3 u=1
+
+# SVE 64-bit gather load (vector plus immediate)
+LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \
+ @rpri_g_load esz=3
+
+# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets)
+PRF_ns 1100010 00 11 ----- 1-- --- ----- 0 ----
+
+# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets)
+PRF_ns 1100010 00 -1 ----- 0-- --- ----- 0 ----
+
+# SVE 64-bit gather prefetch (vector plus immediate)
+PRF_ns 1100010 -- 00 ----- 111 --- ----- 0 ----
+
+### SVE Memory Store Group
+
+# SVE store predicate register
+STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9
+
+# SVE store vector register
+STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9
+
+# SVE contiguous store (scalar plus immediate)
+# ST1B, ST1H, ST1W, ST1D; require msz <= esz
+ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \
+ @rpri_store_msz nreg=0
+
+# SVE contiguous store (scalar plus scalar)
+# ST1B, ST1H, ST1W, ST1D; require msz <= esz
+# Enumerate msz lest we conflict with STR_zri.
+ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=0
+ST_zprr 1110010 01 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=1
+ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=2
+ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \
+ @rprr_store msz=3 esz=3 nreg=0
+
+# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0)
+# SVE store multiple structures (scalar plus immediate) (nreg != 0)
+ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \
+ @rpri_store_msz esz=%size_23
+
+# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0)
+# SVE store multiple structures (scalar plus scalar) (nreg != 0)
+ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \
+ @rprr_store esz=%size_23
+
+# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets)
+# Require msz > 0 && msz <= esz.
+ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=2 scale=1
+ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=2 scale=1
+
+# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets)
+# Require msz <= esz.
+ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=2 scale=0
+ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=2 scale=0
+
+# SVE 64-bit scatter store (scalar plus 64-bit scaled offset)
+# Require msz > 0
+ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \
+ @rprr_scatter_store xs=2 esz=3 scale=1
+
+# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset)
+ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \
+ @rprr_scatter_store xs=2 esz=3 scale=0
+
+# SVE 64-bit scatter store (vector plus immediate)
+ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \
+ @rpri_scatter_store esz=3
+
+# SVE 32-bit scatter store (vector plus immediate)
+ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \
+ @rpri_scatter_store esz=2
+
+# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset)
+# Require msz > 0
+ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=3 scale=1
+ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=3 scale=1
+
+# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset)
+ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=3 scale=0
+ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=3 scale=0
+
+#### SVE2 Support
+
+### SVE2 Integer Multiply - Unpredicated
+
+# SVE2 integer multiply vectors (unpredicated)
+MUL_zzz 00000100 .. 1 ..... 0110 00 ..... ..... @rd_rn_rm
+SMULH_zzz 00000100 .. 1 ..... 0110 10 ..... ..... @rd_rn_rm
+UMULH_zzz 00000100 .. 1 ..... 0110 11 ..... ..... @rd_rn_rm
+PMUL_zzz 00000100 00 1 ..... 0110 01 ..... ..... @rd_rn_rm_e0
+
+# SVE2 signed saturating doubling multiply high (unpredicated)
+SQDMULH_zzz 00000100 .. 1 ..... 0111 00 ..... ..... @rd_rn_rm
+SQRDMULH_zzz 00000100 .. 1 ..... 0111 01 ..... ..... @rd_rn_rm
+
+### SVE2 Integer - Predicated
+
+SADALP_zpzz 01000100 .. 000 100 101 ... ..... ..... @rdm_pg_rn
+UADALP_zpzz 01000100 .. 000 101 101 ... ..... ..... @rdm_pg_rn
+
+### SVE2 integer unary operations (predicated)
+
+URECPE 01000100 .. 000 000 101 ... ..... ..... @rd_pg_rn
+URSQRTE 01000100 .. 000 001 101 ... ..... ..... @rd_pg_rn
+SQABS 01000100 .. 001 000 101 ... ..... ..... @rd_pg_rn
+SQNEG 01000100 .. 001 001 101 ... ..... ..... @rd_pg_rn
+
+### SVE2 saturating/rounding bitwise shift left (predicated)
+
+SRSHL 01000100 .. 000 010 100 ... ..... ..... @rdn_pg_rm
+URSHL 01000100 .. 000 011 100 ... ..... ..... @rdn_pg_rm
+SRSHL 01000100 .. 000 110 100 ... ..... ..... @rdm_pg_rn # SRSHLR
+URSHL 01000100 .. 000 111 100 ... ..... ..... @rdm_pg_rn # URSHLR
+
+SQSHL 01000100 .. 001 000 100 ... ..... ..... @rdn_pg_rm
+UQSHL 01000100 .. 001 001 100 ... ..... ..... @rdn_pg_rm
+SQSHL 01000100 .. 001 100 100 ... ..... ..... @rdm_pg_rn # SQSHLR
+UQSHL 01000100 .. 001 101 100 ... ..... ..... @rdm_pg_rn # UQSHLR
+
+SQRSHL 01000100 .. 001 010 100 ... ..... ..... @rdn_pg_rm
+UQRSHL 01000100 .. 001 011 100 ... ..... ..... @rdn_pg_rm
+SQRSHL 01000100 .. 001 110 100 ... ..... ..... @rdm_pg_rn # SQRSHLR
+UQRSHL 01000100 .. 001 111 100 ... ..... ..... @rdm_pg_rn # UQRSHLR
+
+### SVE2 integer halving add/subtract (predicated)
+
+SHADD 01000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm
+UHADD 01000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm
+SHSUB 01000100 .. 010 010 100 ... ..... ..... @rdn_pg_rm
+UHSUB 01000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm
+SRHADD 01000100 .. 010 100 100 ... ..... ..... @rdn_pg_rm
+URHADD 01000100 .. 010 101 100 ... ..... ..... @rdn_pg_rm
+SHSUB 01000100 .. 010 110 100 ... ..... ..... @rdm_pg_rn # SHSUBR
+UHSUB 01000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # UHSUBR
+
+### SVE2 integer pairwise arithmetic
+
+ADDP 01000100 .. 010 001 101 ... ..... ..... @rdn_pg_rm
+SMAXP 01000100 .. 010 100 101 ... ..... ..... @rdn_pg_rm
+UMAXP 01000100 .. 010 101 101 ... ..... ..... @rdn_pg_rm
+SMINP 01000100 .. 010 110 101 ... ..... ..... @rdn_pg_rm
+UMINP 01000100 .. 010 111 101 ... ..... ..... @rdn_pg_rm
+
+### SVE2 saturating add/subtract (predicated)
+
+SQADD_zpzz 01000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm
+UQADD_zpzz 01000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm
+SQSUB_zpzz 01000100 .. 011 010 100 ... ..... ..... @rdn_pg_rm
+UQSUB_zpzz 01000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm
+SUQADD 01000100 .. 011 100 100 ... ..... ..... @rdn_pg_rm
+USQADD 01000100 .. 011 101 100 ... ..... ..... @rdn_pg_rm
+SQSUB_zpzz 01000100 .. 011 110 100 ... ..... ..... @rdm_pg_rn # SQSUBR
+UQSUB_zpzz 01000100 .. 011 111 100 ... ..... ..... @rdm_pg_rn # UQSUBR
+
+#### SVE2 Widening Integer Arithmetic
+
+## SVE2 integer add/subtract long
+
+SADDLB 01000101 .. 0 ..... 00 0000 ..... ..... @rd_rn_rm
+SADDLT 01000101 .. 0 ..... 00 0001 ..... ..... @rd_rn_rm
+UADDLB 01000101 .. 0 ..... 00 0010 ..... ..... @rd_rn_rm
+UADDLT 01000101 .. 0 ..... 00 0011 ..... ..... @rd_rn_rm
+
+SSUBLB 01000101 .. 0 ..... 00 0100 ..... ..... @rd_rn_rm
+SSUBLT 01000101 .. 0 ..... 00 0101 ..... ..... @rd_rn_rm
+USUBLB 01000101 .. 0 ..... 00 0110 ..... ..... @rd_rn_rm
+USUBLT 01000101 .. 0 ..... 00 0111 ..... ..... @rd_rn_rm
+
+SABDLB 01000101 .. 0 ..... 00 1100 ..... ..... @rd_rn_rm
+SABDLT 01000101 .. 0 ..... 00 1101 ..... ..... @rd_rn_rm
+UABDLB 01000101 .. 0 ..... 00 1110 ..... ..... @rd_rn_rm
+UABDLT 01000101 .. 0 ..... 00 1111 ..... ..... @rd_rn_rm
+
+## SVE2 integer add/subtract interleaved long
+
+SADDLBT 01000101 .. 0 ..... 1000 00 ..... ..... @rd_rn_rm
+SSUBLBT 01000101 .. 0 ..... 1000 10 ..... ..... @rd_rn_rm
+SSUBLTB 01000101 .. 0 ..... 1000 11 ..... ..... @rd_rn_rm
+
+## SVE2 integer add/subtract wide
+
+SADDWB 01000101 .. 0 ..... 010 000 ..... ..... @rd_rn_rm
+SADDWT 01000101 .. 0 ..... 010 001 ..... ..... @rd_rn_rm
+UADDWB 01000101 .. 0 ..... 010 010 ..... ..... @rd_rn_rm
+UADDWT 01000101 .. 0 ..... 010 011 ..... ..... @rd_rn_rm
+
+SSUBWB 01000101 .. 0 ..... 010 100 ..... ..... @rd_rn_rm
+SSUBWT 01000101 .. 0 ..... 010 101 ..... ..... @rd_rn_rm
+USUBWB 01000101 .. 0 ..... 010 110 ..... ..... @rd_rn_rm
+USUBWT 01000101 .. 0 ..... 010 111 ..... ..... @rd_rn_rm
+
+## SVE2 integer multiply long
+
+SQDMULLB_zzz 01000101 .. 0 ..... 011 000 ..... ..... @rd_rn_rm
+SQDMULLT_zzz 01000101 .. 0 ..... 011 001 ..... ..... @rd_rn_rm
+PMULLB 01000101 .. 0 ..... 011 010 ..... ..... @rd_rn_rm
+PMULLT 01000101 .. 0 ..... 011 011 ..... ..... @rd_rn_rm
+SMULLB_zzz 01000101 .. 0 ..... 011 100 ..... ..... @rd_rn_rm
+SMULLT_zzz 01000101 .. 0 ..... 011 101 ..... ..... @rd_rn_rm
+UMULLB_zzz 01000101 .. 0 ..... 011 110 ..... ..... @rd_rn_rm
+UMULLT_zzz 01000101 .. 0 ..... 011 111 ..... ..... @rd_rn_rm
+
+## SVE2 bitwise shift left long
+
+# Note bit23 == 0 is handled by esz > 0 in do_sve2_shll_tb.
+SSHLLB 01000101 .. 0 ..... 1010 00 ..... ..... @rd_rn_tszimm_shl
+SSHLLT 01000101 .. 0 ..... 1010 01 ..... ..... @rd_rn_tszimm_shl
+USHLLB 01000101 .. 0 ..... 1010 10 ..... ..... @rd_rn_tszimm_shl
+USHLLT 01000101 .. 0 ..... 1010 11 ..... ..... @rd_rn_tszimm_shl
+
+## SVE2 bitwise exclusive-or interleaved
+
+EORBT 01000101 .. 0 ..... 10010 0 ..... ..... @rd_rn_rm
+EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm
+
+## SVE integer matrix multiply accumulate
+
+SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
+USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
+UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
+
+## SVE2 bitwise permute
+
+BEXT 01000101 .. 0 ..... 1011 00 ..... ..... @rd_rn_rm
+BDEP 01000101 .. 0 ..... 1011 01 ..... ..... @rd_rn_rm
+BGRP 01000101 .. 0 ..... 1011 10 ..... ..... @rd_rn_rm
+
+#### SVE2 Accumulate
+
+## SVE2 complex integer add
+
+CADD_rot90 01000101 .. 00000 0 11011 0 ..... ..... @rdn_rm
+CADD_rot270 01000101 .. 00000 0 11011 1 ..... ..... @rdn_rm
+SQCADD_rot90 01000101 .. 00000 1 11011 0 ..... ..... @rdn_rm
+SQCADD_rot270 01000101 .. 00000 1 11011 1 ..... ..... @rdn_rm
+
+## SVE2 integer absolute difference and accumulate long
+
+SABALB 01000101 .. 0 ..... 1100 00 ..... ..... @rda_rn_rm
+SABALT 01000101 .. 0 ..... 1100 01 ..... ..... @rda_rn_rm
+UABALB 01000101 .. 0 ..... 1100 10 ..... ..... @rda_rn_rm
+UABALT 01000101 .. 0 ..... 1100 11 ..... ..... @rda_rn_rm
+
+## SVE2 integer add/subtract long with carry
+
+# ADC and SBC decoded via size in helper dispatch.
+ADCLB 01000101 .. 0 ..... 11010 0 ..... ..... @rda_rn_rm
+ADCLT 01000101 .. 0 ..... 11010 1 ..... ..... @rda_rn_rm
+
+## SVE2 bitwise shift right and accumulate
+
+# TODO: Use @rda and %reg_movprfx here.
+SSRA 01000101 .. 0 ..... 1110 00 ..... ..... @rd_rn_tszimm_shr
+USRA 01000101 .. 0 ..... 1110 01 ..... ..... @rd_rn_tszimm_shr
+SRSRA 01000101 .. 0 ..... 1110 10 ..... ..... @rd_rn_tszimm_shr
+URSRA 01000101 .. 0 ..... 1110 11 ..... ..... @rd_rn_tszimm_shr
+
+## SVE2 bitwise shift and insert
+
+SRI 01000101 .. 0 ..... 11110 0 ..... ..... @rd_rn_tszimm_shr
+SLI 01000101 .. 0 ..... 11110 1 ..... ..... @rd_rn_tszimm_shl
+
+## SVE2 integer absolute difference and accumulate
+
+# TODO: Use @rda and %reg_movprfx here.
+SABA 01000101 .. 0 ..... 11111 0 ..... ..... @rd_rn_rm
+UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm
+
+#### SVE2 Narrowing
+
+## SVE2 saturating extract narrow
+
+# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0.
+SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl
+SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl
+UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl
+UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl
+SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl
+SQXTUNT 01000101 .. 1 ..... 010 101 ..... ..... @rd_rn_tszimm_shl
+
+## SVE2 bitwise shift right narrow
+
+# Bit 23 == 0 is handled by esz > 0 in the translator.
+SQSHRUNB 01000101 .. 1 ..... 00 0000 ..... ..... @rd_rn_tszimm_shr
+SQSHRUNT 01000101 .. 1 ..... 00 0001 ..... ..... @rd_rn_tszimm_shr
+SQRSHRUNB 01000101 .. 1 ..... 00 0010 ..... ..... @rd_rn_tszimm_shr
+SQRSHRUNT 01000101 .. 1 ..... 00 0011 ..... ..... @rd_rn_tszimm_shr
+SHRNB 01000101 .. 1 ..... 00 0100 ..... ..... @rd_rn_tszimm_shr
+SHRNT 01000101 .. 1 ..... 00 0101 ..... ..... @rd_rn_tszimm_shr
+RSHRNB 01000101 .. 1 ..... 00 0110 ..... ..... @rd_rn_tszimm_shr
+RSHRNT 01000101 .. 1 ..... 00 0111 ..... ..... @rd_rn_tszimm_shr
+SQSHRNB 01000101 .. 1 ..... 00 1000 ..... ..... @rd_rn_tszimm_shr
+SQSHRNT 01000101 .. 1 ..... 00 1001 ..... ..... @rd_rn_tszimm_shr
+SQRSHRNB 01000101 .. 1 ..... 00 1010 ..... ..... @rd_rn_tszimm_shr
+SQRSHRNT 01000101 .. 1 ..... 00 1011 ..... ..... @rd_rn_tszimm_shr
+UQSHRNB 01000101 .. 1 ..... 00 1100 ..... ..... @rd_rn_tszimm_shr
+UQSHRNT 01000101 .. 1 ..... 00 1101 ..... ..... @rd_rn_tszimm_shr
+UQRSHRNB 01000101 .. 1 ..... 00 1110 ..... ..... @rd_rn_tszimm_shr
+UQRSHRNT 01000101 .. 1 ..... 00 1111 ..... ..... @rd_rn_tszimm_shr
+
+## SVE2 integer add/subtract narrow high part
+
+ADDHNB 01000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm
+ADDHNT 01000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm
+RADDHNB 01000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm
+RADDHNT 01000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm
+SUBHNB 01000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm
+SUBHNT 01000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm
+RSUBHNB 01000101 .. 1 ..... 011 110 ..... ..... @rd_rn_rm
+RSUBHNT 01000101 .. 1 ..... 011 111 ..... ..... @rd_rn_rm
+
+### SVE2 Character Match
+
+MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
+NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
+
+### SVE2 Histogram Computation
+
+HISTCNT 01000101 .. 1 ..... 110 ... ..... ..... @rd_pg_rn_rm
+HISTSEG 01000101 .. 1 ..... 101 000 ..... ..... @rd_rn_rm
+
+## SVE2 floating-point pairwise operations
+
+FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm
+FMAXNMP 01100100 .. 010 10 0 100 ... ..... ..... @rdn_pg_rm
+FMINNMP 01100100 .. 010 10 1 100 ... ..... ..... @rdn_pg_rm
+FMAXP 01100100 .. 010 11 0 100 ... ..... ..... @rdn_pg_rm
+FMINP 01100100 .. 010 11 1 100 ... ..... ..... @rdn_pg_rm
+
+#### SVE Integer Multiply-Add (unpredicated)
+
+## SVE2 saturating multiply-add long
+
+SQDMLALB_zzzw 01000100 .. 0 ..... 0110 00 ..... ..... @rda_rn_rm
+SQDMLALT_zzzw 01000100 .. 0 ..... 0110 01 ..... ..... @rda_rn_rm
+SQDMLSLB_zzzw 01000100 .. 0 ..... 0110 10 ..... ..... @rda_rn_rm
+SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm
+
+## SVE2 saturating multiply-add interleaved long
+
+SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm
+SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm
+
+## SVE2 saturating multiply-add high
+
+SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm
+SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm
+
+## SVE2 integer multiply-add long
+
+SMLALB_zzzw 01000100 .. 0 ..... 010 000 ..... ..... @rda_rn_rm
+SMLALT_zzzw 01000100 .. 0 ..... 010 001 ..... ..... @rda_rn_rm
+UMLALB_zzzw 01000100 .. 0 ..... 010 010 ..... ..... @rda_rn_rm
+UMLALT_zzzw 01000100 .. 0 ..... 010 011 ..... ..... @rda_rn_rm
+SMLSLB_zzzw 01000100 .. 0 ..... 010 100 ..... ..... @rda_rn_rm
+SMLSLT_zzzw 01000100 .. 0 ..... 010 101 ..... ..... @rda_rn_rm
+UMLSLB_zzzw 01000100 .. 0 ..... 010 110 ..... ..... @rda_rn_rm
+UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm
+
+## SVE2 complex integer multiply-add
+
+CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx
+SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx
+
+## SVE mixed sign dot product
+
+USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm
+
+### SVE2 floating point matrix multiply accumulate
+BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
+FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
+FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
+
+### SVE2 Memory Gather Load Group
+
+# SVE2 64-bit gather non-temporal load (scalar plus 64-bit unscaled offsets)
+LDNT1_zprz 1100010 msz:2 00 rm:5 1 u:1 0 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2 esz=3 scale=0 ff=0
+
+# SVE2 32-bit gather non-temporal load (scalar plus 32-bit unscaled offsets)
+LDNT1_zprz 1000010 msz:2 00 rm:5 10 u:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=0 esz=2 scale=0 ff=0
+
+### SVE2 Memory Store Group
+
+# SVE2 64-bit scatter non-temporal store (vector plus scalar)
+STNT1_zprz 1110010 .. 00 ..... 001 ... ..... ..... \
+ @rprr_scatter_store xs=2 esz=3 scale=0
+
+# SVE2 32-bit scatter non-temporal store (vector plus scalar)
+STNT1_zprz 1110010 .. 10 ..... 001 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=2 scale=0
+
+### SVE2 Crypto Extensions
+
+# SVE2 crypto unary operations
+# AESMC and AESIMC
+AESMC 01000101 00 10000011100 decrypt:1 00000 rd:5
+
+# SVE2 crypto destructive binary operations
+AESE 01000101 00 10001 0 11100 0 ..... ..... @rdn_rm_e0
+AESD 01000101 00 10001 0 11100 1 ..... ..... @rdn_rm_e0
+SM4E 01000101 00 10001 1 11100 0 ..... ..... @rdn_rm_e0
+
+# SVE2 crypto constructive binary operations
+SM4EKEY 01000101 00 1 ..... 11110 0 ..... ..... @rd_rn_rm_e0
+RAX1 01000101 00 1 ..... 11110 1 ..... ..... @rd_rn_rm_e0
+
+### SVE2 floating-point convert precision odd elements
+FCVTXNT_ds 01100100 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVTX_ds 01100101 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVTNT_sh 01100100 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+BFCVTNT 01100100 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVTLT_hs 01100100 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+FCVTNT_ds 01100100 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
+
+### SVE2 floating-point convert to integer
+FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz
+
+### SVE2 floating-point multiply-add long (vectors)
+FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
+FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
+FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
+
+BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
+
+### SVE2 floating-point bfloat16 dot-product
+BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+
+### SVE2 floating-point multiply-add long (indexed)
+FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
+FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
+FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
+FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
+BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
+BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
+
+### SVE2 floating-point bfloat16 dot-product (indexed)
+BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2
+
+### SVE broadcast predicate element
+
+&psel esz pd pn pm rv imm
+%psel_rv 16:2 !function=plus_12
+%psel_imm_b 22:2 19:2
+%psel_imm_h 22:2 20:1
+%psel_imm_s 22:2
+%psel_imm_d 23:1
+@psel ........ .. . ... .. .. pn:4 . pm:4 . pd:4 \
+ &psel rv=%psel_rv
+
+PSEL 00100101 .. 1 ..1 .. 01 .... 0 .... 0 .... \
+ @psel esz=0 imm=%psel_imm_b
+PSEL 00100101 .. 1 .10 .. 01 .... 0 .... 0 .... \
+ @psel esz=1 imm=%psel_imm_h
+PSEL 00100101 .. 1 100 .. 01 .... 0 .... 0 .... \
+ @psel esz=2 imm=%psel_imm_s
+PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \
+ @psel esz=3 imm=%psel_imm_d
+
+### SVE clamp
+
+SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm
+UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm
diff --git a/target/arm/tcg/t16.decode b/target/arm/tcg/t16.decode
new file mode 100644
index 0000000..646c749
--- /dev/null
+++ b/target/arm/tcg/t16.decode
@@ -0,0 +1,281 @@
+# Thumb1 instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+&empty !extern
+&s_rrr_shi !extern s rd rn rm shim shty
+&s_rrr_shr !extern s rn rd rm rs shty
+&s_rri_rot !extern s rn rd imm rot
+&s_rrrr !extern s rd rn rm ra
+&rrr_rot !extern rd rn rm rot
+&rr !extern rd rm
+&ri !extern rd imm
+&r !extern rm
+&i !extern imm
+&ldst_rr !extern p w u rn rt rm shimm shtype
+&ldst_ri !extern p w u rn rt imm
+&ldst_block !extern rn i b u w list
+&setend !extern E
+&cps !extern mode imod M A I F
+&ci !extern cond imm
+
+# Set S if the instruction is outside of an IT block.
+%s !function=t16_setflags
+
+# Data-processing (two low registers)
+
+%reg_0 0:3
+
+@lll_noshr ...... .... rm:3 rd:3 \
+ &s_rrr_shi %s rn=%reg_0 shim=0 shty=0
+@xll_noshr ...... .... rm:3 rn:3 \
+ &s_rrr_shi s=1 rd=0 shim=0 shty=0
+@lxl_shr ...... .... rs:3 rd:3 \
+ &s_rrr_shr %s rm=%reg_0 rn=0
+
+AND_rrri 010000 0000 ... ... @lll_noshr
+EOR_rrri 010000 0001 ... ... @lll_noshr
+MOV_rxrr 010000 0010 ... ... @lxl_shr shty=0 # LSL
+MOV_rxrr 010000 0011 ... ... @lxl_shr shty=1 # LSR
+MOV_rxrr 010000 0100 ... ... @lxl_shr shty=2 # ASR
+ADC_rrri 010000 0101 ... ... @lll_noshr
+SBC_rrri 010000 0110 ... ... @lll_noshr
+MOV_rxrr 010000 0111 ... ... @lxl_shr shty=3 # ROR
+TST_xrri 010000 1000 ... ... @xll_noshr
+RSB_rri 010000 1001 rn:3 rd:3 &s_rri_rot %s imm=0 rot=0
+CMP_xrri 010000 1010 ... ... @xll_noshr
+CMN_xrri 010000 1011 ... ... @xll_noshr
+ORR_rrri 010000 1100 ... ... @lll_noshr
+MUL 010000 1101 rn:3 rd:3 &s_rrrr %s rm=%reg_0 ra=0
+BIC_rrri 010000 1110 ... ... @lll_noshr
+MVN_rxri 010000 1111 ... ... @lll_noshr
+
+# Load/store (register offset)
+
+@ldst_rr ....... rm:3 rn:3 rt:3 \
+ &ldst_rr p=1 w=0 u=1 shimm=0 shtype=0
+
+STR_rr 0101 000 ... ... ... @ldst_rr
+STRH_rr 0101 001 ... ... ... @ldst_rr
+STRB_rr 0101 010 ... ... ... @ldst_rr
+LDRSB_rr 0101 011 ... ... ... @ldst_rr
+LDR_rr 0101 100 ... ... ... @ldst_rr
+LDRH_rr 0101 101 ... ... ... @ldst_rr
+LDRB_rr 0101 110 ... ... ... @ldst_rr
+LDRSH_rr 0101 111 ... ... ... @ldst_rr
+
+# Load/store word/byte (immediate offset)
+
+%imm5_6x4 6:5 !function=times_4
+
+@ldst_ri_1 ..... imm:5 rn:3 rt:3 \
+ &ldst_ri p=1 w=0 u=1
+@ldst_ri_4 ..... ..... rn:3 rt:3 \
+ &ldst_ri p=1 w=0 u=1 imm=%imm5_6x4
+
+STR_ri 01100 ..... ... ... @ldst_ri_4
+LDR_ri 01101 ..... ... ... @ldst_ri_4
+STRB_ri 01110 ..... ... ... @ldst_ri_1
+LDRB_ri 01111 ..... ... ... @ldst_ri_1
+
+# Load/store halfword (immediate offset)
+
+%imm5_6x2 6:5 !function=times_2
+@ldst_ri_2 ..... ..... rn:3 rt:3 \
+ &ldst_ri p=1 w=0 u=1 imm=%imm5_6x2
+
+STRH_ri 10000 ..... ... ... @ldst_ri_2
+LDRH_ri 10001 ..... ... ... @ldst_ri_2
+
+# Load/store (SP-relative)
+
+%imm8_0x4 0:8 !function=times_4
+@ldst_spec_i ..... rt:3 ........ \
+ &ldst_ri p=1 w=0 u=1 imm=%imm8_0x4
+
+STR_ri 10010 ... ........ @ldst_spec_i rn=13
+LDR_ri 10011 ... ........ @ldst_spec_i rn=13
+
+# Load (PC-relative)
+
+LDR_ri 01001 ... ........ @ldst_spec_i rn=15
+
+# Add PC/SP (immediate)
+
+ADR 10100 rd:3 ........ imm=%imm8_0x4
+ADD_rri 10101 rd:3 ........ \
+ &s_rri_rot rn=13 s=0 rot=0 imm=%imm8_0x4 # SP
+
+# Load/store multiple
+
+@ldstm ..... rn:3 list:8 &ldst_block i=1 b=0 u=0 w=1
+
+STM 11000 ... ........ @ldstm
+LDM_t16 11001 ... ........ @ldstm
+
+# Shift (immediate)
+
+@shift_i ..... shim:5 rm:3 rd:3 &s_rrr_shi %s rn=%reg_0
+
+MOV_rxri 000 00 ..... ... ... @shift_i shty=0 # LSL
+MOV_rxri 000 01 ..... ... ... @shift_i shty=1 # LSR
+MOV_rxri 000 10 ..... ... ... @shift_i shty=2 # ASR
+
+# Add/subtract (three low registers)
+
+@addsub_3 ....... rm:3 rn:3 rd:3 \
+ &s_rrr_shi %s shim=0 shty=0
+
+ADD_rrri 0001100 ... ... ... @addsub_3
+SUB_rrri 0001101 ... ... ... @addsub_3
+
+# Add/subtract (two low registers and immediate)
+
+@addsub_2i ....... imm:3 rn:3 rd:3 \
+ &s_rri_rot %s rot=0
+
+ADD_rri 0001 110 ... ... ... @addsub_2i
+SUB_rri 0001 111 ... ... ... @addsub_2i
+
+# Add, subtract, compare, move (one low register and immediate)
+
+%reg_8 8:3
+@arith_1i ..... rd:3 imm:8 \
+ &s_rri_rot rot=0 rn=%reg_8
+
+MOV_rxi 00100 ... ........ @arith_1i %s
+CMP_xri 00101 ... ........ @arith_1i s=1
+ADD_rri 00110 ... ........ @arith_1i %s
+SUB_rri 00111 ... ........ @arith_1i %s
+
+# Add, compare, move (two high registers)
+
+%reg_0_7 7:1 0:3
+@addsub_2h .... .... . rm:4 ... \
+ &s_rrr_shi rd=%reg_0_7 rn=%reg_0_7 shim=0 shty=0
+
+ADD_rrri 0100 0100 . .... ... @addsub_2h s=0
+CMP_xrri 0100 0101 . .... ... @addsub_2h s=1
+MOV_rxri 0100 0110 . .... ... @addsub_2h s=0
+
+# Adjust SP (immediate)
+
+%imm7_0x4 0:7 !function=times_4
+@addsub_sp_i .... .... . ....... \
+ &s_rri_rot s=0 rd=13 rn=13 rot=0 imm=%imm7_0x4
+
+ADD_rri 1011 0000 0 ....... @addsub_sp_i
+SUB_rri 1011 0000 1 ....... @addsub_sp_i
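+# Worked example (illustrative, not part of the original file): an encoded
+# imm7 of 0b0000100 is scaled by times_4 to a byte offset of 16, i.e.
+# "ADD SP, SP, #16".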
+
+# Branch and exchange
+
+@branchr .... .... . rm:4 ... &r
+
+BX 0100 0111 0 .... 000 @branchr
+BLX_r 0100 0111 1 .... 000 @branchr
+BXNS 0100 0111 0 .... 100 @branchr
+BLXNS 0100 0111 1 .... 100 @branchr
+
+# Extend
+
+@extend .... .... .. rm:3 rd:3 &rrr_rot rn=15 rot=0
+
+SXTAH 1011 0010 00 ... ... @extend
+SXTAB 1011 0010 01 ... ... @extend
+UXTAH 1011 0010 10 ... ... @extend
+UXTAB 1011 0010 11 ... ... @extend
+
+# Change processor state
+
+%imod 4:1 !function=plus_2
+
+SETEND 1011 0110 010 1 E:1 000 &setend
+{
+ CPS 1011 0110 011 . 0 A:1 I:1 F:1 &cps mode=0 M=0 %imod
+ CPS_v7m 1011 0110 011 im:1 00 I:1 F:1
+}
+
+# Reverse bytes
+
+@rdm .... .... .. rm:3 rd:3 &rr
+
+REV 1011 1010 00 ... ... @rdm
+REV16 1011 1010 01 ... ... @rdm
+REVSH 1011 1010 11 ... ... @rdm
+
+# Hints
+
+{
+ {
+ YIELD 1011 1111 0001 0000
+ WFE 1011 1111 0010 0000
+ WFI 1011 1111 0011 0000
+
+ # TODO: Implement SEV, SEVL; may help SMP performance.
+ # SEV 1011 1111 0100 0000
+ # SEVL 1011 1111 0101 0000
+
+ # The canonical nop has the second nibble as 0000, but the whole of the
+ # rest of the space is a reserved hint, behaves as nop.
+ NOP 1011 1111 ---- 0000
+ }
+ IT 1011 1111 cond_mask:8
+}
+
+# Miscellaneous 16-bit instructions
+
+%imm6_9_3 9:1 3:5 !function=times_2
+
+HLT 1011 1010 10 imm:6 &i
+BKPT 1011 1110 imm:8 &i
+CBZ 1011 nz:1 0.1 ..... rn:3 imm=%imm6_9_3
+
+# Push and Pop
+
+%push_list 0:9 !function=t16_push_list
+%pop_list 0:9 !function=t16_pop_list
+
+STM 1011 010 ......... \
+ &ldst_block i=0 b=1 u=0 w=1 rn=13 list=%push_list
+LDM_t16 1011 110 ......... \
+ &ldst_block i=1 b=0 u=0 w=1 rn=13 list=%pop_list
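+# Illustrative note (not part of the original file): bits [7:0] of the
+# 9-bit field form the low-register bitmap, while bit 8 is remapped by
+# t16_push_list to LR for PUSH and by t16_pop_list to PC for POP.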
+
+# Conditional branches, Supervisor call
+
+%imm8_0x2 0:s8 !function=times_2
+
+{
+ UDF 1101 1110 ---- ----
+ SVC 1101 1111 imm:8 &i
+ B_cond_thumb 1101 cond:4 ........ &ci imm=%imm8_0x2
+}
+
+# Unconditional Branch
+
+%imm11_0x2 0:s11 !function=times_2
+
+B 11100 ........... &i imm=%imm11_0x2
+
+# thumb_insn_is_16bit() ensures we won't be decoding these as
+# T16 instructions for a Thumb2 CPU, so these patterns must be
+# a Thumb1 split BL/BLX.
+BLX_suffix 11101 imm:11 &i
+BL_BLX_prefix 11110 imm:s11 &i
+BL_suffix 11111 imm:11 &i
diff --git a/target/arm/tcg/t32.decode b/target/arm/tcg/t32.decode
new file mode 100644
index 0000000..f21ad01
--- /dev/null
+++ b/target/arm/tcg/t32.decode
@@ -0,0 +1,753 @@
+# Thumb2 instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+&empty !extern
+&s_rrr_shi !extern s rd rn rm shim shty
+&s_rrr_shr !extern s rn rd rm rs shty
+&s_rri_rot !extern s rn rd imm rot
+&s_rrrr !extern s rd rn rm ra
+&rrrr !extern rd rn rm ra
+&rrr_rot !extern rd rn rm rot
+&rrr !extern rd rn rm
+&rr !extern rd rm
+&ri !extern rd imm
+&r !extern rm
+&i !extern imm
+&msr_reg !extern rn r mask
+&mrs_reg !extern rd r
+&msr_bank !extern rn r sysm
+&mrs_bank !extern rd r sysm
+&ldst_rr !extern p w u rn rt rm shimm shtype
+&ldst_ri !extern p w u rn rt imm
+&ldst_block !extern rn i b u w list
+&strex !extern rn rd rt rt2 imm
+&ldrex !extern rn rt rt2 imm
+&bfx !extern rd rn lsb widthm1
+&bfi !extern rd rn lsb msb
+&sat !extern rd rn satimm imm sh
+&pkh !extern rd rn rm imm tb
+&cps !extern mode imod M A I F
+&mcr !extern cp opc1 crn crm opc2 rt
+&mcrr !extern cp opc1 crm rt rt2
+
+&mve_shl_ri rdalo rdahi shim
+&mve_shl_rr rdalo rdahi rm
+&mve_sh_ri rda shim
+&mve_sh_rr rda rm
+
+# rdahi: bits [3:1] from insn, bit 0 is 1
+# rdalo: bits [3:1] from insn, bit 0 is 0
+%rdahi_9 9:3 !function=times_2_plus_1
+%rdalo_17 17:3 !function=times_2
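+# Illustrative example (not part of the original file): if insn bits [11:9]
+# are 0b010, %rdahi_9 yields register 5 (2 * 2 + 1); if bits [19:17] are
+# 0b010, %rdalo_17 yields register 4 (2 * 2), giving the odd/even halves of
+# the 64-bit GPR pair used by the MVE long shifts.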
+
+# Data-processing (register)
+
+%imm5_12_6 12:3 6:2
+
+@s_rrr_shi ....... .... s:1 rn:4 .... rd:4 .. shty:2 rm:4 \
+ &s_rrr_shi shim=%imm5_12_6
+@s_rxr_shi ....... .... s:1 .... .... rd:4 .. shty:2 rm:4 \
+ &s_rrr_shi shim=%imm5_12_6 rn=0
+@S_xrr_shi ....... .... . rn:4 .... .... .. shty:2 rm:4 \
+ &s_rrr_shi shim=%imm5_12_6 s=1 rd=0
+
+@mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \
+ &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9
+@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \
+ &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9
+@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \
+ &mve_sh_ri shim=%imm5_12_6
+@mve_sh_rr ....... .... . rda:4 rm:4 .... .... .... &mve_sh_rr
+
+{
+ TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi
+ AND_rrri 1110101 0000 . .... 0 ... .... .... .... @s_rrr_shi
+}
+BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi
+{
+ # The v8.1M MVE shift insns overlap in encoding with MOVS/ORRS
+ # and are distinguished by having Rm==13 or 15. Those are UNPREDICTABLE
+ # cases for MOVS/ORRS. We decode the MVE cases first, ensuring that
+ # they explicitly call unallocated_encoding() for cases that must UNDEF
+ # (eg "using a new shift insn on a v8.1M CPU without MVE"), and letting
+ # the rest fall through (where ORR_rrri and MOV_rxri will end up
+ # handling them as r13 and r15 accesses with the same semantics as A32).
+ [
+ {
+ UQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 00 1111 @mve_sh_ri
+ LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri
+ UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri
+ }
+
+ {
+ URSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 01 1111 @mve_sh_ri
+ LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri
+ URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri
+ }
+
+ {
+ SRSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 10 1111 @mve_sh_ri
+ ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri
+ SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri
+ }
+
+ {
+ SQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 11 1111 @mve_sh_ri
+ SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
+ }
+
+ {
+ UQRSHL_rr 1110101 0010 1 .... .... 1111 0000 1101 @mve_sh_rr
+ LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr
+ UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 1 0000 1101 @mve_shl_rr
+ }
+
+ {
+ SQRSHR_rr 1110101 0010 1 .... .... 1111 0010 1101 @mve_sh_rr
+ ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr
+ SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr
+ }
+
+ UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr
+ SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr
+ ]
+
+ MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi
+ ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi
+
+ # v8.1M CSEL and friends
+ CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4
+}
+{
+ MVN_rxri 1110101 0011 . 1111 0 ... .... .... .... @s_rxr_shi
+ ORN_rrri 1110101 0011 . .... 0 ... .... .... .... @s_rrr_shi
+}
+{
+ TEQ_xrri 1110101 0100 1 .... 0 ... 1111 .... .... @S_xrr_shi
+ EOR_rrri 1110101 0100 . .... 0 ... .... .... .... @s_rrr_shi
+}
+PKH 1110101 0110 0 rn:4 0 ... rd:4 .. tb:1 0 rm:4 \
+ &pkh imm=%imm5_12_6
+{
+ CMN_xrri 1110101 1000 1 .... 0 ... 1111 .... .... @S_xrr_shi
+ ADD_rrri 1110101 1000 . .... 0 ... .... .... .... @s_rrr_shi
+}
+ADC_rrri 1110101 1010 . .... 0 ... .... .... .... @s_rrr_shi
+SBC_rrri 1110101 1011 . .... 0 ... .... .... .... @s_rrr_shi
+{
+ CMP_xrri 1110101 1101 1 .... 0 ... 1111 .... .... @S_xrr_shi
+ SUB_rrri 1110101 1101 . .... 0 ... .... .... .... @s_rrr_shi
+}
+RSB_rrri 1110101 1110 . .... 0 ... .... .... .... @s_rrr_shi
+
+# Data-processing (register-shifted register)
+
+MOV_rxrr 1111 1010 0 shty:2 s:1 rm:4 1111 rd:4 0000 rs:4 \
+ &s_rrr_shr rn=0
+
+# Data-processing (immediate)
+
+%t32extrot 26:1 12:3 0:8 !function=t32_expandimm_rot
+%t32extimm 26:1 12:3 0:8 !function=t32_expandimm_imm
+
+@s_rri_rot ....... .... s:1 rn:4 . ... rd:4 ........ \
+ &s_rri_rot imm=%t32extimm rot=%t32extrot
+@s_rxi_rot ....... .... s:1 .... . ... rd:4 ........ \
+ &s_rri_rot imm=%t32extimm rot=%t32extrot rn=0
+@S_xri_rot ....... .... . rn:4 . ... .... ........ \
+ &s_rri_rot imm=%t32extimm rot=%t32extrot s=1 rd=0
+
+{
+ TST_xri 1111 0.0 0000 1 .... 0 ... 1111 ........ @S_xri_rot
+ AND_rri 1111 0.0 0000 . .... 0 ... .... ........ @s_rri_rot
+}
+BIC_rri 1111 0.0 0001 . .... 0 ... .... ........ @s_rri_rot
+{
+ MOV_rxi 1111 0.0 0010 . 1111 0 ... .... ........ @s_rxi_rot
+ ORR_rri 1111 0.0 0010 . .... 0 ... .... ........ @s_rri_rot
+}
+{
+ MVN_rxi 1111 0.0 0011 . 1111 0 ... .... ........ @s_rxi_rot
+ ORN_rri 1111 0.0 0011 . .... 0 ... .... ........ @s_rri_rot
+}
+{
+ TEQ_xri 1111 0.0 0100 1 .... 0 ... 1111 ........ @S_xri_rot
+ EOR_rri 1111 0.0 0100 . .... 0 ... .... ........ @s_rri_rot
+}
+{
+ CMN_xri 1111 0.0 1000 1 .... 0 ... 1111 ........ @S_xri_rot
+ ADD_rri 1111 0.0 1000 . .... 0 ... .... ........ @s_rri_rot
+}
+ADC_rri 1111 0.0 1010 . .... 0 ... .... ........ @s_rri_rot
+SBC_rri 1111 0.0 1011 . .... 0 ... .... ........ @s_rri_rot
+{
+ CMP_xri 1111 0.0 1101 1 .... 0 ... 1111 ........ @S_xri_rot
+ SUB_rri 1111 0.0 1101 . .... 0 ... .... ........ @s_rri_rot
+}
+RSB_rri 1111 0.0 1110 . .... 0 ... .... ........ @s_rri_rot
+
+# Data processing (plain binary immediate)
+
+%imm12_26_12_0 26:1 12:3 0:8
+%neg12_26_12_0 26:1 12:3 0:8 !function=negate
+@s0_rri_12 .... ... .... . rn:4 . ... rd:4 ........ \
+ &s_rri_rot imm=%imm12_26_12_0 rot=0 s=0
+
+{
+ ADR 1111 0.1 0000 0 1111 0 ... rd:4 ........ \
+ &ri imm=%imm12_26_12_0
+ ADD_rri 1111 0.1 0000 0 .... 0 ... .... ........ @s0_rri_12
+}
+{
+ ADR 1111 0.1 0101 0 1111 0 ... rd:4 ........ \
+ &ri imm=%neg12_26_12_0
+ SUB_rri 1111 0.1 0101 0 .... 0 ... .... ........ @s0_rri_12
+}
+
+# Move Wide
+
+%imm16_26_16_12_0 16:4 26:1 12:3 0:8
+@mov16 .... .... .... .... .... rd:4 .... .... \
+ &ri imm=%imm16_26_16_12_0
+
+MOVW 1111 0.10 0100 .... 0 ... .... ........ @mov16
+MOVT 1111 0.10 1100 .... 0 ... .... ........ @mov16
+
+# Saturate, bitfield
+
+@sat .... .... .. sh:1 . rn:4 . ... rd:4 .. . satimm:5 \
+ &sat imm=%imm5_12_6
+@sat16 .... .... .. . . rn:4 . ... rd:4 .. . satimm:5 \
+ &sat sh=0 imm=0
+
+{
+ SSAT16 1111 0011 001 0 .... 0 000 .... 00 0 ..... @sat16
+ SSAT 1111 0011 00. 0 .... 0 ... .... .. 0 ..... @sat
+}
+{
+ USAT16 1111 0011 101 0 .... 0 000 .... 00 0 ..... @sat16
+ USAT 1111 0011 10. 0 .... 0 ... .... .. 0 ..... @sat
+}
+
+@bfx .... .... ... . rn:4 . ... rd:4 .. . widthm1:5 \
+ &bfx lsb=%imm5_12_6
+@bfi .... .... ... . rn:4 . ... rd:4 .. . msb:5 \
+ &bfi lsb=%imm5_12_6
+
+SBFX 1111 0011 010 0 .... 0 ... .... ..0..... @bfx
+UBFX 1111 0011 110 0 .... 0 ... .... ..0..... @bfx
+
+# bfc is bfi w/ rn=15
+BFCI 1111 0011 011 0 .... 0 ... .... ..0..... @bfi
+
+# Multiply and multiply accumulate
+
+@s0_rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &s_rrrr s=0
+@s0_rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &s_rrrr ra=0 s=0
+@rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &rrrr
+@rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &rrrr ra=0
+@rndm .... .... .... rn:4 .... rd:4 .... rm:4 &rrr
+@rdm .... .... .... .... .... rd:4 .... rm:4 &rr
+
+{
+ MUL 1111 1011 0000 .... 1111 .... 0000 .... @s0_rn0dm
+ MLA 1111 1011 0000 .... .... .... 0000 .... @s0_rnadm
+}
+MLS 1111 1011 0000 .... .... .... 0001 .... @rnadm
+SMULL 1111 1011 1000 .... .... .... 0000 .... @s0_rnadm
+UMULL 1111 1011 1010 .... .... .... 0000 .... @s0_rnadm
+SMLAL 1111 1011 1100 .... .... .... 0000 .... @s0_rnadm
+UMLAL 1111 1011 1110 .... .... .... 0000 .... @s0_rnadm
+UMAAL 1111 1011 1110 .... .... .... 0110 .... @rnadm
+{
+ SMULWB 1111 1011 0011 .... 1111 .... 0000 .... @rn0dm
+ SMLAWB 1111 1011 0011 .... .... .... 0000 .... @rnadm
+}
+{
+ SMULWT 1111 1011 0011 .... 1111 .... 0001 .... @rn0dm
+ SMLAWT 1111 1011 0011 .... .... .... 0001 .... @rnadm
+}
+{
+ SMULBB 1111 1011 0001 .... 1111 .... 0000 .... @rn0dm
+ SMLABB 1111 1011 0001 .... .... .... 0000 .... @rnadm
+}
+{
+ SMULBT 1111 1011 0001 .... 1111 .... 0001 .... @rn0dm
+ SMLABT 1111 1011 0001 .... .... .... 0001 .... @rnadm
+}
+{
+ SMULTB 1111 1011 0001 .... 1111 .... 0010 .... @rn0dm
+ SMLATB 1111 1011 0001 .... .... .... 0010 .... @rnadm
+}
+{
+ SMULTT 1111 1011 0001 .... 1111 .... 0011 .... @rn0dm
+ SMLATT 1111 1011 0001 .... .... .... 0011 .... @rnadm
+}
+SMLALBB 1111 1011 1100 .... .... .... 1000 .... @rnadm
+SMLALBT 1111 1011 1100 .... .... .... 1001 .... @rnadm
+SMLALTB 1111 1011 1100 .... .... .... 1010 .... @rnadm
+SMLALTT 1111 1011 1100 .... .... .... 1011 .... @rnadm
+
+# usad8 is usada8 w/ ra=15
+USADA8 1111 1011 0111 .... .... .... 0000 .... @rnadm
+
+SMLAD 1111 1011 0010 .... .... .... 0000 .... @rnadm
+SMLADX 1111 1011 0010 .... .... .... 0001 .... @rnadm
+SMLSD 1111 1011 0100 .... .... .... 0000 .... @rnadm
+SMLSDX 1111 1011 0100 .... .... .... 0001 .... @rnadm
+
+SMLALD 1111 1011 1100 .... .... .... 1100 .... @rnadm
+SMLALDX 1111 1011 1100 .... .... .... 1101 .... @rnadm
+SMLSLD 1111 1011 1101 .... .... .... 1100 .... @rnadm
+SMLSLDX 1111 1011 1101 .... .... .... 1101 .... @rnadm
+
+SMMLA 1111 1011 0101 .... .... .... 0000 .... @rnadm
+SMMLAR 1111 1011 0101 .... .... .... 0001 .... @rnadm
+SMMLS 1111 1011 0110 .... .... .... 0000 .... @rnadm
+SMMLSR 1111 1011 0110 .... .... .... 0001 .... @rnadm
+
+SDIV 1111 1011 1001 .... 1111 .... 1111 .... @rndm
+UDIV 1111 1011 1011 .... 1111 .... 1111 .... @rndm
+
+# Data-processing (two source registers)
+
+QADD 1111 1010 1000 .... 1111 .... 1000 .... @rndm
+QSUB 1111 1010 1000 .... 1111 .... 1010 .... @rndm
+QDADD 1111 1010 1000 .... 1111 .... 1001 .... @rndm
+QDSUB 1111 1010 1000 .... 1111 .... 1011 .... @rndm
+
+CRC32B 1111 1010 1100 .... 1111 .... 1000 .... @rndm
+CRC32H 1111 1010 1100 .... 1111 .... 1001 .... @rndm
+CRC32W 1111 1010 1100 .... 1111 .... 1010 .... @rndm
+CRC32CB 1111 1010 1101 .... 1111 .... 1000 .... @rndm
+CRC32CH 1111 1010 1101 .... 1111 .... 1001 .... @rndm
+CRC32CW 1111 1010 1101 .... 1111 .... 1010 .... @rndm
+
+SEL 1111 1010 1010 .... 1111 .... 1000 .... @rndm
+
+# Note rn != rm is CONSTRAINED UNPREDICTABLE; we choose to ignore rn.
+REV 1111 1010 1001 ---- 1111 .... 1000 .... @rdm
+REV16 1111 1010 1001 ---- 1111 .... 1001 .... @rdm
+RBIT 1111 1010 1001 ---- 1111 .... 1010 .... @rdm
+REVSH 1111 1010 1001 ---- 1111 .... 1011 .... @rdm
+CLZ 1111 1010 1011 ---- 1111 .... 1000 .... @rdm
+
+# Branches and miscellaneous control
+
+%msr_sysm 4:1 8:4
+%mrs_sysm 4:1 16:4
+%imm16_16_0 16:4 0:12
+%imm21 26:s1 11:1 13:1 16:6 0:11 !function=times_2
+&ci cond imm
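+# Illustrative note (not part of the original file): %imm21 reassembles the
+# scattered S (bit 26), J2 (bit 11), J1 (bit 13), imm6 and imm11 fields and
+# scales the result by 2, giving the signed halfword-aligned offset used by
+# B_cond_thumb below.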
+
+{
+ # Group insn[25:23] = 111, which is cond=111x for the branch below,
+ # or unconditional, which would be illegal for the branch.
+ [
+ # Hints, and CPS
+ {
+ [
+ YIELD 1111 0011 1010 1111 1000 0000 0000 0001
+ WFE 1111 0011 1010 1111 1000 0000 0000 0010
+ WFI 1111 0011 1010 1111 1000 0000 0000 0011
+
+ # TODO: Implement SEV, SEVL; may help SMP performance.
+ # SEV 1111 0011 1010 1111 1000 0000 0000 0100
+ # SEVL 1111 0011 1010 1111 1000 0000 0000 0101
+
+ ESB 1111 0011 1010 1111 1000 0000 0001 0000
+ ]
+
+ # The canonical nop ends in 0000 0000, but the whole rest
+ # of the space is "reserved hint, behaves as nop".
+ NOP 1111 0011 1010 1111 1000 0000 ---- ----
+
+ # If imod == '00' && M == '0' then SEE "Hint instructions", above.
+ CPS 1111 0011 1010 1111 1000 0 imod:2 M:1 A:1 I:1 F:1 mode:5 \
+ &cps
+ }
+
+ # Miscellaneous control
+ CLREX 1111 0011 1011 1111 1000 1111 0010 1111
+ DSB 1111 0011 1011 1111 1000 1111 0100 ----
+ DMB 1111 0011 1011 1111 1000 1111 0101 ----
+ ISB 1111 0011 1011 1111 1000 1111 0110 ----
+ SB 1111 0011 1011 1111 1000 1111 0111 0000
+
+ # Note that the v7m insn overlaps both the normal and banked insn.
+ {
+ MRS_bank 1111 0011 111 r:1 .... 1000 rd:4 001. 0000 \
+ &mrs_bank sysm=%mrs_sysm
+ MRS_reg 1111 0011 111 r:1 1111 1000 rd:4 0000 0000 &mrs_reg
+ MRS_v7m 1111 0011 111 0 1111 1000 rd:4 sysm:8
+ }
+ {
+ MSR_bank 1111 0011 100 r:1 rn:4 1000 .... 001. 0000 \
+ &msr_bank sysm=%msr_sysm
+ MSR_reg 1111 0011 100 r:1 rn:4 1000 mask:4 0000 0000 &msr_reg
+ MSR_v7m 1111 0011 100 0 rn:4 1000 mask:2 00 sysm:8
+ }
+ BXJ 1111 0011 1100 rm:4 1000 1111 0000 0000 &r
+ {
+ # At v6T2, this is the T5 encoding of SUBS PC, LR, #IMM, and works as for
+ # every other encoding of SUBS. With v7VE, IMM=0 is redefined as ERET.
+ # The distinction between the two only matters for Hyp mode.
+ ERET 1111 0011 1101 1110 1000 1111 0000 0000
+ SUB_rri 1111 0011 1101 1110 1000 1111 imm:8 \
+ &s_rri_rot rot=0 s=1 rd=15 rn=14
+ }
+ SMC 1111 0111 1111 imm:4 1000 0000 0000 0000 &i
+ HVC 1111 0111 1110 .... 1000 .... .... .... \
+ &i imm=%imm16_16_0
+ UDF 1111 0111 1111 ---- 1010 ---- ---- ----
+ ]
+ B_cond_thumb 1111 0. cond:4 ...... 10.0 ............ &ci imm=%imm21
+}
+
+# Load/store (register, immediate, literal)
+
+@ldst_rr .... .... .... rn:4 rt:4 ...... shimm:2 rm:4 \
+ &ldst_rr p=1 w=0 u=1 shtype=0
+@ldst_ri_idx .... .... .... rn:4 rt:4 . p:1 u:1 . imm:8 \
+ &ldst_ri w=1
+@ldst_ri_neg .... .... .... rn:4 rt:4 .... imm:8 \
+ &ldst_ri p=1 w=0 u=0
+@ldst_ri_unp .... .... .... rn:4 rt:4 .... imm:8 \
+ &ldst_ri p=1 w=0 u=1
+@ldst_ri_pos .... .... .... rn:4 rt:4 imm:12 \
+ &ldst_ri p=1 w=0 u=1
+@ldst_ri_lit .... .... u:1 ... .... rt:4 imm:12 \
+ &ldst_ri p=1 w=0 rn=15
+
+STRB_rr 1111 1000 0000 .... .... 000000 .. .... @ldst_rr
+STRB_ri 1111 1000 0000 .... .... 1..1 ........ @ldst_ri_idx
+STRB_ri 1111 1000 0000 .... .... 1100 ........ @ldst_ri_neg
+STRBT_ri 1111 1000 0000 .... .... 1110 ........ @ldst_ri_unp
+STRB_ri 1111 1000 1000 .... .... ............ @ldst_ri_pos
+
+STRH_rr 1111 1000 0010 .... .... 000000 .. .... @ldst_rr
+STRH_ri 1111 1000 0010 .... .... 1..1 ........ @ldst_ri_idx
+STRH_ri 1111 1000 0010 .... .... 1100 ........ @ldst_ri_neg
+STRHT_ri 1111 1000 0010 .... .... 1110 ........ @ldst_ri_unp
+STRH_ri 1111 1000 1010 .... .... ............ @ldst_ri_pos
+
+STR_rr 1111 1000 0100 .... .... 000000 .. .... @ldst_rr
+STR_ri 1111 1000 0100 .... .... 1..1 ........ @ldst_ri_idx
+STR_ri 1111 1000 0100 .... .... 1100 ........ @ldst_ri_neg
+STRT_ri 1111 1000 0100 .... .... 1110 ........ @ldst_ri_unp
+STR_ri 1111 1000 1100 .... .... ............ @ldst_ri_pos
+
+# Note that Load, unsigned (literal) overlaps all other load encodings.
+{
+ {
+ NOP 1111 1000 -001 1111 1111 ------------ # PLD
+ LDRB_ri 1111 1000 .001 1111 .... ............ @ldst_ri_lit
+ }
+ {
+ NOP 1111 1000 1001 ---- 1111 ------------ # PLD
+ LDRB_ri 1111 1000 1001 .... .... ............ @ldst_ri_pos
+ }
+ LDRB_ri 1111 1000 0001 .... .... 1..1 ........ @ldst_ri_idx
+ {
+ NOP 1111 1000 0001 ---- 1111 1100 -------- # PLD
+ LDRB_ri 1111 1000 0001 .... .... 1100 ........ @ldst_ri_neg
+ }
+ LDRBT_ri 1111 1000 0001 .... .... 1110 ........ @ldst_ri_unp
+ {
+ NOP 1111 1000 0001 ---- 1111 000000 -- ---- # PLD
+ LDRB_rr 1111 1000 0001 .... .... 000000 .. .... @ldst_rr
+ }
+}
+{
+ {
+ NOP 1111 1000 -011 1111 1111 ------------ # PLD
+ LDRH_ri 1111 1000 .011 1111 .... ............ @ldst_ri_lit
+ }
+ {
+ NOP 1111 1000 1011 ---- 1111 ------------ # PLDW
+ LDRH_ri 1111 1000 1011 .... .... ............ @ldst_ri_pos
+ }
+ LDRH_ri 1111 1000 0011 .... .... 1..1 ........ @ldst_ri_idx
+ {
+ NOP 1111 1000 0011 ---- 1111 1100 -------- # PLDW
+ LDRH_ri 1111 1000 0011 .... .... 1100 ........ @ldst_ri_neg
+ }
+ LDRHT_ri 1111 1000 0011 .... .... 1110 ........ @ldst_ri_unp
+ {
+ NOP 1111 1000 0011 ---- 1111 000000 -- ---- # PLDW
+ LDRH_rr 1111 1000 0011 .... .... 000000 .. .... @ldst_rr
+ }
+}
+{
+ LDR_ri 1111 1000 .101 1111 .... ............ @ldst_ri_lit
+ LDR_ri 1111 1000 1101 .... .... ............ @ldst_ri_pos
+ LDR_ri 1111 1000 0101 .... .... 1..1 ........ @ldst_ri_idx
+ LDR_ri 1111 1000 0101 .... .... 1100 ........ @ldst_ri_neg
+ LDRT_ri 1111 1000 0101 .... .... 1110 ........ @ldst_ri_unp
+ LDR_rr 1111 1000 0101 .... .... 000000 .. .... @ldst_rr
+}
+# NOPs here are PLI.
+{
+ {
+ NOP 1111 1001 -001 1111 1111 ------------
+ LDRSB_ri 1111 1001 .001 1111 .... ............ @ldst_ri_lit
+ }
+ {
+ NOP 1111 1001 1001 ---- 1111 ------------
+ LDRSB_ri 1111 1001 1001 .... .... ............ @ldst_ri_pos
+ }
+ LDRSB_ri 1111 1001 0001 .... .... 1..1 ........ @ldst_ri_idx
+ {
+ NOP 1111 1001 0001 ---- 1111 1100 --------
+ LDRSB_ri 1111 1001 0001 .... .... 1100 ........ @ldst_ri_neg
+ }
+ LDRSBT_ri 1111 1001 0001 .... .... 1110 ........ @ldst_ri_unp
+ {
+ NOP 1111 1001 0001 ---- 1111 000000 -- ----
+ LDRSB_rr 1111 1001 0001 .... .... 000000 .. .... @ldst_rr
+ }
+}
+# NOPs here are unallocated memory hints, treated as NOP.
+{
+ {
+ NOP 1111 1001 -011 1111 1111 ------------
+ LDRSH_ri 1111 1001 .011 1111 .... ............ @ldst_ri_lit
+ }
+ {
+ NOP 1111 1001 1011 ---- 1111 ------------
+ LDRSH_ri 1111 1001 1011 .... .... ............ @ldst_ri_pos
+ }
+ LDRSH_ri 1111 1001 0011 .... .... 1..1 ........ @ldst_ri_idx
+ {
+ NOP 1111 1001 0011 ---- 1111 1100 --------
+ LDRSH_ri 1111 1001 0011 .... .... 1100 ........ @ldst_ri_neg
+ }
+ LDRSHT_ri 1111 1001 0011 .... .... 1110 ........ @ldst_ri_unp
+ {
+ NOP 1111 1001 0011 ---- 1111 000000 -- ----
+ LDRSH_rr 1111 1001 0011 .... .... 000000 .. .... @ldst_rr
+ }
+}
+
+%imm8x4 0:8 !function=times_4
+&ldst_ri2 p w u rn rt rt2 imm
+@ldstd_ri8 .... .... u:1 ... rn:4 rt:4 rt2:4 ........ \
+ &ldst_ri2 imm=%imm8x4
+
+STRD_ri_t32 1110 1000 .110 .... .... .... ........ @ldstd_ri8 w=1 p=0
+LDRD_ri_t32 1110 1000 .111 .... .... .... ........ @ldstd_ri8 w=1 p=0
+
+STRD_ri_t32 1110 1001 .100 .... .... .... ........ @ldstd_ri8 w=0 p=1
+LDRD_ri_t32 1110 1001 .101 .... .... .... ........ @ldstd_ri8 w=0 p=1
+
+STRD_ri_t32 1110 1001 .110 .... .... .... ........ @ldstd_ri8 w=1 p=1
+{
+ SG 1110 1001 0111 1111 1110 1001 01111111
+ LDRD_ri_t32 1110 1001 .111 .... .... .... ........ @ldstd_ri8 w=1 p=1
+}
+
+# Load/Store Exclusive, Load-Acquire/Store-Release, and Table Branch
+
+@strex_i .... .... .... rn:4 rt:4 rd:4 .... .... \
+ &strex rt2=15 imm=%imm8x4
+@strex_0 .... .... .... rn:4 rt:4 .... .... rd:4 \
+ &strex rt2=15 imm=0
+@strex_d .... .... .... rn:4 rt:4 rt2:4 .... rd:4 \
+ &strex imm=0
+
+@ldrex_i .... .... .... rn:4 rt:4 .... .... .... \
+ &ldrex rt2=15 imm=%imm8x4
+@ldrex_0 .... .... .... rn:4 rt:4 .... .... .... \
+ &ldrex rt2=15 imm=0
+@ldrex_d .... .... .... rn:4 rt:4 rt2:4 .... .... \
+ &ldrex imm=0
+
+{
+ TT 1110 1000 0100 rn:4 1111 rd:4 A:1 T:1 000000
+ STREX 1110 1000 0100 .... .... .... .... .... @strex_i
+}
+STREXB 1110 1000 1100 .... .... 1111 0100 .... @strex_0
+STREXH 1110 1000 1100 .... .... 1111 0101 .... @strex_0
+STREXD_t32 1110 1000 1100 .... .... .... 0111 .... @strex_d
+
+STLEX 1110 1000 1100 .... .... 1111 1110 .... @strex_0
+STLEXB 1110 1000 1100 .... .... 1111 1100 .... @strex_0
+STLEXH 1110 1000 1100 .... .... 1111 1101 .... @strex_0
+STLEXD_t32 1110 1000 1100 .... .... .... 1111 .... @strex_d
+
+STL 1110 1000 1100 .... .... 1111 1010 1111 @ldrex_0
+STLB 1110 1000 1100 .... .... 1111 1000 1111 @ldrex_0
+STLH 1110 1000 1100 .... .... 1111 1001 1111 @ldrex_0
+
+LDREX 1110 1000 0101 .... .... 1111 .... .... @ldrex_i
+LDREXB 1110 1000 1101 .... .... 1111 0100 1111 @ldrex_0
+LDREXH 1110 1000 1101 .... .... 1111 0101 1111 @ldrex_0
+LDREXD_t32 1110 1000 1101 .... .... .... 0111 1111 @ldrex_d
+
+LDAEX 1110 1000 1101 .... .... 1111 1110 1111 @ldrex_0
+LDAEXB 1110 1000 1101 .... .... 1111 1100 1111 @ldrex_0
+LDAEXH 1110 1000 1101 .... .... 1111 1101 1111 @ldrex_0
+LDAEXD_t32 1110 1000 1101 .... .... .... 1111 1111 @ldrex_d
+
+LDA 1110 1000 1101 .... .... 1111 1010 1111 @ldrex_0
+LDAB 1110 1000 1101 .... .... 1111 1000 1111 @ldrex_0
+LDAH 1110 1000 1101 .... .... 1111 1001 1111 @ldrex_0
+
+&tbranch rn rm
+@tbranch .... .... .... rn:4 .... .... .... rm:4 &tbranch
+
+TBB 1110 1000 1101 .... 1111 0000 0000 .... @tbranch
+TBH 1110 1000 1101 .... 1111 0000 0001 .... @tbranch
+
+# Parallel addition and subtraction
+
+SADD8 1111 1010 1000 .... 1111 .... 0000 .... @rndm
+QADD8 1111 1010 1000 .... 1111 .... 0001 .... @rndm
+SHADD8 1111 1010 1000 .... 1111 .... 0010 .... @rndm
+UADD8 1111 1010 1000 .... 1111 .... 0100 .... @rndm
+UQADD8 1111 1010 1000 .... 1111 .... 0101 .... @rndm
+UHADD8 1111 1010 1000 .... 1111 .... 0110 .... @rndm
+
+SADD16 1111 1010 1001 .... 1111 .... 0000 .... @rndm
+QADD16 1111 1010 1001 .... 1111 .... 0001 .... @rndm
+SHADD16 1111 1010 1001 .... 1111 .... 0010 .... @rndm
+UADD16 1111 1010 1001 .... 1111 .... 0100 .... @rndm
+UQADD16 1111 1010 1001 .... 1111 .... 0101 .... @rndm
+UHADD16 1111 1010 1001 .... 1111 .... 0110 .... @rndm
+
+SASX 1111 1010 1010 .... 1111 .... 0000 .... @rndm
+QASX 1111 1010 1010 .... 1111 .... 0001 .... @rndm
+SHASX 1111 1010 1010 .... 1111 .... 0010 .... @rndm
+UASX 1111 1010 1010 .... 1111 .... 0100 .... @rndm
+UQASX 1111 1010 1010 .... 1111 .... 0101 .... @rndm
+UHASX 1111 1010 1010 .... 1111 .... 0110 .... @rndm
+
+SSUB8 1111 1010 1100 .... 1111 .... 0000 .... @rndm
+QSUB8 1111 1010 1100 .... 1111 .... 0001 .... @rndm
+SHSUB8 1111 1010 1100 .... 1111 .... 0010 .... @rndm
+USUB8 1111 1010 1100 .... 1111 .... 0100 .... @rndm
+UQSUB8 1111 1010 1100 .... 1111 .... 0101 .... @rndm
+UHSUB8 1111 1010 1100 .... 1111 .... 0110 .... @rndm
+
+SSUB16 1111 1010 1101 .... 1111 .... 0000 .... @rndm
+QSUB16 1111 1010 1101 .... 1111 .... 0001 .... @rndm
+SHSUB16 1111 1010 1101 .... 1111 .... 0010 .... @rndm
+USUB16 1111 1010 1101 .... 1111 .... 0100 .... @rndm
+UQSUB16 1111 1010 1101 .... 1111 .... 0101 .... @rndm
+UHSUB16 1111 1010 1101 .... 1111 .... 0110 .... @rndm
+
+SSAX 1111 1010 1110 .... 1111 .... 0000 .... @rndm
+QSAX 1111 1010 1110 .... 1111 .... 0001 .... @rndm
+SHSAX 1111 1010 1110 .... 1111 .... 0010 .... @rndm
+USAX 1111 1010 1110 .... 1111 .... 0100 .... @rndm
+UQSAX 1111 1010 1110 .... 1111 .... 0101 .... @rndm
+UHSAX 1111 1010 1110 .... 1111 .... 0110 .... @rndm
+
+# Register extends
+
+@rrr_rot .... .... .... rn:4 .... rd:4 .. rot:2 rm:4 &rrr_rot
+
+SXTAH 1111 1010 0000 .... 1111 .... 10.. .... @rrr_rot
+UXTAH 1111 1010 0001 .... 1111 .... 10.. .... @rrr_rot
+SXTAB16 1111 1010 0010 .... 1111 .... 10.. .... @rrr_rot
+UXTAB16 1111 1010 0011 .... 1111 .... 10.. .... @rrr_rot
+SXTAB 1111 1010 0100 .... 1111 .... 10.. .... @rrr_rot
+UXTAB 1111 1010 0101 .... 1111 .... 10.. .... @rrr_rot
+
+# Load/store multiple
+
+@ldstm .... .... .. w:1 . rn:4 list:16 &ldst_block u=0
+
+STM_t32 1110 1000 10.0 .... ................ @ldstm i=1 b=0
+STM_t32 1110 1001 00.0 .... ................ @ldstm i=0 b=1
+{
+ # Rn=15 UNDEFs for LDM; M-profile CLRM uses that encoding
+ CLRM 1110 1000 1001 1111 list:16
+ LDM_t32 1110 1000 10.1 .... ................ @ldstm i=1 b=0
+}
+LDM_t32 1110 1001 00.1 .... ................ @ldstm i=0 b=1
+
+&rfe !extern rn w pu
+@rfe .... .... .. w:1 . rn:4 ................ &rfe
+
+RFE 1110 1000 00.1 .... 1100000000000000 @rfe pu=2
+RFE 1110 1001 10.1 .... 1100000000000000 @rfe pu=1
+
+&srs !extern mode w pu
+@srs .... .... .. w:1 . .... ........... mode:5 &srs
+
+SRS 1110 1000 00.0 1101 1100 0000 000. .... @srs pu=2
+SRS 1110 1001 10.0 1101 1100 0000 000. .... @srs pu=1
+
+# Coprocessor instructions
+
+# We decode MCR, MRC, MRRC and MCRR only, because for QEMU the
+# other coprocessor instructions always UNDEF.
+# The trans_ functions for these will ignore cp values 8..13 for v7 or
+# earlier, and 0..13 for v8 and later, because those areas of the
+# encoding space may be used for other things, such as VFP or Neon.
+
+@mcr .... .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4
+@mcrr .... .... .... rt2:4 rt:4 cp:4 opc1:4 crm:4
+
+MCRR 1110 1100 0100 .... .... .... .... .... @mcrr
+MRRC 1110 1100 0101 .... .... .... .... .... @mcrr
+
+MCR 1110 1110 ... 0 .... .... .... ... 1 .... @mcr
+MRC 1110 1110 ... 1 .... .... .... ... 1 .... @mcr
+
+# Branches
+
+%imm24 26:s1 13:1 11:1 16:10 0:11 !function=t32_branch24
+@branch24 ................................ &i imm=%imm24
+
+B 1111 0. .......... 10.1 ............ @branch24
+BL 1111 0. .......... 11.1 ............ @branch24
+{
+ # BLX_i is non-M-profile only
+ BLX_i 1111 0. .......... 11.0 ............ @branch24
+ # M-profile only: loop and branch insns
+ [
+ # All these BF insns have boff != 0b0000; we NOP them all
+ BF 1111 0 boff:4 ------- 1100 - ---------- 1 # BFL
+ BF 1111 0 boff:4 0 ------ 1110 - ---------- 1 # BFCSEL
+ BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF
+ BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX
+ ]
+ [
+ # LE and WLS immediate
+ %lob_imm 1:10 11:1 !function=times_2
+
+ DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 size=4
+ WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm size=4
+ {
+ LE 1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm
+ # This is WLSTP
+ WLS 1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
+ }
+ {
+ LCTP 1111 0 0000 000 1111 1110 0000 0000 0001
+ # This is DLSTP
+ DLS 1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001
+ }
+ VCTP 1111 0 0000 0 size:2 rn:4 1110 1000 0000 0001
+ ]
+}
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
new file mode 100644
index 0000000..da9f877
--- /dev/null
+++ b/target/arm/tcg/translate-a64.c
@@ -0,0 +1,15054 @@
+/*
+ * AArch64 translation
+ *
+ * Copyright (c) 2013 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "qemu/log.h"
+#include "arm_ldst.h"
+#include "translate.h"
+#include "internals.h"
+#include "qemu/host-utils.h"
+#include "semihosting/semihost.h"
+#include "exec/gen-icount.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+#include "exec/log.h"
+#include "cpregs.h"
+#include "translate-a64.h"
+#include "qemu/atomic128.h"
+
+static TCGv_i64 cpu_X[32];
+static TCGv_i64 cpu_pc;
+
+/* Load/store exclusive handling */
+static TCGv_i64 cpu_exclusive_high;
+
+static const char *regnames[] = {
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+ "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+ "x24", "x25", "x26", "x27", "x28", "x29", "lr", "sp"
+};
+
+enum a64_shift_type {
+ A64_SHIFT_TYPE_LSL = 0,
+ A64_SHIFT_TYPE_LSR = 1,
+ A64_SHIFT_TYPE_ASR = 2,
+ A64_SHIFT_TYPE_ROR = 3
+};
+
+/* Table based decoder typedefs - used when the relevant bits for decode
+ * are too awkwardly scattered across the instruction (eg SIMD).
+ */
+typedef void AArch64DecodeFn(DisasContext *s, uint32_t insn);
+
+typedef struct AArch64DecodeTable {
+ uint32_t pattern;
+ uint32_t mask;
+ AArch64DecodeFn *disas_fn;
+} AArch64DecodeTable;
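+/*
+ * Illustrative sketch (not part of the original source): the table-driven
+ * decoder selects an entry when (insn & mask) == pattern, so a hypothetical
+ * entry { .pattern = P, .mask = M, .disas_fn = fn } dispatches to fn for
+ * any insn whose bits selected by M equal the corresponding bits of P.
+ */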
+
+/* initialize TCG globals. */
+void a64_translate_init(void)
+{
+ int i;
+
+ cpu_pc = tcg_global_mem_new_i64(cpu_env,
+ offsetof(CPUARMState, pc),
+ "pc");
+ for (i = 0; i < 32; i++) {
+ cpu_X[i] = tcg_global_mem_new_i64(cpu_env,
+ offsetof(CPUARMState, xregs[i]),
+ regnames[i]);
+ }
+
+ cpu_exclusive_high = tcg_global_mem_new_i64(cpu_env,
+ offsetof(CPUARMState, exclusive_high), "exclusive_high");
+}
+
+/*
+ * Return the core mmu_idx to use for A64 "unprivileged load/store" insns
+ */
+static int get_a64_user_mem_index(DisasContext *s)
+{
+ /*
+ * If AccType_UNPRIV is not used, the insn uses AccType_NORMAL,
+ * which is the usual mmu_idx for this cpu state.
+ */
+ ARMMMUIdx useridx = s->mmu_idx;
+
+ if (s->unpriv) {
+ /*
+ * We have pre-computed the condition for AccType_UNPRIV.
+ * Therefore we should never get here with a mmu_idx for
+ * which we do not know the corresponding user mmu_idx.
+ */
+ switch (useridx) {
+ case ARMMMUIdx_E10_1:
+ case ARMMMUIdx_E10_1_PAN:
+ useridx = ARMMMUIdx_E10_0;
+ break;
+ case ARMMMUIdx_E20_2:
+ case ARMMMUIdx_E20_2_PAN:
+ useridx = ARMMMUIdx_E20_0;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ return arm_to_core_mmu_idx(useridx);
+}
+
+static void set_btype_raw(int val)
+{
+ tcg_gen_st_i32(tcg_constant_i32(val), cpu_env,
+ offsetof(CPUARMState, btype));
+}
+
+static void set_btype(DisasContext *s, int val)
+{
+ /* BTYPE is a 2-bit field, and 0 should be done with reset_btype. */
+ tcg_debug_assert(val >= 1 && val <= 3);
+ set_btype_raw(val);
+ s->btype = -1;
+}
+
+static void reset_btype(DisasContext *s)
+{
+ if (s->btype != 0) {
+ set_btype_raw(0);
+ s->btype = 0;
+ }
+}
+
+static void gen_pc_plus_diff(DisasContext *s, TCGv_i64 dest, target_long diff)
+{
+ assert(s->pc_save != -1);
+ if (TARGET_TB_PCREL) {
+ tcg_gen_addi_i64(dest, cpu_pc, (s->pc_curr - s->pc_save) + diff);
+ } else {
+ tcg_gen_movi_i64(dest, s->pc_curr + diff);
+ }
+}
+
+void gen_a64_update_pc(DisasContext *s, target_long diff)
+{
+ gen_pc_plus_diff(s, cpu_pc, diff);
+ s->pc_save = s->pc_curr + diff;
+}
+
+/*
+ * Handle Top Byte Ignore (TBI) bits.
+ *
+ * If address tagging is enabled via the TCR TBI bits:
+ * + for EL2 and EL3 there is only one TBI bit, and if it is set
+ * then the address is zero-extended, clearing bits [63:56]
+ * + for EL0 and EL1, TBI0 controls addresses with bit 55 == 0
+ *    and TBI1 controls addresses with bit 55 == 1.
+ * If the appropriate TBI bit is set for the address then
+ * the address is sign-extended from bit 55 into bits [63:56]
+ *
+ * Here we have concatenated TBI{1,0} into tbi.
+ */
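+/*
+ * Worked example (illustrative, not from the original source): when the
+ * relevant TBI bit applies, a tagged address such as 0x5a80_1234_5678_9abc
+ * (tag byte 0x5a, bit 55 == 1) is sign-extended from bit 55 to give
+ * 0xff80_1234_5678_9abc, while an address with bit 55 == 0 gets its tag
+ * byte cleared to 0x00.
+ */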
+static void gen_top_byte_ignore(DisasContext *s, TCGv_i64 dst,
+ TCGv_i64 src, int tbi)
+{
+ if (tbi == 0) {
+ /* Load unmodified address */
+ tcg_gen_mov_i64(dst, src);
+ } else if (!regime_has_2_ranges(s->mmu_idx)) {
+ /* Force tag byte to all zero */
+ tcg_gen_extract_i64(dst, src, 0, 56);
+ } else {
+ /* Sign-extend from bit 55. */
+ tcg_gen_sextract_i64(dst, src, 0, 56);
+
+ switch (tbi) {
+ case 1:
+ /* tbi0 but !tbi1: only use the extension if positive */
+ tcg_gen_and_i64(dst, dst, src);
+ break;
+ case 2:
+ /* !tbi0 but tbi1: only use the extension if negative */
+ tcg_gen_or_i64(dst, dst, src);
+ break;
+ case 3:
+ /* tbi0 and tbi1: always use the extension */
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+}
+
+static void gen_a64_set_pc(DisasContext *s, TCGv_i64 src)
+{
+ /*
+ * If address tagging is enabled for instructions via the TCR TBI bits,
+ * then loading an address into the PC will clear out any tag.
+ */
+ gen_top_byte_ignore(s, cpu_pc, src, s->tbii);
+ s->pc_save = -1;
+}
+
+/*
+ * Handle MTE and/or TBI.
+ *
+ * For TBI, ideally, we would do nothing. Proper behaviour on fault is
+ * for the tag to be present in the FAR_ELx register. But for user-only
+ * mode we do not have a TLB with which to implement this, so we must
+ * remove the top byte now.
+ *
+ * Always return a fresh temporary that we can increment independently
+ * of the write-back address.
+ */
+
+TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr)
+{
+ TCGv_i64 clean = new_tmp_a64(s);
+#ifdef CONFIG_USER_ONLY
+ gen_top_byte_ignore(s, clean, addr, s->tbid);
+#else
+ tcg_gen_mov_i64(clean, addr);
+#endif
+ return clean;
+}
+
+/* Insert a zero tag into src, with the result at dst. */
+static void gen_address_with_allocation_tag0(TCGv_i64 dst, TCGv_i64 src)
+{
+ tcg_gen_andi_i64(dst, src, ~MAKE_64BIT_MASK(56, 4));
+}
+
+static void gen_probe_access(DisasContext *s, TCGv_i64 ptr,
+ MMUAccessType acc, int log2_size)
+{
+ gen_helper_probe_access(cpu_env, ptr,
+ tcg_constant_i32(acc),
+ tcg_constant_i32(get_mem_index(s)),
+ tcg_constant_i32(1 << log2_size));
+}
+
+/*
+ * For MTE, check a single logical or atomic access. This probes a single
+ * address, the exact one specified. The size and alignment of the access
+ * is not relevant to MTE, per se, but watchpoints do require the size,
+ * and we want to recognize those before making any other changes to state.
+ */
+static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr,
+ bool is_write, bool tag_checked,
+ int log2_size, bool is_unpriv,
+ int core_idx)
+{
+ if (tag_checked && s->mte_active[is_unpriv]) {
+ TCGv_i64 ret;
+ int desc = 0;
+
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, core_idx);
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+ desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write);
+ desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << log2_size) - 1);
+
+ ret = new_tmp_a64(s);
+ gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr);
+
+ return ret;
+ }
+ return clean_data_tbi(s, addr);
+}
+
+TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write,
+ bool tag_checked, int log2_size)
+{
+ return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size,
+ false, get_mem_index(s));
+}
+
+/*
+ * For MTE, check multiple logical sequential accesses.
+ */
+TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write,
+ bool tag_checked, int size)
+{
+ if (tag_checked && s->mte_active[0]) {
+ TCGv_i64 ret;
+ int desc = 0;
+
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+ desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write);
+ desc = FIELD_DP32(desc, MTEDESC, SIZEM1, size - 1);
+
+ ret = new_tmp_a64(s);
+ gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr);
+
+ return ret;
+ }
+ return clean_data_tbi(s, addr);
+}
+
+typedef struct DisasCompare64 {
+ TCGCond cond;
+ TCGv_i64 value;
+} DisasCompare64;
+
+static void a64_test_cc(DisasCompare64 *c64, int cc)
+{
+ DisasCompare c32;
+
+ arm_test_cc(&c32, cc);
+
+ /* Sign-extend the 32-bit value so that the GE/LT comparisons work
+ * properly. The NE/EQ comparisons are also fine with this choice. */
+ c64->cond = c32.cond;
+ c64->value = tcg_temp_new_i64();
+ tcg_gen_ext_i32_i64(c64->value, c32.value);
+
+ arm_free_cc(&c32);
+}
+
+static void a64_free_cc(DisasCompare64 *c64)
+{
+ tcg_temp_free_i64(c64->value);
+}
+
+static void gen_rebuild_hflags(DisasContext *s)
+{
+ gen_helper_rebuild_hflags_a64(cpu_env, tcg_constant_i32(s->current_el));
+}
+
+static void gen_exception_internal(int excp)
+{
+ assert(excp_is_internal(excp));
+ gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp));
+}
+
+static void gen_exception_internal_insn(DisasContext *s, int excp)
+{
+ gen_a64_update_pc(s, 0);
+ gen_exception_internal(excp);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syndrome)
+{
+ gen_a64_update_pc(s, 0);
+ gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syndrome));
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+static void gen_step_complete_exception(DisasContext *s)
+{
+    /* We just completed a step of an insn. Move from Active-not-pending
+ * to Active-pending, and then also take the swstep exception.
+ * This corresponds to making the (IMPDEF) choice to prioritize
+ * swstep exceptions over asynchronous exceptions taken to an exception
+ * level where debug is disabled. This choice has the advantage that
+ * we do not need to maintain internal state corresponding to the
+ * ISV/EX syndrome bits between completion of the step and generation
+ * of the exception, and our syndrome information is always correct.
+ */
+ gen_ss_advance(s);
+ gen_swstep_exception(s, 1, s->is_ldex);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+static inline bool use_goto_tb(DisasContext *s, uint64_t dest)
+{
+ if (s->ss_active) {
+ return false;
+ }
+ return translator_use_goto_tb(&s->base, dest);
+}
+
+static void gen_goto_tb(DisasContext *s, int n, int64_t diff)
+{
+ if (use_goto_tb(s, s->pc_curr + diff)) {
+ /*
+ * For pcrel, the pc must always be up-to-date on entry to
+ * the linked TB, so that it can use simple additions for all
+ * further adjustments. For !pcrel, the linked TB is compiled
+ * to know its full virtual address, so we can delay the
+ * update to pc to the unlinked path. A long chain of links
+ * can thus avoid many updates to the PC.
+ */
+ if (TARGET_TB_PCREL) {
+ gen_a64_update_pc(s, diff);
+ tcg_gen_goto_tb(n);
+ } else {
+ tcg_gen_goto_tb(n);
+ gen_a64_update_pc(s, diff);
+ }
+ tcg_gen_exit_tb(s->base.tb, n);
+ s->base.is_jmp = DISAS_NORETURN;
+ } else {
+ gen_a64_update_pc(s, diff);
+ if (s->ss_active) {
+ gen_step_complete_exception(s);
+ } else {
+ tcg_gen_lookup_and_goto_ptr();
+ s->base.is_jmp = DISAS_NORETURN;
+ }
+ }
+}
+
+static void init_tmp_a64_array(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+ memset(s->tmp_a64, 0, sizeof(s->tmp_a64));
+#endif
+ s->tmp_a64_count = 0;
+}
+
+static void free_tmp_a64(DisasContext *s)
+{
+ int i;
+ for (i = 0; i < s->tmp_a64_count; i++) {
+ tcg_temp_free_i64(s->tmp_a64[i]);
+ }
+ init_tmp_a64_array(s);
+}
+
+TCGv_i64 new_tmp_a64(DisasContext *s)
+{
+ assert(s->tmp_a64_count < TMP_A64_MAX);
+ return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64();
+}
+
+TCGv_i64 new_tmp_a64_local(DisasContext *s)
+{
+ assert(s->tmp_a64_count < TMP_A64_MAX);
+ return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_local_new_i64();
+}
+
+TCGv_i64 new_tmp_a64_zero(DisasContext *s)
+{
+ TCGv_i64 t = new_tmp_a64(s);
+ tcg_gen_movi_i64(t, 0);
+ return t;
+}
+
+/*
+ * Register access functions
+ *
+ * These functions are used for directly accessing a register where
+ * changes to the final register value are likely to be made. If you
+ * need to use a register for a temporary calculation (e.g. index type
+ * operations), use the read_* forms.
+ *
+ * B1.2.1 Register mappings
+ *
+ * In instruction register encoding 31 can refer to ZR (zero register) or
+ * the SP (stack pointer) depending on context. In QEMU's case we map SP
+ * to cpu_X[31] and ZR accesses to a temporary which can be discarded.
+ * This is the point of the _sp forms.
+ */
+TCGv_i64 cpu_reg(DisasContext *s, int reg)
+{
+ if (reg == 31) {
+ return new_tmp_a64_zero(s);
+ } else {
+ return cpu_X[reg];
+ }
+}
+
+/* register access for when 31 == SP */
+TCGv_i64 cpu_reg_sp(DisasContext *s, int reg)
+{
+ return cpu_X[reg];
+}
+
+/* read a cpu register in 32bit/64bit mode. Returns a TCGv_i64
+ * representing the register contents. This TCGv is an auto-freed
+ * temporary so it need not be explicitly freed, and may be modified.
+ */
+TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf)
+{
+ TCGv_i64 v = new_tmp_a64(s);
+ if (reg != 31) {
+ if (sf) {
+ tcg_gen_mov_i64(v, cpu_X[reg]);
+ } else {
+ tcg_gen_ext32u_i64(v, cpu_X[reg]);
+ }
+ } else {
+ tcg_gen_movi_i64(v, 0);
+ }
+ return v;
+}
+
+TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf)
+{
+ TCGv_i64 v = new_tmp_a64(s);
+ if (sf) {
+ tcg_gen_mov_i64(v, cpu_X[reg]);
+ } else {
+ tcg_gen_ext32u_i64(v, cpu_X[reg]);
+ }
+ return v;
+}
+
+/* Return the offset into CPUARMState of a slice (from
+ * the least significant end) of FP register Qn (ie
+ * Dn, Sn, Hn or Bn).
+ * (Note that this is not the same mapping as for A32; see cpu.h)
+ */
+static inline int fp_reg_offset(DisasContext *s, int regno, MemOp size)
+{
+ return vec_reg_offset(s, regno, 0, size);
+}
+
+/* Offset of the high half of the 128 bit vector Qn */
+static inline int fp_reg_hi_offset(DisasContext *s, int regno)
+{
+ return vec_reg_offset(s, regno, 1, MO_64);
+}
+
+/* Convenience accessors for reading and writing single and double
+ * FP registers. Writing clears the upper parts of the associated
+ * 128 bit vector register, as required by the architecture.
+ * Note that unlike the GP register accessors, the values returned
+ * by the read functions must be manually freed.
+ */
+static TCGv_i64 read_fp_dreg(DisasContext *s, int reg)
+{
+ TCGv_i64 v = tcg_temp_new_i64();
+
+ tcg_gen_ld_i64(v, cpu_env, fp_reg_offset(s, reg, MO_64));
+ return v;
+}
+
+static TCGv_i32 read_fp_sreg(DisasContext *s, int reg)
+{
+ TCGv_i32 v = tcg_temp_new_i32();
+
+ tcg_gen_ld_i32(v, cpu_env, fp_reg_offset(s, reg, MO_32));
+ return v;
+}
+
+static TCGv_i32 read_fp_hreg(DisasContext *s, int reg)
+{
+ TCGv_i32 v = tcg_temp_new_i32();
+
+ tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16));
+ return v;
+}
+
+/* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64).
+ * If SVE is not enabled, then there are only 128 bits in the vector.
+ */
+static void clear_vec_high(DisasContext *s, bool is_q, int rd)
+{
+ unsigned ofs = fp_reg_offset(s, rd, MO_64);
+ unsigned vsz = vec_full_reg_size(s);
+
+ /* Nop move, with side effect of clearing the tail. */
+ tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz);
+}
+
+void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v)
+{
+ unsigned ofs = fp_reg_offset(s, reg, MO_64);
+
+ tcg_gen_st_i64(v, cpu_env, ofs);
+ clear_vec_high(s, false, reg);
+}
+
+static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v)
+{
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ tcg_gen_extu_i32_i64(tmp, v);
+ write_fp_dreg(s, reg, tmp);
+ tcg_temp_free_i64(tmp);
+}
+
+/* Expand a 2-operand AdvSIMD vector operation using an expander function. */
+static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn,
+ GVecGen2Fn *gvec_fn, int vece)
+{
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 2-operand + immediate AdvSIMD vector operation using
+ * an expander function.
+ */
+static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn,
+ int64_t imm, GVecGen2iFn *gvec_fn, int vece)
+{
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ imm, is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 3-operand AdvSIMD vector operation using an expander function. */
+static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm,
+ GVecGen3Fn *gvec_fn, int vece)
+{
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 4-operand AdvSIMD vector operation using an expander function. */
+static void gen_gvec_fn4(DisasContext *s, bool is_q, int rd, int rn, int rm,
+ int rx, GVecGen4Fn *gvec_fn, int vece)
+{
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), vec_full_reg_offset(s, rx),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 2-operand operation using an out-of-line helper. */
+static void gen_gvec_op2_ool(DisasContext *s, bool is_q, int rd,
+ int rn, int data, gen_helper_gvec_2 *fn)
+{
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+}
+
+/* Expand a 3-operand operation using an out-of-line helper. */
+static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd,
+ int rn, int rm, int data, gen_helper_gvec_3 *fn)
+{
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+}
+
+/* Expand a 3-operand + fpstatus pointer + simd data value operation using
+ * an out-of-line helper.
+ */
+static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn,
+ int rm, bool is_fp16, int data,
+ gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+ tcg_temp_free_ptr(fpst);
+}
+
+/* Expand a 3-operand + qc + operation using an out-of-line helper. */
+static void gen_gvec_op3_qc(DisasContext *s, bool is_q, int rd, int rn,
+ int rm, gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr qc_ptr = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc));
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), qc_ptr,
+ is_q ? 16 : 8, vec_full_reg_size(s), 0, fn);
+ tcg_temp_free_ptr(qc_ptr);
+}
+
+/* Expand a 4-operand operation using an out-of-line helper. */
+static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn,
+ int rm, int ra, int data, gen_helper_gvec_4 *fn)
+{
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, ra),
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+}
+
+/*
+ * Expand a 4-operand + fpstatus pointer + simd data value operation using
+ * an out-of-line helper.
+ */
+static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn,
+ int rm, int ra, bool is_fp16, int data,
+ gen_helper_gvec_4_ptr *fn)
+{
+ TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR);
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, ra), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+ tcg_temp_free_ptr(fpst);
+}
+
+/* Set ZF and NF based on a 64 bit result. This is alas fiddlier
+ * than the 32 bit equivalent.
+ */
+static inline void gen_set_NZ64(TCGv_i64 result)
+{
+ tcg_gen_extr_i64_i32(cpu_ZF, cpu_NF, result);
+ tcg_gen_or_i32(cpu_ZF, cpu_ZF, cpu_NF);
+}
+
+/* Set NZCV as for a logical operation: NZ as per result, CV cleared. */
+static inline void gen_logic_CC(int sf, TCGv_i64 result)
+{
+ if (sf) {
+ gen_set_NZ64(result);
+ } else {
+ tcg_gen_extrl_i64_i32(cpu_ZF, result);
+ tcg_gen_mov_i32(cpu_NF, cpu_ZF);
+ }
+ tcg_gen_movi_i32(cpu_CF, 0);
+ tcg_gen_movi_i32(cpu_VF, 0);
+}
+
+/* dest = T0 + T1; compute C, N, V and Z flags */
+static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
+{
+ if (sf) {
+ TCGv_i64 result, flag, tmp;
+ result = tcg_temp_new_i64();
+ flag = tcg_temp_new_i64();
+ tmp = tcg_temp_new_i64();
+
+ tcg_gen_movi_i64(tmp, 0);
+ tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp);
+
+ tcg_gen_extrl_i64_i32(cpu_CF, flag);
+
+ gen_set_NZ64(result);
+
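+        /* V for addition: the sign bit of (result ^ t0) & ~(t0 ^ t1),
+         * i.e. the operands had the same sign but the result differs.
+         */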
+ tcg_gen_xor_i64(flag, result, t0);
+ tcg_gen_xor_i64(tmp, t0, t1);
+ tcg_gen_andc_i64(flag, flag, tmp);
+ tcg_temp_free_i64(tmp);
+ tcg_gen_extrh_i64_i32(cpu_VF, flag);
+
+ tcg_gen_mov_i64(dest, result);
+ tcg_temp_free_i64(result);
+ tcg_temp_free_i64(flag);
+ } else {
+ /* 32 bit arithmetic */
+ TCGv_i32 t0_32 = tcg_temp_new_i32();
+ TCGv_i32 t1_32 = tcg_temp_new_i32();
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ tcg_gen_movi_i32(tmp, 0);
+ tcg_gen_extrl_i64_i32(t0_32, t0);
+ tcg_gen_extrl_i64_i32(t1_32, t1);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, t1_32, tmp);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32);
+ tcg_gen_xor_i32(tmp, t0_32, t1_32);
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+ tcg_gen_extu_i32_i64(dest, cpu_NF);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(t0_32);
+ tcg_temp_free_i32(t1_32);
+ }
+}
+
+/* dest = T0 - T1; compute C, N, V and Z flags */
+static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
+{
+ if (sf) {
+ /* 64 bit arithmetic */
+ TCGv_i64 result, flag, tmp;
+
+ result = tcg_temp_new_i64();
+ flag = tcg_temp_new_i64();
+ tcg_gen_sub_i64(result, t0, t1);
+
+ gen_set_NZ64(result);
+
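+        /* C is set when no borrow occurs, i.e. t0 >= t1 unsigned. */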
+ tcg_gen_setcond_i64(TCG_COND_GEU, flag, t0, t1);
+ tcg_gen_extrl_i64_i32(cpu_CF, flag);
+
+ tcg_gen_xor_i64(flag, result, t0);
+ tmp = tcg_temp_new_i64();
+ tcg_gen_xor_i64(tmp, t0, t1);
+ tcg_gen_and_i64(flag, flag, tmp);
+ tcg_temp_free_i64(tmp);
+ tcg_gen_extrh_i64_i32(cpu_VF, flag);
+ tcg_gen_mov_i64(dest, result);
+ tcg_temp_free_i64(flag);
+ tcg_temp_free_i64(result);
+ } else {
+ /* 32 bit arithmetic */
+ TCGv_i32 t0_32 = tcg_temp_new_i32();
+ TCGv_i32 t1_32 = tcg_temp_new_i32();
+ TCGv_i32 tmp;
+
+ tcg_gen_extrl_i64_i32(t0_32, t0);
+ tcg_gen_extrl_i64_i32(t1_32, t1);
+ tcg_gen_sub_i32(cpu_NF, t0_32, t1_32);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0_32, t1_32);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32);
+ tmp = tcg_temp_new_i32();
+ tcg_gen_xor_i32(tmp, t0_32, t1_32);
+ tcg_temp_free_i32(t0_32);
+ tcg_temp_free_i32(t1_32);
+ tcg_gen_and_i32(cpu_VF, cpu_VF, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_extu_i32_i64(dest, cpu_NF);
+ }
+}
+
+/* dest = T0 + T1 + CF; do not compute flags. */
+static void gen_adc(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
+{
+ TCGv_i64 flag = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(flag, cpu_CF);
+ tcg_gen_add_i64(dest, t0, t1);
+ tcg_gen_add_i64(dest, dest, flag);
+ tcg_temp_free_i64(flag);
+
+ if (!sf) {
+ tcg_gen_ext32u_i64(dest, dest);
+ }
+}
+
+/* dest = T0 + T1 + CF; compute C, N, V and Z flags. */
+static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
+{
+ if (sf) {
+ TCGv_i64 result = tcg_temp_new_i64();
+ TCGv_i64 cf_64 = tcg_temp_new_i64();
+ TCGv_i64 vf_64 = tcg_temp_new_i64();
+ TCGv_i64 tmp = tcg_temp_new_i64();
+ TCGv_i64 zero = tcg_constant_i64(0);
+
+ tcg_gen_extu_i32_i64(cf_64, cpu_CF);
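+        /* result = t0 + CF, then result += t1, accumulating the carry
+         * out of each step in cf_64.
+         */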
+ tcg_gen_add2_i64(result, cf_64, t0, zero, cf_64, zero);
+ tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, zero);
+ tcg_gen_extrl_i64_i32(cpu_CF, cf_64);
+ gen_set_NZ64(result);
+
+ tcg_gen_xor_i64(vf_64, result, t0);
+ tcg_gen_xor_i64(tmp, t0, t1);
+ tcg_gen_andc_i64(vf_64, vf_64, tmp);
+ tcg_gen_extrh_i64_i32(cpu_VF, vf_64);
+
+ tcg_gen_mov_i64(dest, result);
+
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i64(vf_64);
+ tcg_temp_free_i64(cf_64);
+ tcg_temp_free_i64(result);
+ } else {
+ TCGv_i32 t0_32 = tcg_temp_new_i32();
+ TCGv_i32 t1_32 = tcg_temp_new_i32();
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ TCGv_i32 zero = tcg_constant_i32(0);
+
+ tcg_gen_extrl_i64_i32(t0_32, t0);
+ tcg_gen_extrl_i64_i32(t1_32, t1);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, zero, cpu_CF, zero);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, zero);
+
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32);
+ tcg_gen_xor_i32(tmp, t0_32, t1_32);
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+ tcg_gen_extu_i32_i64(dest, cpu_NF);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(t1_32);
+ tcg_temp_free_i32(t0_32);
+ }
+}
+
+/*
+ * Load/Store generators
+ */
+
+/*
+ * Store from GPR register to memory.
+ */
+static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source,
+ TCGv_i64 tcg_addr, MemOp memop, int memidx,
+ bool iss_valid,
+ unsigned int iss_srt,
+ bool iss_sf, bool iss_ar)
+{
+ memop = finalize_memop(s, memop);
+ tcg_gen_qemu_st_i64(source, tcg_addr, memidx, memop);
+
+ if (iss_valid) {
+ uint32_t syn;
+
+ syn = syn_data_abort_with_iss(0,
+ (memop & MO_SIZE),
+ false,
+ iss_srt,
+ iss_sf,
+ iss_ar,
+ 0, 0, 0, 0, 0, false);
+ disas_set_insn_syndrome(s, syn);
+ }
+}
+
+static void do_gpr_st(DisasContext *s, TCGv_i64 source,
+ TCGv_i64 tcg_addr, MemOp memop,
+ bool iss_valid,
+ unsigned int iss_srt,
+ bool iss_sf, bool iss_ar)
+{
+ do_gpr_st_memidx(s, source, tcg_addr, memop, get_mem_index(s),
+ iss_valid, iss_srt, iss_sf, iss_ar);
+}
+
+/*
+ * Load from memory to GPR register
+ */
+static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
+ MemOp memop, bool extend, int memidx,
+ bool iss_valid, unsigned int iss_srt,
+ bool iss_sf, bool iss_ar)
+{
+ memop = finalize_memop(s, memop);
+ tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop);
+
+ if (extend && (memop & MO_SIGN)) {
+ g_assert((memop & MO_SIZE) <= MO_32);
+ tcg_gen_ext32u_i64(dest, dest);
+ }
+
+ if (iss_valid) {
+ uint32_t syn;
+
+ syn = syn_data_abort_with_iss(0,
+ (memop & MO_SIZE),
+ (memop & MO_SIGN) != 0,
+ iss_srt,
+ iss_sf,
+ iss_ar,
+ 0, 0, 0, 0, 0, false);
+ disas_set_insn_syndrome(s, syn);
+ }
+}
+
+static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
+ MemOp memop, bool extend,
+ bool iss_valid, unsigned int iss_srt,
+ bool iss_sf, bool iss_ar)
+{
+ do_gpr_ld_memidx(s, dest, tcg_addr, memop, extend, get_mem_index(s),
+ iss_valid, iss_srt, iss_sf, iss_ar);
+}
+
+/*
+ * Store from FP register to memory
+ */
+static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size)
+{
+ /* This writes the bottom N bits of a 128 bit wide vector to memory */
+ TCGv_i64 tmplo = tcg_temp_new_i64();
+ MemOp mop;
+
+ tcg_gen_ld_i64(tmplo, cpu_env, fp_reg_offset(s, srcidx, MO_64));
+
+ if (size < 4) {
+ mop = finalize_memop(s, size);
+ tcg_gen_qemu_st_i64(tmplo, tcg_addr, get_mem_index(s), mop);
+ } else {
+ bool be = s->be_data == MO_BE;
+ TCGv_i64 tcg_hiaddr = tcg_temp_new_i64();
+ TCGv_i64 tmphi = tcg_temp_new_i64();
+
+ tcg_gen_ld_i64(tmphi, cpu_env, fp_reg_hi_offset(s, srcidx));
+
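+        /* A 128-bit store is emitted as two 8-byte accesses; any
+         * required 16-byte alignment check applies to the first
+         * access only.
+         */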
+ mop = s->be_data | MO_UQ;
+ tcg_gen_qemu_st_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s),
+ mop | (s->align_mem ? MO_ALIGN_16 : 0));
+ tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8);
+ tcg_gen_qemu_st_i64(be ? tmplo : tmphi, tcg_hiaddr,
+ get_mem_index(s), mop);
+
+ tcg_temp_free_i64(tcg_hiaddr);
+ tcg_temp_free_i64(tmphi);
+ }
+
+ tcg_temp_free_i64(tmplo);
+}
+
+/*
+ * Load from memory to FP register
+ */
+static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
+{
+ /* This always zero-extends and writes to a full 128 bit wide vector */
+ TCGv_i64 tmplo = tcg_temp_new_i64();
+ TCGv_i64 tmphi = NULL;
+ MemOp mop;
+
+ if (size < 4) {
+ mop = finalize_memop(s, size);
+ tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), mop);
+ } else {
+ bool be = s->be_data == MO_BE;
+ TCGv_i64 tcg_hiaddr;
+
+ tmphi = tcg_temp_new_i64();
+ tcg_hiaddr = tcg_temp_new_i64();
+
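+        /* As for stores, a 128-bit load is two 8-byte accesses, with
+         * the alignment check (if enabled) on the first.
+         */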
+ mop = s->be_data | MO_UQ;
+ tcg_gen_qemu_ld_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s),
+ mop | (s->align_mem ? MO_ALIGN_16 : 0));
+ tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8);
+ tcg_gen_qemu_ld_i64(be ? tmplo : tmphi, tcg_hiaddr,
+ get_mem_index(s), mop);
+ tcg_temp_free_i64(tcg_hiaddr);
+ }
+
+ tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64));
+ tcg_temp_free_i64(tmplo);
+
+ if (tmphi) {
+ tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
+ tcg_temp_free_i64(tmphi);
+ }
+ clear_vec_high(s, tmphi != NULL, destidx);
+}
+
+/*
+ * Vector load/store helpers.
+ *
+ * The principal difference between this and a FP load is that we don't
+ * zero extend as we are filling a partial chunk of the vector register.
+ * These functions don't support 128 bit loads/stores, which would be
+ * normal load/store operations.
+ *
+ * The _i32 versions are useful when operating on 32 bit quantities
+ * (eg for floating point single or using Neon helper functions).
+ */
+
+/* Get value of an element within a vector register */
+static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx,
+ int element, MemOp memop)
+{
+ int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE);
+ switch ((unsigned)memop) {
+ case MO_8:
+ tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_32:
+ tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_8|MO_SIGN:
+ tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16|MO_SIGN:
+ tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_32|MO_SIGN:
+ tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_64:
+ case MO_64|MO_SIGN:
+ tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx,
+ int element, MemOp memop)
+{
+ int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE);
+ switch (memop) {
+ case MO_8:
+ tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_8|MO_SIGN:
+ tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16|MO_SIGN:
+ tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_32:
+ case MO_32|MO_SIGN:
+ tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Set value of an element within a vector register */
+static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx,
+ int element, MemOp memop)
+{
+ int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE);
+ switch (memop) {
+ case MO_8:
+ tcg_gen_st8_i64(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_st16_i64(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_32:
+ tcg_gen_st32_i64(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_64:
+ tcg_gen_st_i64(tcg_src, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
+ int destidx, int element, MemOp memop)
+{
+ int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE);
+ switch (memop) {
+ case MO_8:
+ tcg_gen_st8_i32(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_st16_i32(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_32:
+ tcg_gen_st_i32(tcg_src, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Store from vector register to memory */
+static void do_vec_st(DisasContext *s, int srcidx, int element,
+ TCGv_i64 tcg_addr, MemOp mop)
+{
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_tmp, srcidx, element, mop & MO_SIZE);
+ tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop);
+
+ tcg_temp_free_i64(tcg_tmp);
+}
+
+/* Load from memory to vector register */
+static void do_vec_ld(DisasContext *s, int destidx, int element,
+ TCGv_i64 tcg_addr, MemOp mop)
+{
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop);
+ write_vec_element(s, tcg_tmp, destidx, element, mop & MO_SIZE);
+
+ tcg_temp_free_i64(tcg_tmp);
+}
+
+/* Check that FP/Neon access is enabled. If it is, return
+ * true. If not, emit code to generate an appropriate exception,
+ * and return false; the caller should not emit any code for
+ * the instruction. Note that this check must happen after all
+ * unallocated-encoding checks (otherwise the syndrome information
+ * for the resulting exception will be incorrect).
+ */
+static bool fp_access_check_only(DisasContext *s)
+{
+ if (s->fp_excp_el) {
+ assert(!s->fp_access_checked);
+ s->fp_access_checked = true;
+
+ gen_exception_insn_el(s, 0, EXCP_UDEF,
+ syn_fp_access_trap(1, 0xe, false, 0),
+ s->fp_excp_el);
+ return false;
+ }
+ s->fp_access_checked = true;
+ return true;
+}
+
+static bool fp_access_check(DisasContext *s)
+{
+ if (!fp_access_check_only(s)) {
+ return false;
+ }
+ if (s->sme_trap_nonstreaming && s->is_nonstreaming) {
+ gen_exception_insn(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_Streaming, false));
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Check that SVE access is enabled. If it is, return true.
+ * If not, emit code to generate an appropriate exception and return false.
+ * This function corresponds to CheckSVEEnabled().
+ */
+bool sve_access_check(DisasContext *s)
+{
+ if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) {
+ assert(dc_isar_feature(aa64_sme, s));
+ if (!sme_sm_enabled_check(s)) {
+ goto fail_exit;
+ }
+ } else if (s->sve_excp_el) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF,
+ syn_sve_access_trap(), s->sve_excp_el);
+ goto fail_exit;
+ }
+ s->sve_access_checked = true;
+ return fp_access_check(s);
+
+ fail_exit:
+ /* Assert that we only raise one exception per instruction. */
+ assert(!s->sve_access_checked);
+ s->sve_access_checked = true;
+ return false;
+}
+
+/*
+ * Check that SME access is enabled, raise an exception if not.
+ * Note that this function corresponds to CheckSMEAccess and is
+ * only used directly for cpregs.
+ */
+static bool sme_access_check(DisasContext *s)
+{
+ if (s->sme_excp_el) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_AccessTrap, false),
+ s->sme_excp_el);
+ return false;
+ }
+ return true;
+}
+
+/* This function corresponds to CheckSMEEnabled. */
+bool sme_enabled_check(DisasContext *s)
+{
+ /*
+ * Note that unlike sve_excp_el, we have not constrained sme_excp_el
+ * to be zero when fp_excp_el has priority. This is because we need
+ * sme_excp_el by itself for cpregs access checks.
+ */
+ if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) {
+ s->fp_access_checked = true;
+ return sme_access_check(s);
+ }
+ return fp_access_check_only(s);
+}
+
+/* Common subroutine for CheckSMEAnd*Enabled. */
+bool sme_enabled_check_with_svcr(DisasContext *s, unsigned req)
+{
+ if (!sme_enabled_check(s)) {
+ return false;
+ }
+ if (FIELD_EX64(req, SVCR, SM) && !s->pstate_sm) {
+ gen_exception_insn(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_NotStreaming, false));
+ return false;
+ }
+ if (FIELD_EX64(req, SVCR, ZA) && !s->pstate_za) {
+ gen_exception_insn(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_InactiveZA, false));
+ return false;
+ }
+ return true;
+}
+
+/*
+ * This utility function is for doing register extension with an
+ * optional shift. You will likely want to pass a temporary for the
+ * destination register. See DecodeRegExtend() in the ARM ARM.
+ */
+static void ext_and_shift_reg(TCGv_i64 tcg_out, TCGv_i64 tcg_in,
+ int option, unsigned int shift)
+{
+ int extsize = extract32(option, 0, 2);
+ bool is_signed = extract32(option, 2, 1);
+
+ if (is_signed) {
+ switch (extsize) {
+ case 0:
+ tcg_gen_ext8s_i64(tcg_out, tcg_in);
+ break;
+ case 1:
+ tcg_gen_ext16s_i64(tcg_out, tcg_in);
+ break;
+ case 2:
+ tcg_gen_ext32s_i64(tcg_out, tcg_in);
+ break;
+ case 3:
+ tcg_gen_mov_i64(tcg_out, tcg_in);
+ break;
+ }
+ } else {
+ switch (extsize) {
+ case 0:
+ tcg_gen_ext8u_i64(tcg_out, tcg_in);
+ break;
+ case 1:
+ tcg_gen_ext16u_i64(tcg_out, tcg_in);
+ break;
+ case 2:
+ tcg_gen_ext32u_i64(tcg_out, tcg_in);
+ break;
+ case 3:
+ tcg_gen_mov_i64(tcg_out, tcg_in);
+ break;
+ }
+ }
+
+ if (shift) {
+ tcg_gen_shli_i64(tcg_out, tcg_out, shift);
+ }
+}
+
+static inline void gen_check_sp_alignment(DisasContext *s)
+{
+ /* The AArch64 architecture mandates that (if enabled via PSTATE
+ * or SCTLR bits) there is a check that SP is 16-aligned on every
+ * SP-relative load or store (with an exception generated if it is not).
+ * In line with general QEMU practice regarding misaligned accesses,
+ * we omit these checks for the sake of guest program performance.
+ * This function is provided as a hook so we can more easily add these
+ * checks in future (possibly as a "favour catching guest program bugs
+ * over speed" user selectable option).
+ */
+}
+
+/*
+ * This provides a simple table-based lookup decoder. It is
+ * intended to be used when the relevant bits for decode are too
+ * awkwardly placed and switch/if based logic would be confusing and
+ * deeply nested. Since it's a linear search through the table, tables
+ * should be kept small.
+ *
+ * It returns the first handler where insn & mask == pattern, or
+ * NULL if there is no match.
+ * The table is terminated by an empty mask (i.e. 0)
+ */
+static inline AArch64DecodeFn *lookup_disas_fn(const AArch64DecodeTable *table,
+ uint32_t insn)
+{
+ const AArch64DecodeTable *tptr = table;
+
+ while (tptr->mask) {
+ if ((insn & tptr->mask) == tptr->pattern) {
+ return tptr->disas_fn;
+ }
+ tptr++;
+ }
+ return NULL;
+}
+
+/*
+ * The instruction disassembly implemented here matches
+ * the instruction encoding classifications in chapter C4
+ * of the ARM Architecture Reference Manual (DDI0487B_a);
+ * classification names and decode diagrams here should generally
+ * match up with those in the manual.
+ */
+
+/* Unconditional branch (immediate)
+ * 31 30 26 25 0
+ * +----+-----------+-------------------------------------+
+ * | op | 0 0 1 0 1 | imm26 |
+ * +----+-----------+-------------------------------------+
+ */
+static void disas_uncond_b_imm(DisasContext *s, uint32_t insn)
+{
+ int64_t diff = sextract32(insn, 0, 26) * 4;
+
+ if (insn & (1U << 31)) {
+ /* BL Branch with link */
+ gen_pc_plus_diff(s, cpu_reg(s, 30), curr_insn_len(s));
+ }
+
+ /* B Branch / BL Branch with link */
+ reset_btype(s);
+ gen_goto_tb(s, 0, diff);
+}
+
+/* Compare and branch (immediate)
+ * 31 30 25 24 23 5 4 0
+ * +----+-------------+----+---------------------+--------+
+ * | sf | 0 1 1 0 1 0 | op | imm19 | Rt |
+ * +----+-------------+----+---------------------+--------+
+ */
+static void disas_comp_b_imm(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, op, rt;
+ int64_t diff;
+ DisasLabel match;
+ TCGv_i64 tcg_cmp;
+
+ sf = extract32(insn, 31, 1);
+ op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */
+ rt = extract32(insn, 0, 5);
+ diff = sextract32(insn, 5, 19) * 4;
+
+ tcg_cmp = read_cpu_reg(s, rt, sf);
+ reset_btype(s);
+
+ match = gen_disas_label(s);
+ tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ,
+ tcg_cmp, 0, match.label);
+ gen_goto_tb(s, 0, 4);
+ set_disas_label(s, match);
+ gen_goto_tb(s, 1, diff);
+}
+
+/* Test and branch (immediate)
+ * 31 30 25 24 23 19 18 5 4 0
+ * +----+-------------+----+-------+-------------+------+
+ * | b5 | 0 1 1 0 1 1 | op | b40 | imm14 | Rt |
+ * +----+-------------+----+-------+-------------+------+
+ */
+static void disas_test_b_imm(DisasContext *s, uint32_t insn)
+{
+ unsigned int bit_pos, op, rt;
+ int64_t diff;
+ DisasLabel match;
+ TCGv_i64 tcg_cmp;
+
+ bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5);
+ op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */
+ diff = sextract32(insn, 5, 14) * 4;
+ rt = extract32(insn, 0, 5);
+
+ tcg_cmp = tcg_temp_new_i64();
+ tcg_gen_andi_i64(tcg_cmp, cpu_reg(s, rt), (1ULL << bit_pos));
+
+ reset_btype(s);
+
+ match = gen_disas_label(s);
+ tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ,
+ tcg_cmp, 0, match.label);
+ tcg_temp_free_i64(tcg_cmp);
+ gen_goto_tb(s, 0, 4);
+ set_disas_label(s, match);
+ gen_goto_tb(s, 1, diff);
+}
+
+/* Conditional branch (immediate)
+ * 31 25 24 23 5 4 3 0
+ * +---------------+----+---------------------+----+------+
+ * | 0 1 0 1 0 1 0 | o1 | imm19 | o0 | cond |
+ * +---------------+----+---------------------+----+------+
+ */
+static void disas_cond_b_imm(DisasContext *s, uint32_t insn)
+{
+ unsigned int cond;
+ int64_t diff;
+
+ if ((insn & (1 << 4)) || (insn & (1 << 24))) {
+ unallocated_encoding(s);
+ return;
+ }
+ diff = sextract32(insn, 5, 19) * 4;
+ cond = extract32(insn, 0, 4);
+
+ reset_btype(s);
+ if (cond < 0x0e) {
+ /* genuinely conditional branches */
+ DisasLabel match = gen_disas_label(s);
+ arm_gen_test_cc(cond, match.label);
+ gen_goto_tb(s, 0, 4);
+ set_disas_label(s, match);
+ gen_goto_tb(s, 1, diff);
+ } else {
+ /* 0xe and 0xf are both "always" conditions */
+ gen_goto_tb(s, 0, diff);
+ }
+}
+
+/* HINT instruction group, including various allocated HINTs */
+static void handle_hint(DisasContext *s, uint32_t insn,
+ unsigned int op1, unsigned int op2, unsigned int crm)
+{
+ unsigned int selector = crm << 3 | op2;
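+    /* The hint number is the 7-bit CRm:op2 field. */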
+
+ if (op1 != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (selector) {
+ case 0b00000: /* NOP */
+ break;
+ case 0b00011: /* WFI */
+ s->base.is_jmp = DISAS_WFI;
+ break;
+ case 0b00001: /* YIELD */
+        /* When running in MTTCG we don't generate jumps to the yield and
+         * WFE helpers as they won't affect the scheduling of other vCPUs.
+         * If we wanted to model WFE/SEV more completely, so that we don't
+         * busy-spin unnecessarily, we would need to do something more
+         * involved.
+         */
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
+ s->base.is_jmp = DISAS_YIELD;
+ }
+ break;
+ case 0b00010: /* WFE */
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
+ s->base.is_jmp = DISAS_WFE;
+ }
+ break;
+ case 0b00100: /* SEV */
+ case 0b00101: /* SEVL */
+ case 0b00110: /* DGH */
+ /* we treat all as NOP at least for now */
+ break;
+ case 0b00111: /* XPACLRI */
+ if (s->pauth_active) {
+ gen_helper_xpaci(cpu_X[30], cpu_env, cpu_X[30]);
+ }
+ break;
+ case 0b01000: /* PACIA1716 */
+ if (s->pauth_active) {
+ gen_helper_pacia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]);
+ }
+ break;
+ case 0b01010: /* PACIB1716 */
+ if (s->pauth_active) {
+ gen_helper_pacib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]);
+ }
+ break;
+ case 0b01100: /* AUTIA1716 */
+ if (s->pauth_active) {
+ gen_helper_autia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]);
+ }
+ break;
+ case 0b01110: /* AUTIB1716 */
+ if (s->pauth_active) {
+ gen_helper_autib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]);
+ }
+ break;
+ case 0b10000: /* ESB */
+ /* Without RAS, we must implement this as NOP. */
+ if (dc_isar_feature(aa64_ras, s)) {
+ /*
+ * QEMU does not have a source of physical SErrors,
+ * so we are only concerned with virtual SErrors.
+ * The pseudocode in the ARM for this case is
+ * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then
+ * AArch64.vESBOperation();
+ * Most of the condition can be evaluated at translation time.
+ * Test for EL2 present, and defer test for SEL2 to runtime.
+ */
+ if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) {
+ gen_helper_vesb(cpu_env);
+ }
+ }
+ break;
+ case 0b11000: /* PACIAZ */
+ if (s->pauth_active) {
+ gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30],
+ new_tmp_a64_zero(s));
+ }
+ break;
+ case 0b11001: /* PACIASP */
+ if (s->pauth_active) {
+ gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+ }
+ break;
+ case 0b11010: /* PACIBZ */
+ if (s->pauth_active) {
+ gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30],
+ new_tmp_a64_zero(s));
+ }
+ break;
+ case 0b11011: /* PACIBSP */
+ if (s->pauth_active) {
+ gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+ }
+ break;
+ case 0b11100: /* AUTIAZ */
+ if (s->pauth_active) {
+ gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30],
+ new_tmp_a64_zero(s));
+ }
+ break;
+ case 0b11101: /* AUTIASP */
+ if (s->pauth_active) {
+ gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+ }
+ break;
+ case 0b11110: /* AUTIBZ */
+ if (s->pauth_active) {
+ gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30],
+ new_tmp_a64_zero(s));
+ }
+ break;
+ case 0b11111: /* AUTIBSP */
+ if (s->pauth_active) {
+ gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+ }
+ break;
+ default:
+ /* default specified as NOP equivalent */
+ break;
+ }
+}
+
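+/* CLREX: clear the local exclusive monitor by invalidating the recorded
+ * exclusive address.
+ */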
+static void gen_clrex(DisasContext *s, uint32_t insn)
+{
+ tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+}
+
+/* CLREX, DSB, DMB, ISB */
+static void handle_sync(DisasContext *s, uint32_t insn,
+ unsigned int op1, unsigned int op2, unsigned int crm)
+{
+ TCGBar bar;
+
+ if (op1 != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (op2) {
+ case 2: /* CLREX */
+ gen_clrex(s, insn);
+ return;
+ case 4: /* DSB */
+ case 5: /* DMB */
+ switch (crm & 3) {
+ case 1: /* MBReqTypes_Reads */
+ bar = TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST;
+ break;
+ case 2: /* MBReqTypes_Writes */
+ bar = TCG_BAR_SC | TCG_MO_ST_ST;
+ break;
+ default: /* MBReqTypes_All */
+ bar = TCG_BAR_SC | TCG_MO_ALL;
+ break;
+ }
+ tcg_gen_mb(bar);
+ return;
+ case 6: /* ISB */
+        /* We need to break the TB after this insn to execute
+         * self-modifying code correctly and also to take
+ * any pending interrupts immediately.
+ */
+ reset_btype(s);
+ gen_goto_tb(s, 0, 4);
+ return;
+
+ case 7: /* SB */
+ if (crm != 0 || !dc_isar_feature(aa64_sb, s)) {
+ goto do_unallocated;
+ }
+ /*
+ * TODO: There is no speculation barrier opcode for TCG;
+ * MB and end the TB instead.
+ */
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+ gen_goto_tb(s, 0, 4);
+ return;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ return;
+ }
+}
+
+static void gen_xaflag(void)
+{
+ TCGv_i32 z = tcg_temp_new_i32();
+
+ tcg_gen_setcondi_i32(TCG_COND_EQ, z, cpu_ZF, 0);
+
+ /*
+ * (!C & !Z) << 31
+ * (!(C | Z)) << 31
+ * ~((C | Z) << 31)
+ * ~-(C | Z)
+ * (C | Z) - 1
+ */
+ tcg_gen_or_i32(cpu_NF, cpu_CF, z);
+ tcg_gen_subi_i32(cpu_NF, cpu_NF, 1);
+
+ /* !(Z & C) */
+ tcg_gen_and_i32(cpu_ZF, z, cpu_CF);
+ tcg_gen_xori_i32(cpu_ZF, cpu_ZF, 1);
+
+ /* (!C & Z) << 31 -> -(Z & ~C) */
+ tcg_gen_andc_i32(cpu_VF, z, cpu_CF);
+ tcg_gen_neg_i32(cpu_VF, cpu_VF);
+
+ /* C | Z */
+ tcg_gen_or_i32(cpu_CF, cpu_CF, z);
+
+ tcg_temp_free_i32(z);
+}
+
+static void gen_axflag(void)
+{
+ tcg_gen_sari_i32(cpu_VF, cpu_VF, 31); /* V ? -1 : 0 */
+ tcg_gen_andc_i32(cpu_CF, cpu_CF, cpu_VF); /* C & !V */
+
+ /* !(Z | V) -> !(!ZF | V) -> ZF & !V -> ZF & ~VF */
+ tcg_gen_andc_i32(cpu_ZF, cpu_ZF, cpu_VF);
+
+ tcg_gen_movi_i32(cpu_NF, 0);
+ tcg_gen_movi_i32(cpu_VF, 0);
+}
+
+/* MSR (immediate) - move immediate to processor state field */
+static void handle_msr_i(DisasContext *s, uint32_t insn,
+ unsigned int op1, unsigned int op2, unsigned int crm)
+{
+ int op = op1 << 3 | op2;
+
+ /* End the TB by default, chaining is ok. */
+ s->base.is_jmp = DISAS_TOO_MANY;
+
+ switch (op) {
+ case 0x00: /* CFINV */
+ if (crm != 0 || !dc_isar_feature(aa64_condm_4, s)) {
+ goto do_unallocated;
+ }
+ tcg_gen_xori_i32(cpu_CF, cpu_CF, 1);
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+
+ case 0x01: /* XAFlag */
+ if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) {
+ goto do_unallocated;
+ }
+ gen_xaflag();
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+
+ case 0x02: /* AXFlag */
+ if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) {
+ goto do_unallocated;
+ }
+ gen_axflag();
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+
+ case 0x03: /* UAO */
+ if (!dc_isar_feature(aa64_uao, s) || s->current_el == 0) {
+ goto do_unallocated;
+ }
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_UAO);
+ } else {
+ clear_pstate_bits(PSTATE_UAO);
+ }
+ gen_rebuild_hflags(s);
+ break;
+
+ case 0x04: /* PAN */
+ if (!dc_isar_feature(aa64_pan, s) || s->current_el == 0) {
+ goto do_unallocated;
+ }
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_PAN);
+ } else {
+ clear_pstate_bits(PSTATE_PAN);
+ }
+ gen_rebuild_hflags(s);
+ break;
+
+ case 0x05: /* SPSel */
+ if (s->current_el == 0) {
+ goto do_unallocated;
+ }
+ gen_helper_msr_i_spsel(cpu_env, tcg_constant_i32(crm & PSTATE_SP));
+ break;
+
+ case 0x19: /* SSBS */
+ if (!dc_isar_feature(aa64_ssbs, s)) {
+ goto do_unallocated;
+ }
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_SSBS);
+ } else {
+ clear_pstate_bits(PSTATE_SSBS);
+ }
+ /* Don't need to rebuild hflags since SSBS is a nop */
+ break;
+
+ case 0x1a: /* DIT */
+ if (!dc_isar_feature(aa64_dit, s)) {
+ goto do_unallocated;
+ }
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_DIT);
+ } else {
+ clear_pstate_bits(PSTATE_DIT);
+ }
+ /* There's no need to rebuild hflags because DIT is a nop */
+ break;
+
+ case 0x1e: /* DAIFSet */
+ gen_helper_msr_i_daifset(cpu_env, tcg_constant_i32(crm));
+ break;
+
+ case 0x1f: /* DAIFClear */
+ gen_helper_msr_i_daifclear(cpu_env, tcg_constant_i32(crm));
+ /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ break;
+
+ case 0x1c: /* TCO */
+ if (dc_isar_feature(aa64_mte, s)) {
+ /* Full MTE is enabled -- set the TCO bit as directed. */
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_TCO);
+ } else {
+ clear_pstate_bits(PSTATE_TCO);
+ }
+ gen_rebuild_hflags(s);
+ /* Many factors, including TCO, go into MTE_ACTIVE. */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ } else if (dc_isar_feature(aa64_mte_insn_reg, s)) {
+ /* Only "instructions accessible at EL0" -- PSTATE.TCO is WI. */
+ s->base.is_jmp = DISAS_NEXT;
+ } else {
+ goto do_unallocated;
+ }
+ break;
+
+ case 0x1b: /* SVCR* */
+ if (!dc_isar_feature(aa64_sme, s) || crm < 2 || crm > 7) {
+ goto do_unallocated;
+ }
+ if (sme_access_check(s)) {
+ int old = s->pstate_sm | (s->pstate_za << 1);
+ int new = (crm & 1) * 3;
+ int msk = (crm >> 1) & 3;
+
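+            /* SVCR bit 0 is SM and bit 1 is ZA: CRm<0> supplies the
+             * value to write and CRm<2:1> selects which bits are
+             * affected.
+             */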
+ if ((old ^ new) & msk) {
+ /* At least one bit changes. */
+ gen_helper_set_svcr(cpu_env, tcg_constant_i32(new),
+ tcg_constant_i32(msk));
+ } else {
+ s->base.is_jmp = DISAS_NEXT;
+ }
+ }
+ break;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ return;
+ }
+}
+
+static void gen_get_nzcv(TCGv_i64 tcg_rt)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ TCGv_i32 nzcv = tcg_temp_new_i32();
+
+ /* build bit 31, N */
+ tcg_gen_andi_i32(nzcv, cpu_NF, (1U << 31));
+ /* build bit 30, Z */
+ tcg_gen_setcondi_i32(TCG_COND_EQ, tmp, cpu_ZF, 0);
+ tcg_gen_deposit_i32(nzcv, nzcv, tmp, 30, 1);
+ /* build bit 29, C */
+ tcg_gen_deposit_i32(nzcv, nzcv, cpu_CF, 29, 1);
+ /* build bit 28, V */
+ tcg_gen_shri_i32(tmp, cpu_VF, 31);
+ tcg_gen_deposit_i32(nzcv, nzcv, tmp, 28, 1);
+ /* generate result */
+ tcg_gen_extu_i32_i64(tcg_rt, nzcv);
+
+ tcg_temp_free_i32(nzcv);
+ tcg_temp_free_i32(tmp);
+}
+
+static void gen_set_nzcv(TCGv_i64 tcg_rt)
+{
+ TCGv_i32 nzcv = tcg_temp_new_i32();
+
+ /* take NZCV from R[t] */
+ tcg_gen_extrl_i64_i32(nzcv, tcg_rt);
+
+ /* bit 31, N */
+ tcg_gen_andi_i32(cpu_NF, nzcv, (1U << 31));
+ /* bit 30, Z */
+ tcg_gen_andi_i32(cpu_ZF, nzcv, (1 << 30));
+ tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_ZF, cpu_ZF, 0);
+ /* bit 29, C */
+ tcg_gen_andi_i32(cpu_CF, nzcv, (1 << 29));
+ tcg_gen_shri_i32(cpu_CF, cpu_CF, 29);
+ /* bit 28, V */
+ tcg_gen_andi_i32(cpu_VF, nzcv, (1 << 28));
+ tcg_gen_shli_i32(cpu_VF, cpu_VF, 3);
+ tcg_temp_free_i32(nzcv);
+}
+
+static void gen_sysreg_undef(DisasContext *s, bool isread,
+ uint8_t op0, uint8_t op1, uint8_t op2,
+ uint8_t crn, uint8_t crm, uint8_t rt)
+{
+ /*
+ * Generate code to emit an UNDEF with correct syndrome
+ * information for a failed system register access.
+ * This is EC_UNCATEGORIZED (ie a standard UNDEF) in most cases,
+ * but if FEAT_IDST is implemented then read accesses to registers
+ * in the feature ID space are reported with the EC_SYSTEMREGISTERTRAP
+ * syndrome.
+ */
+ uint32_t syndrome;
+
+ if (isread && dc_isar_feature(aa64_ids, s) &&
+ arm_cpreg_encoding_in_idspace(op0, op1, op2, crn, crm)) {
+ syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread);
+ } else {
+ syndrome = syn_uncategorized();
+ }
+ gen_exception_insn(s, 0, EXCP_UDEF, syndrome);
+}
+
+/* MRS - move from system register
+ * MSR (register) - move to system register
+ * SYS
+ * SYSL
+ * These are all essentially the same insn in 'read' and 'write'
+ * versions, with varying op0 fields.
+ */
+static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
+ unsigned int op0, unsigned int op1, unsigned int op2,
+ unsigned int crn, unsigned int crm, unsigned int rt)
+{
+ uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP,
+ crn, crm, op0, op1, op2);
+ const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
+ TCGv_ptr tcg_ri = NULL;
+ TCGv_i64 tcg_rt;
+
+ if (!ri) {
+ /* Unknown register; this might be a guest error or a QEMU
+ * unimplemented feature.
+ */
+ qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch64 "
+ "system register op0:%d op1:%d crn:%d crm:%d op2:%d\n",
+ isread ? "read" : "write", op0, op1, crn, crm, op2);
+ gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt);
+ return;
+ }
+
+ /* Check access permissions */
+ if (!cp_access_ok(s->current_el, ri, isread)) {
+ gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt);
+ return;
+ }
+
+ if (ri->accessfn || (ri->fgt && s->fgt_active)) {
+ /* Emit code to perform further access permissions checks at
+ * runtime; this may result in an exception.
+ */
+ uint32_t syndrome;
+
+ syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread);
+ gen_a64_update_pc(s, 0);
+ tcg_ri = tcg_temp_new_ptr();
+ gen_helper_access_check_cp_reg(tcg_ri, cpu_env,
+ tcg_constant_i32(key),
+ tcg_constant_i32(syndrome),
+ tcg_constant_i32(isread));
+ } else if (ri->type & ARM_CP_RAISES_EXC) {
+ /*
+ * The readfn or writefn might raise an exception;
+ * synchronize the CPU state in case it does.
+ */
+ gen_a64_update_pc(s, 0);
+ }
+
+ /* Handle special cases first */
+ switch (ri->type & ARM_CP_SPECIAL_MASK) {
+ case 0:
+ break;
+ case ARM_CP_NOP:
+ goto exit;
+ case ARM_CP_NZCV:
+ tcg_rt = cpu_reg(s, rt);
+ if (isread) {
+ gen_get_nzcv(tcg_rt);
+ } else {
+ gen_set_nzcv(tcg_rt);
+ }
+ goto exit;
+ case ARM_CP_CURRENTEL:
+ /* Reads as current EL value from pstate, which is
+ * guaranteed to be constant by the tb flags.
+ */
+ tcg_rt = cpu_reg(s, rt);
+ tcg_gen_movi_i64(tcg_rt, s->current_el << 2);
+ goto exit;
+ case ARM_CP_DC_ZVA:
+ /* Writes clear the aligned block of memory which rt points into. */
+ if (s->mte_active[0]) {
+ int desc = 0;
+
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+
+ tcg_rt = new_tmp_a64(s);
+ gen_helper_mte_check_zva(tcg_rt, cpu_env,
+ tcg_constant_i32(desc), cpu_reg(s, rt));
+ } else {
+ tcg_rt = clean_data_tbi(s, cpu_reg(s, rt));
+ }
+ gen_helper_dc_zva(cpu_env, tcg_rt);
+ goto exit;
+ case ARM_CP_DC_GVA:
+ {
+ TCGv_i64 clean_addr, tag;
+
+ /*
+ * DC_GVA, like DC_ZVA, requires that we supply the original
+ * pointer for an invalid page. Probe that address first.
+ */
+ tcg_rt = cpu_reg(s, rt);
+ clean_addr = clean_data_tbi(s, tcg_rt);
+ gen_probe_access(s, clean_addr, MMU_DATA_STORE, MO_8);
+
+ if (s->ata) {
+ /* Extract the tag from the register to match STZGM. */
+ tag = tcg_temp_new_i64();
+ tcg_gen_shri_i64(tag, tcg_rt, 56);
+ gen_helper_stzgm_tags(cpu_env, clean_addr, tag);
+ tcg_temp_free_i64(tag);
+ }
+ }
+ goto exit;
+ case ARM_CP_DC_GZVA:
+ {
+ TCGv_i64 clean_addr, tag;
+
+ /* For DC_GZVA, we can rely on DC_ZVA for the proper fault. */
+ tcg_rt = cpu_reg(s, rt);
+ clean_addr = clean_data_tbi(s, tcg_rt);
+ gen_helper_dc_zva(cpu_env, clean_addr);
+
+ if (s->ata) {
+ /* Extract the tag from the register to match STZGM. */
+ tag = tcg_temp_new_i64();
+ tcg_gen_shri_i64(tag, tcg_rt, 56);
+ gen_helper_stzgm_tags(cpu_env, clean_addr, tag);
+ tcg_temp_free_i64(tag);
+ }
+ }
+ goto exit;
+ default:
+ g_assert_not_reached();
+ }
+ if ((ri->type & ARM_CP_FPU) && !fp_access_check_only(s)) {
+ goto exit;
+ } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) {
+ goto exit;
+ } else if ((ri->type & ARM_CP_SME) && !sme_access_check(s)) {
+ goto exit;
+ }
+
+ if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+ gen_io_start();
+ }
+
+ tcg_rt = cpu_reg(s, rt);
+
+ if (isread) {
+ if (ri->type & ARM_CP_CONST) {
+ tcg_gen_movi_i64(tcg_rt, ri->resetvalue);
+ } else if (ri->readfn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ gen_helper_get_cp_reg64(tcg_rt, cpu_env, tcg_ri);
+ } else {
+ tcg_gen_ld_i64(tcg_rt, cpu_env, ri->fieldoffset);
+ }
+ } else {
+ if (ri->type & ARM_CP_CONST) {
+ /* If not forbidden by access permissions, treat as WI */
+ goto exit;
+ } else if (ri->writefn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ gen_helper_set_cp_reg64(cpu_env, tcg_ri, tcg_rt);
+ } else {
+ tcg_gen_st_i64(tcg_rt, cpu_env, ri->fieldoffset);
+ }
+ }
+
+ if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+ /* I/O operations must end the TB here (whether read or write) */
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ }
+ if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+ /*
+         * A write to any coprocessor register that ends a TB
+ * must rebuild the hflags for the next TB.
+ */
+ gen_rebuild_hflags(s);
+ /*
+ * We default to ending the TB on a coprocessor register write,
+ * but allow this to be suppressed by the register definition
+ * (usually only necessary to work around guest bugs).
+ */
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ }
+
+ exit:
+ if (tcg_ri) {
+ tcg_temp_free_ptr(tcg_ri);
+ }
+}
+
+/* System
+ * 31 22 21 20 19 18 16 15 12 11 8 7 5 4 0
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ * | 1 1 0 1 0 1 0 1 0 0 | L | op0 | op1 | CRn | CRm | op2 | Rt |
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ */
+static void disas_system(DisasContext *s, uint32_t insn)
+{
+ unsigned int l, op0, op1, crn, crm, op2, rt;
+ l = extract32(insn, 21, 1);
+ op0 = extract32(insn, 19, 2);
+ op1 = extract32(insn, 16, 3);
+ crn = extract32(insn, 12, 4);
+ crm = extract32(insn, 8, 4);
+ op2 = extract32(insn, 5, 3);
+ rt = extract32(insn, 0, 5);
+
+ if (op0 == 0) {
+ if (l || rt != 31) {
+ unallocated_encoding(s);
+ return;
+ }
+ switch (crn) {
+ case 2: /* HINT (including allocated hints like NOP, YIELD, etc) */
+ handle_hint(s, insn, op1, op2, crm);
+ break;
+ case 3: /* CLREX, DSB, DMB, ISB */
+ handle_sync(s, insn, op1, op2, crm);
+ break;
+ case 4: /* MSR (immediate) */
+ handle_msr_i(s, insn, op1, op2, crm);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+ return;
+ }
+ handle_sys(s, insn, l, op0, op1, op2, crn, crm, rt);
+}
+
+/* Exception generation
+ *
+ * 31 24 23 21 20 5 4 2 1 0
+ * +-----------------+-----+------------------------+-----+----+
+ * | 1 1 0 1 0 1 0 0 | opc | imm16 | op2 | LL |
+ * +-----------------------+------------------------+----------+
+ */
+static void disas_exc(DisasContext *s, uint32_t insn)
+{
+ int opc = extract32(insn, 21, 3);
+ int op2_ll = extract32(insn, 0, 5);
+ int imm16 = extract32(insn, 5, 16);
+ uint32_t syndrome;
+
+ switch (opc) {
+ case 0:
+ /* For SVC, HVC and SMC we advance the single-step state
+ * machine before taking the exception. This is architecturally
+ * mandated, to ensure that single-stepping a system call
+ * instruction works properly.
+ */
+ switch (op2_ll) {
+ case 1: /* SVC */
+ syndrome = syn_aa64_svc(imm16);
+ if (s->fgt_svc) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2);
+ break;
+ }
+ gen_ss_advance(s);
+ gen_exception_insn(s, 4, EXCP_SWI, syndrome);
+ break;
+ case 2: /* HVC */
+ if (s->current_el == 0) {
+ unallocated_encoding(s);
+ break;
+ }
+ /* The pre HVC helper handles cases when HVC gets trapped
+ * as an undefined insn by runtime configuration.
+ */
+ gen_a64_update_pc(s, 0);
+ gen_helper_pre_hvc(cpu_env);
+ gen_ss_advance(s);
+ gen_exception_insn_el(s, 4, EXCP_HVC, syn_aa64_hvc(imm16), 2);
+ break;
+ case 3: /* SMC */
+ if (s->current_el == 0) {
+ unallocated_encoding(s);
+ break;
+ }
+ gen_a64_update_pc(s, 0);
+ gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa64_smc(imm16)));
+ gen_ss_advance(s);
+ gen_exception_insn_el(s, 4, EXCP_SMC, syn_aa64_smc(imm16), 3);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+ break;
+ case 1:
+ if (op2_ll != 0) {
+ unallocated_encoding(s);
+ break;
+ }
+ /* BRK */
+ gen_exception_bkpt_insn(s, syn_aa64_bkpt(imm16));
+ break;
+ case 2:
+ if (op2_ll != 0) {
+ unallocated_encoding(s);
+ break;
+ }
+ /* HLT. This has two purposes.
+ * Architecturally, it is an external halting debug instruction.
+             * Since QEMU doesn't implement external debug, we treat this as
+             * the architecture requires when halting debug is disabled: it
+             * will UNDEF.
+ * Secondly, "HLT 0xf000" is the A64 semihosting syscall instruction.
+ */
+ if (semihosting_enabled(s->current_el == 0) && imm16 == 0xf000) {
+ gen_exception_internal_insn(s, EXCP_SEMIHOST);
+ } else {
+ unallocated_encoding(s);
+ }
+ break;
+ case 5:
+ if (op2_ll < 1 || op2_ll > 3) {
+ unallocated_encoding(s);
+ break;
+ }
+ /* DCPS1, DCPS2, DCPS3 */
+ unallocated_encoding(s);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* Unconditional branch (register)
+ * 31 25 24 21 20 16 15 10 9 5 4 0
+ * +---------------+-------+-------+-------+------+-------+
+ * | 1 1 0 1 0 1 1 | opc | op2 | op3 | Rn | op4 |
+ * +---------------+-------+-------+-------+------+-------+
+ */
+static void disas_uncond_b_reg(DisasContext *s, uint32_t insn)
+{
+ unsigned int opc, op2, op3, rn, op4;
+ unsigned btype_mod = 2; /* 0: BR, 1: BLR, 2: other */
+ TCGv_i64 dst;
+ TCGv_i64 modifier;
+
+ opc = extract32(insn, 21, 4);
+ op2 = extract32(insn, 16, 5);
+ op3 = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ op4 = extract32(insn, 0, 5);
+
+ if (op2 != 0x1f) {
+ goto do_unallocated;
+ }
+
+ switch (opc) {
+ case 0: /* BR */
+ case 1: /* BLR */
+ case 2: /* RET */
+ btype_mod = opc;
+ switch (op3) {
+ case 0:
+ /* BR, BLR, RET */
+ if (op4 != 0) {
+ goto do_unallocated;
+ }
+ dst = cpu_reg(s, rn);
+ break;
+
+ case 2:
+ case 3:
+ if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ if (opc == 2) {
+ /* RETAA, RETAB */
+ if (rn != 0x1f || op4 != 0x1f) {
+ goto do_unallocated;
+ }
+ rn = 30;
+ modifier = cpu_X[31];
+ } else {
+ /* BRAAZ, BRABZ, BLRAAZ, BLRABZ */
+ if (op4 != 0x1f) {
+ goto do_unallocated;
+ }
+ modifier = new_tmp_a64_zero(s);
+ }
+ if (s->pauth_active) {
+ dst = new_tmp_a64(s);
+ if (op3 == 2) {
+ gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier);
+ } else {
+ gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier);
+ }
+ } else {
+ dst = cpu_reg(s, rn);
+ }
+ break;
+
+ default:
+ goto do_unallocated;
+ }
+ /* BLR also needs to load return address */
+ if (opc == 1) {
+ TCGv_i64 lr = cpu_reg(s, 30);
+ if (dst == lr) {
+ TCGv_i64 tmp = new_tmp_a64(s);
+ tcg_gen_mov_i64(tmp, dst);
+ dst = tmp;
+ }
+ gen_pc_plus_diff(s, lr, curr_insn_len(s));
+ }
+ gen_a64_set_pc(s, dst);
+ break;
+
+ case 8: /* BRAA */
+ case 9: /* BLRAA */
+ if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ if ((op3 & ~1) != 2) {
+ goto do_unallocated;
+ }
+ btype_mod = opc & 1;
+ if (s->pauth_active) {
+ dst = new_tmp_a64(s);
+ modifier = cpu_reg_sp(s, op4);
+ if (op3 == 2) {
+ gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier);
+ } else {
+ gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier);
+ }
+ } else {
+ dst = cpu_reg(s, rn);
+ }
+ /* BLRAA also needs to load return address */
+ if (opc == 9) {
+ TCGv_i64 lr = cpu_reg(s, 30);
+ if (dst == lr) {
+ TCGv_i64 tmp = new_tmp_a64(s);
+ tcg_gen_mov_i64(tmp, dst);
+ dst = tmp;
+ }
+ gen_pc_plus_diff(s, lr, curr_insn_len(s));
+ }
+ gen_a64_set_pc(s, dst);
+ break;
+
+ case 4: /* ERET */
+ if (s->current_el == 0) {
+ goto do_unallocated;
+ }
+ switch (op3) {
+ case 0: /* ERET */
+ if (op4 != 0) {
+ goto do_unallocated;
+ }
+ if (s->fgt_eret) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2);
+ return;
+ }
+ dst = tcg_temp_new_i64();
+ tcg_gen_ld_i64(dst, cpu_env,
+ offsetof(CPUARMState, elr_el[s->current_el]));
+ break;
+
+ case 2: /* ERETAA */
+ case 3: /* ERETAB */
+ if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ if (rn != 0x1f || op4 != 0x1f) {
+ goto do_unallocated;
+ }
+ /* The FGT trap takes precedence over an auth trap. */
+ if (s->fgt_eret) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2);
+ return;
+ }
+ dst = tcg_temp_new_i64();
+ tcg_gen_ld_i64(dst, cpu_env,
+ offsetof(CPUARMState, elr_el[s->current_el]));
+ if (s->pauth_active) {
+ modifier = cpu_X[31];
+ if (op3 == 2) {
+ gen_helper_autia(dst, cpu_env, dst, modifier);
+ } else {
+ gen_helper_autib(dst, cpu_env, dst, modifier);
+ }
+ }
+ break;
+
+ default:
+ goto do_unallocated;
+ }
+ if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
+ gen_io_start();
+ }
+
+ gen_helper_exception_return(cpu_env, dst);
+ tcg_temp_free_i64(dst);
+ /* Must exit loop to check un-masked IRQs */
+ s->base.is_jmp = DISAS_EXIT;
+ return;
+
+ case 5: /* DRPS */
+ if (op3 != 0 || op4 != 0 || rn != 0x1f) {
+ goto do_unallocated;
+ } else {
+ unallocated_encoding(s);
+ }
+ return;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (btype_mod) {
+ case 0: /* BR */
+ if (dc_isar_feature(aa64_bti, s)) {
+ /* BR to {x16,x17} or !guard -> 1, else 3. */
+ set_btype(s, rn == 16 || rn == 17 || !s->guarded_page ? 1 : 3);
+ }
+ break;
+
+ case 1: /* BLR */
+ if (dc_isar_feature(aa64_bti, s)) {
+ /* BLR sets BTYPE to 2, regardless of source guarded page. */
+ set_btype(s, 2);
+ }
+ break;
+
+ default: /* RET or none of the above. */
+ /* BTYPE will be set to 0 by normal end-of-insn processing. */
+ break;
+ }
+
+ s->base.is_jmp = DISAS_JUMP;
+}
+
+/* Branches, exception generating and system instructions */
+static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
+{
+ switch (extract32(insn, 25, 7)) {
+ case 0x0a: case 0x0b:
+ case 0x4a: case 0x4b: /* Unconditional branch (immediate) */
+ disas_uncond_b_imm(s, insn);
+ break;
+ case 0x1a: case 0x5a: /* Compare & branch (immediate) */
+ disas_comp_b_imm(s, insn);
+ break;
+ case 0x1b: case 0x5b: /* Test & branch (immediate) */
+ disas_test_b_imm(s, insn);
+ break;
+ case 0x2a: /* Conditional branch (immediate) */
+ disas_cond_b_imm(s, insn);
+ break;
+ case 0x6a: /* Exception generation / System */
+ if (insn & (1 << 24)) {
+ if (extract32(insn, 22, 2) == 0) {
+ disas_system(s, insn);
+ } else {
+ unallocated_encoding(s);
+ }
+ } else {
+ disas_exc(s, insn);
+ }
+ break;
+ case 0x6b: /* Unconditional branch (register) */
+ disas_uncond_b_reg(s, insn);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/*
+ * Load/Store exclusive instructions are implemented by remembering
+ * the value/address loaded, and seeing if these are the same
+ * when the store is performed. This is not actually the architecturally
+ * mandated semantics, but it works for typical guest code sequences
+ * and avoids having to monitor regular stores.
+ *
+ * The store exclusive uses the atomic cmpxchg primitives to avoid
+ * races in multi-threaded linux-user and when MTTCG softmmu is
+ * enabled.
+ */
+static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
+ TCGv_i64 addr, int size, bool is_pair)
+{
+ int idx = get_mem_index(s);
+ MemOp memop = s->be_data;
+
+ g_assert(size <= 3);
+ if (is_pair) {
+ g_assert(size >= 2);
+ if (size == 2) {
+ /* The pair must be single-copy atomic for the doubleword. */
+ memop |= MO_64 | MO_ALIGN;
+ tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop);
+ if (s->be_data == MO_LE) {
+ tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 0, 32);
+ tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 32, 32);
+ } else {
+ tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 32, 32);
+ tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 0, 32);
+ }
+ } else {
+ /* The pair must be single-copy atomic for *each* doubleword, not
+               the entire quadword; however, it must be quadword aligned. */
+ memop |= MO_64;
+ tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx,
+ memop | MO_ALIGN_16);
+
+ TCGv_i64 addr2 = tcg_temp_new_i64();
+ tcg_gen_addi_i64(addr2, addr, 8);
+ tcg_gen_qemu_ld_i64(cpu_exclusive_high, addr2, idx, memop);
+ tcg_temp_free_i64(addr2);
+
+ tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val);
+ tcg_gen_mov_i64(cpu_reg(s, rt2), cpu_exclusive_high);
+ }
+ } else {
+ memop |= size | MO_ALIGN;
+ tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop);
+ tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val);
+ }
+ tcg_gen_mov_i64(cpu_exclusive_addr, addr);
+}
+
+static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
+ TCGv_i64 addr, int size, int is_pair)
+{
+ /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]
+ * && (!is_pair || env->exclusive_high == [addr + datasize])) {
+ * [addr] = {Rt};
+ * if (is_pair) {
+ * [addr + datasize] = {Rt2};
+ * }
+ * {Rd} = 0;
+ * } else {
+ * {Rd} = 1;
+ * }
+ * env->exclusive_addr = -1;
+ */
+ TCGLabel *fail_label = gen_new_label();
+ TCGLabel *done_label = gen_new_label();
+ TCGv_i64 tmp;
+
+ tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
+
+ tmp = tcg_temp_new_i64();
+ if (is_pair) {
+ if (size == 2) {
+ if (s->be_data == MO_LE) {
+ tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2));
+ } else {
+ tcg_gen_concat32_i64(tmp, cpu_reg(s, rt2), cpu_reg(s, rt));
+ }
+ tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr,
+ cpu_exclusive_val, tmp,
+ get_mem_index(s),
+ MO_64 | MO_ALIGN | s->be_data);
+ tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val);
+ } else {
+ TCGv_i128 t16 = tcg_temp_new_i128();
+ TCGv_i128 c16 = tcg_temp_new_i128();
+ TCGv_i64 a, b;
+
+ if (s->be_data == MO_LE) {
+ tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt), cpu_reg(s, rt2));
+ tcg_gen_concat_i64_i128(c16, cpu_exclusive_val,
+ cpu_exclusive_high);
+ } else {
+ tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt2), cpu_reg(s, rt));
+ tcg_gen_concat_i64_i128(c16, cpu_exclusive_high,
+ cpu_exclusive_val);
+ }
+
+ tcg_gen_atomic_cmpxchg_i128(t16, cpu_exclusive_addr, c16, t16,
+ get_mem_index(s),
+ MO_128 | MO_ALIGN | s->be_data);
+ tcg_temp_free_i128(c16);
+
+ a = tcg_temp_new_i64();
+ b = tcg_temp_new_i64();
+ if (s->be_data == MO_LE) {
+ tcg_gen_extr_i128_i64(a, b, t16);
+ } else {
+ tcg_gen_extr_i128_i64(b, a, t16);
+ }
+
+ tcg_gen_xor_i64(a, a, cpu_exclusive_val);
+ tcg_gen_xor_i64(b, b, cpu_exclusive_high);
+ tcg_gen_or_i64(tmp, a, b);
+ tcg_temp_free_i64(a);
+ tcg_temp_free_i64(b);
+ tcg_temp_free_i128(t16);
+
+ tcg_gen_setcondi_i64(TCG_COND_NE, tmp, tmp, 0);
+ }
+ } else {
+ tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val,
+ cpu_reg(s, rt), get_mem_index(s),
+ size | MO_ALIGN | s->be_data);
+ tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val);
+ }
+ tcg_gen_mov_i64(cpu_reg(s, rd), tmp);
+ tcg_temp_free_i64(tmp);
+ tcg_gen_br(done_label);
+
+ gen_set_label(fail_label);
+ tcg_gen_movi_i64(cpu_reg(s, rd), 1);
+ gen_set_label(done_label);
+ tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+}
+
+static void gen_compare_and_swap(DisasContext *s, int rs, int rt,
+ int rn, int size)
+{
+ TCGv_i64 tcg_rs = cpu_reg(s, rs);
+ TCGv_i64 tcg_rt = cpu_reg(s, rt);
+ int memidx = get_mem_index(s);
+ TCGv_i64 clean_addr;
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size);
+ tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx,
+ size | MO_ALIGN | s->be_data);
+}
+
+static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt,
+ int rn, int size)
+{
+ TCGv_i64 s1 = cpu_reg(s, rs);
+ TCGv_i64 s2 = cpu_reg(s, rs + 1);
+ TCGv_i64 t1 = cpu_reg(s, rt);
+ TCGv_i64 t2 = cpu_reg(s, rt + 1);
+ TCGv_i64 clean_addr;
+ int memidx = get_mem_index(s);
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ /* This is a single atomic access, despite the "pair". */
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1);
+
+ if (size == 2) {
+ TCGv_i64 cmp = tcg_temp_new_i64();
+ TCGv_i64 val = tcg_temp_new_i64();
+
+ if (s->be_data == MO_LE) {
+ tcg_gen_concat32_i64(val, t1, t2);
+ tcg_gen_concat32_i64(cmp, s1, s2);
+ } else {
+ tcg_gen_concat32_i64(val, t2, t1);
+ tcg_gen_concat32_i64(cmp, s2, s1);
+ }
+
+ tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx,
+ MO_64 | MO_ALIGN | s->be_data);
+ tcg_temp_free_i64(val);
+
+ if (s->be_data == MO_LE) {
+ tcg_gen_extr32_i64(s1, s2, cmp);
+ } else {
+ tcg_gen_extr32_i64(s2, s1, cmp);
+ }
+ tcg_temp_free_i64(cmp);
+ } else {
+ TCGv_i128 cmp = tcg_temp_new_i128();
+ TCGv_i128 val = tcg_temp_new_i128();
+
+ if (s->be_data == MO_LE) {
+ tcg_gen_concat_i64_i128(val, t1, t2);
+ tcg_gen_concat_i64_i128(cmp, s1, s2);
+ } else {
+ tcg_gen_concat_i64_i128(val, t2, t1);
+ tcg_gen_concat_i64_i128(cmp, s2, s1);
+ }
+
+ tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx,
+ MO_128 | MO_ALIGN | s->be_data);
+ tcg_temp_free_i128(val);
+
+ if (s->be_data == MO_LE) {
+ tcg_gen_extr_i128_i64(s1, s2, cmp);
+ } else {
+ tcg_gen_extr_i128_i64(s2, s1, cmp);
+ }
+ tcg_temp_free_i128(cmp);
+ }
+}
+
+/* Compute the ISS "Sixty-Four" (SF) flag: true if the transfer register
+ * is 64 bits wide. This logic is derived from the ARMv8 specs for LDR
+ * (Shared decode for all encodings).
+ */
+static bool disas_ldst_compute_iss_sf(int size, bool is_signed, int opc)
+{
+ int opc0 = extract32(opc, 0, 1);
+ int regsize;
+
+ if (is_signed) {
+ regsize = opc0 ? 32 : 64;
+ } else {
+ regsize = size == 3 ? 64 : 32;
+ }
+ return regsize == 64;
+}
+
+/* Load/store exclusive
+ *
+ * 31 30 29 24 23 22 21 20 16 15 14 10 9 5 4 0
+ * +-----+-------------+----+---+----+------+----+-------+------+------+
+ * | sz | 0 0 1 0 0 0 | o2 | L | o1 | Rs | o0 | Rt2 | Rn | Rt |
+ * +-----+-------------+----+---+----+------+----+-------+------+------+
+ *
+ * sz: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64 bit
+ * L: 0 -> store, 1 -> load
+ * o2: 0 -> exclusive, 1 -> not
+ * o1: 0 -> single register, 1 -> register pair
+ * o0: 1 -> load-acquire/store-release, 0 -> not
+ */
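+/*
+ * Decode example (illustrative): LDAXR X0, [X1] has sz=11, o2=0, L=1,
+ * o1=0 and o0=1 (Rs and Rt2 both 11111), so o2_L_o1_o0 below is
+ * 0b0101 = 0x5, selecting the LDAXR case.
+ */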
+static void disas_ldst_excl(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rt2 = extract32(insn, 10, 5);
+ int rs = extract32(insn, 16, 5);
+ int is_lasr = extract32(insn, 15, 1);
+ int o2_L_o1_o0 = extract32(insn, 21, 3) * 2 | is_lasr;
+ int size = extract32(insn, 30, 2);
+ TCGv_i64 clean_addr;
+
+ switch (o2_L_o1_o0) {
+ case 0x0: /* STXR */
+ case 0x1: /* STLXR */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ if (is_lasr) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ true, rn != 31, size);
+ gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false);
+ return;
+
+ case 0x4: /* LDXR */
+ case 0x5: /* LDAXR */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ false, rn != 31, size);
+ s->is_ldex = true;
+ gen_load_exclusive(s, rt, rt2, clean_addr, size, false);
+ if (is_lasr) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ }
+ return;
+
+ case 0x8: /* STLLR */
+ if (!dc_isar_feature(aa64_lor, s)) {
+ break;
+ }
+ /* StoreLORelease is the same as Store-Release for QEMU. */
+ /* fall through */
+ case 0x9: /* STLR */
+ /* Generate ISS for non-exclusive accesses including LASR. */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ true, rn != 31, size);
+ /* TODO: ARMv8.4-LSE SCTLR.nAA */
+ do_gpr_st(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, true, rt,
+ disas_ldst_compute_iss_sf(size, false, 0), is_lasr);
+ return;
+
+ case 0xc: /* LDLAR */
+ if (!dc_isar_feature(aa64_lor, s)) {
+ break;
+ }
+ /* LoadLOAcquire is the same as Load-Acquire for QEMU. */
+ /* fall through */
+ case 0xd: /* LDAR */
+ /* Generate ISS for non-exclusive accesses including LASR. */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ false, rn != 31, size);
+ /* TODO: ARMv8.4-LSE SCTLR.nAA */
+ do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, false, true,
+ rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr);
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ return;
+
+ case 0x2: case 0x3: /* CASP / STXP */
+ if (size & 2) { /* STXP / STLXP */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ if (is_lasr) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ true, rn != 31, size);
+ gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true);
+ return;
+ }
+ if (rt2 == 31
+ && ((rt | rs) & 1) == 0
+ && dc_isar_feature(aa64_atomics, s)) {
+ /* CASP / CASPL */
+ gen_compare_and_swap_pair(s, rs, rt, rn, size | 2);
+ return;
+ }
+ break;
+
+ case 0x6: case 0x7: /* CASPA / LDXP */
+ if (size & 2) { /* LDXP / LDAXP */
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
+ false, rn != 31, size);
+ s->is_ldex = true;
+ gen_load_exclusive(s, rt, rt2, clean_addr, size, true);
+ if (is_lasr) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ }
+ return;
+ }
+ if (rt2 == 31
+ && ((rt | rs) & 1) == 0
+ && dc_isar_feature(aa64_atomics, s)) {
+ /* CASPA / CASPAL */
+ gen_compare_and_swap_pair(s, rs, rt, rn, size | 2);
+ return;
+ }
+ break;
+
+ case 0xa: /* CAS */
+ case 0xb: /* CASL */
+ case 0xe: /* CASA */
+ case 0xf: /* CASAL */
+ if (rt2 == 31 && dc_isar_feature(aa64_atomics, s)) {
+ gen_compare_and_swap(s, rs, rt, rn, size);
+ return;
+ }
+ break;
+ }
+ unallocated_encoding(s);
+}
+
+/*
+ * Load register (literal)
+ *
+ * 31 30 29 27 26 25 24 23 5 4 0
+ * +-----+-------+---+-----+-------------------+-------+
+ * | opc | 0 1 1 | V | 0 0 | imm19 | Rt |
+ * +-----+-------+---+-----+-------------------+-------+
+ *
+ * V: 1 -> vector (simd/fp)
+ * opc (non-vector): 00 -> 32 bit, 01 -> 64 bit,
+ * 10-> 32 bit signed, 11 -> prefetch
+ * opc (vector): 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit (11 unallocated)
+ */
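+/*
+ * Decode example (illustrative): LDR X2, <label> with the literal 64
+ * bytes after this instruction has opc=01 (64-bit GPR), V=0 and
+ * imm19=16, since imm19 is the word (4-byte) scaled PC-relative offset.
+ */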
+static void disas_ld_lit(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int64_t imm = sextract32(insn, 5, 19) << 2;
+ bool is_vector = extract32(insn, 26, 1);
+ int opc = extract32(insn, 30, 2);
+ bool is_signed = false;
+ int size = 2;
+ TCGv_i64 tcg_rt, clean_addr;
+
+ if (is_vector) {
+ if (opc == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ size = 2 + opc;
+ if (!fp_access_check(s)) {
+ return;
+ }
+ } else {
+ if (opc == 3) {
+ /* PRFM (literal) : prefetch */
+ return;
+ }
+ size = 2 + extract32(opc, 0, 1);
+ is_signed = extract32(opc, 1, 1);
+ }
+
+ tcg_rt = cpu_reg(s, rt);
+
+ clean_addr = new_tmp_a64(s);
+ gen_pc_plus_diff(s, clean_addr, imm);
+ if (is_vector) {
+ do_fp_ld(s, rt, clean_addr, size);
+ } else {
+ /* Only unsigned 32bit loads target 32bit registers. */
+ bool iss_sf = opc != 0;
+
+ do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN,
+ false, true, rt, iss_sf, false);
+ }
+}
+
+/*
+ * LDNP (Load Pair - non-temporal hint)
+ * LDP (Load Pair - non vector)
+ * LDPSW (Load Pair Signed Word - non vector)
+ * STNP (Store Pair - non-temporal hint)
+ * STP (Store Pair - non vector)
+ * LDNP (Load Pair of SIMD&FP - non-temporal hint)
+ * LDP (Load Pair of SIMD&FP)
+ * STNP (Store Pair of SIMD&FP - non-temporal hint)
+ * STP (Store Pair of SIMD&FP)
+ *
+ * 31 30 29 27 26 25 24 23 22 21 15 14 10 9 5 4 0
+ * +-----+-------+---+---+-------+---+-----------------------------+
+ * | opc | 1 0 1 | V | 0 | index | L | imm7 | Rt2 | Rn | Rt |
+ * +-----+-------+---+---+-------+---+-------+-------+------+------+
+ *
+ * opc: LDP/STP/LDNP/STNP 00 -> 32 bit, 10 -> 64 bit
+ * LDPSW/STGP 01
+ * LDP/STP/LDNP/STNP (SIMD) 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit
+ * V: 0 -> GPR, 1 -> Vector
+ * idx: 00 -> signed offset with non-temporal hint, 01 -> post-index,
+ * 10 -> signed offset, 11 -> pre-index
+ * L: 0 -> Store 1 -> Load
+ *
+ * Rt, Rt2 = GPR or SIMD registers to be stored
+ * Rn = general purpose register containing address
+ * imm7 = signed offset (multiple of 4 or 8 depending on size)
+ */
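+/*
+ * Decode example (illustrative): LDP X0, X1, [SP, #16] is the signed
+ * offset form with opc=10 (64-bit), V=0, L=1, index=10 and imm7=2;
+ * the raw imm7 is scaled below by the access size (offset <<= size),
+ * giving 16 bytes.
+ */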
+static void disas_ldst_pair(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rt2 = extract32(insn, 10, 5);
+ uint64_t offset = sextract64(insn, 15, 7);
+ int index = extract32(insn, 23, 2);
+ bool is_vector = extract32(insn, 26, 1);
+ bool is_load = extract32(insn, 22, 1);
+ int opc = extract32(insn, 30, 2);
+
+ bool is_signed = false;
+ bool postindex = false;
+ bool wback = false;
+ bool set_tag = false;
+
+ TCGv_i64 clean_addr, dirty_addr;
+
+ int size;
+
+ if (opc == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (is_vector) {
+ size = 2 + opc;
+ } else if (opc == 1 && !is_load) {
+ /* STGP */
+ if (!dc_isar_feature(aa64_mte_insn_reg, s) || index == 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ size = 3;
+ set_tag = true;
+ } else {
+ size = 2 + extract32(opc, 1, 1);
+ is_signed = extract32(opc, 0, 1);
+ if (!is_load && is_signed) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ switch (index) {
+ case 1: /* post-index */
+ postindex = true;
+ wback = true;
+ break;
+ case 0:
+ /* signed offset with "non-temporal" hint. Since we don't emulate
+ * caches we don't care about hints to the cache system about
+ * data access patterns, and handle this identically to plain
+ * signed offset.
+ */
+ if (is_signed) {
+ /* There is no non-temporal-hint version of LDPSW */
+ unallocated_encoding(s);
+ return;
+ }
+ postindex = false;
+ break;
+ case 2: /* signed offset, rn not updated */
+ postindex = false;
+ break;
+ case 3: /* pre-index */
+ postindex = false;
+ wback = true;
+ break;
+ }
+
+ if (is_vector && !fp_access_check(s)) {
+ return;
+ }
+
+ offset <<= (set_tag ? LOG2_TAG_GRANULE : size);
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+ if (!postindex) {
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
+ }
+
+ if (set_tag) {
+ if (!s->ata) {
+ /*
+ * TODO: We could rely on the stores below, at least for
+ * system mode, if we arrange to add MO_ALIGN_16.
+ */
+ gen_helper_stg_stub(cpu_env, dirty_addr);
+ } else if (tb_cflags(s->base.tb) & CF_PARALLEL) {
+ gen_helper_stg_parallel(cpu_env, dirty_addr, dirty_addr);
+ } else {
+ gen_helper_stg(cpu_env, dirty_addr, dirty_addr);
+ }
+ }
+
+ clean_addr = gen_mte_checkN(s, dirty_addr, !is_load,
+ (wback || rn != 31) && !set_tag, 2 << size);
+
+ if (is_vector) {
+ if (is_load) {
+ do_fp_ld(s, rt, clean_addr, size);
+ } else {
+ do_fp_st(s, rt, clean_addr, size);
+ }
+ tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size);
+ if (is_load) {
+ do_fp_ld(s, rt2, clean_addr, size);
+ } else {
+ do_fp_st(s, rt2, clean_addr, size);
+ }
+ } else {
+ TCGv_i64 tcg_rt = cpu_reg(s, rt);
+ TCGv_i64 tcg_rt2 = cpu_reg(s, rt2);
+
+ if (is_load) {
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ /* Do not modify tcg_rt before recognizing any exception
+ * from the second load.
+ */
+ do_gpr_ld(s, tmp, clean_addr, size + is_signed * MO_SIGN,
+ false, false, 0, false, false);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size);
+ do_gpr_ld(s, tcg_rt2, clean_addr, size + is_signed * MO_SIGN,
+ false, false, 0, false, false);
+
+ tcg_gen_mov_i64(tcg_rt, tmp);
+ tcg_temp_free_i64(tmp);
+ } else {
+ do_gpr_st(s, tcg_rt, clean_addr, size,
+ false, 0, false, false);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size);
+ do_gpr_st(s, tcg_rt2, clean_addr, size,
+ false, 0, false, false);
+ }
+ }
+
+ if (wback) {
+ if (postindex) {
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
+ }
+ tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr);
+ }
+}
+
+/*
+ * Load/store (immediate post-indexed)
+ * Load/store (immediate pre-indexed)
+ * Load/store (unscaled immediate)
+ *
+ * 31 30 29 27 26 25 24 23 22 21 20 12 11 10 9 5 4 0
+ * +----+-------+---+-----+-----+---+--------+-----+------+------+
+ * |size| 1 1 1 | V | 0 0 | opc | 0 | imm9 | idx | Rn | Rt |
+ * +----+-------+---+-----+-----+---+--------+-----+------+------+
+ *
+ * idx = 01 -> post-indexed, 11 pre-indexed, 00 unscaled imm. (no writeback)
+ * 10 -> unprivileged
+ * V = 0 -> non-vector
+ * size: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64bit
+ * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32
+ */
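+/*
+ * Decode example (illustrative): LDR X0, [X1], #8 (post-indexed) has
+ * size=11, opc=01, idx=01 and imm9=8: the access uses the unmodified
+ * base address and X1 is incremented by 8 afterwards.
+ */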
+static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn,
+ int opc,
+ int size,
+ int rt,
+ bool is_vector)
+{
+ int rn = extract32(insn, 5, 5);
+ int imm9 = sextract32(insn, 12, 9);
+ int idx = extract32(insn, 10, 2);
+ bool is_signed = false;
+ bool is_store = false;
+ bool is_extended = false;
+ bool is_unpriv = (idx == 2);
+ bool iss_valid;
+ bool post_index;
+ bool writeback;
+ int memidx;
+
+ TCGv_i64 clean_addr, dirty_addr;
+
+ if (is_vector) {
+ size |= (opc & 2) << 1;
+ if (size > 4 || is_unpriv) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = ((opc & 1) == 0);
+ if (!fp_access_check(s)) {
+ return;
+ }
+ } else {
+ if (size == 3 && opc == 2) {
+ /* PRFM - prefetch */
+ if (idx != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ return;
+ }
+ if (opc == 3 && size > 1) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = (opc == 0);
+ is_signed = extract32(opc, 1, 1);
+ is_extended = (size < 3) && extract32(opc, 0, 1);
+ }
+
+ switch (idx) {
+ case 0:
+ case 2:
+ post_index = false;
+ writeback = false;
+ break;
+ case 1:
+ post_index = true;
+ writeback = true;
+ break;
+ case 3:
+ post_index = false;
+ writeback = true;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ iss_valid = !is_vector && !writeback;
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+ if (!post_index) {
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9);
+ }
+
+ memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s);
+ clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store,
+ writeback || rn != 31,
+ size, is_unpriv, memidx);
+
+ if (is_vector) {
+ if (is_store) {
+ do_fp_st(s, rt, clean_addr, size);
+ } else {
+ do_fp_ld(s, rt, clean_addr, size);
+ }
+ } else {
+ TCGv_i64 tcg_rt = cpu_reg(s, rt);
+ bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc);
+
+ if (is_store) {
+ do_gpr_st_memidx(s, tcg_rt, clean_addr, size, memidx,
+ iss_valid, rt, iss_sf, false);
+ } else {
+ do_gpr_ld_memidx(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN,
+ is_extended, memidx,
+ iss_valid, rt, iss_sf, false);
+ }
+ }
+
+ if (writeback) {
+ TCGv_i64 tcg_rn = cpu_reg_sp(s, rn);
+ if (post_index) {
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9);
+ }
+ tcg_gen_mov_i64(tcg_rn, dirty_addr);
+ }
+}
+
+/*
+ * Load/store (register offset)
+ *
+ * 31 30 29 27 26 25 24 23 22 21 20 16 15 13 12 11 10 9 5 4 0
+ * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+
+ * |size| 1 1 1 | V | 0 0 | opc | 1 | Rm | opt | S| 1 0 | Rn | Rt |
+ * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+
+ *
+ * For non-vector:
+ * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit
+ * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32
+ * For vector:
+ * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated
+ * opc<0>: 0 -> store, 1 -> load
+ * V: 1 -> vector/simd
+ * opt: extend encoding (see DecodeRegExtend)
+ * S: if S=1 then scale (essentially index by sizeof(size))
+ * Rt: register to transfer into/out of
+ * Rn: address register or SP for base
+ * Rm: offset register or ZR for offset
+ */
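+/*
+ * Decode example (illustrative): LDR X0, [X1, X2, LSL #3] has size=11,
+ * opc=01, opt=011 (UXTX/LSL) and S=1, so the offset register is
+ * shifted left by size (3) before being added to the base.
+ */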
+static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn,
+ int opc,
+ int size,
+ int rt,
+ bool is_vector)
+{
+ int rn = extract32(insn, 5, 5);
+ int shift = extract32(insn, 12, 1);
+ int rm = extract32(insn, 16, 5);
+ int opt = extract32(insn, 13, 3);
+ bool is_signed = false;
+ bool is_store = false;
+ bool is_extended = false;
+
+ TCGv_i64 tcg_rm, clean_addr, dirty_addr;
+
+ if (extract32(opt, 1, 1) == 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (is_vector) {
+ size |= (opc & 2) << 1;
+ if (size > 4) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = !extract32(opc, 0, 1);
+ if (!fp_access_check(s)) {
+ return;
+ }
+ } else {
+ if (size == 3 && opc == 2) {
+ /* PRFM - prefetch */
+ return;
+ }
+ if (opc == 3 && size > 1) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = (opc == 0);
+ is_signed = extract32(opc, 1, 1);
+ is_extended = (size < 3) && extract32(opc, 0, 1);
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+
+ tcg_rm = read_cpu_reg(s, rm, 1);
+ ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0);
+
+ tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm);
+ clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size);
+
+ if (is_vector) {
+ if (is_store) {
+ do_fp_st(s, rt, clean_addr, size);
+ } else {
+ do_fp_ld(s, rt, clean_addr, size);
+ }
+ } else {
+ TCGv_i64 tcg_rt = cpu_reg(s, rt);
+ bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc);
+ if (is_store) {
+ do_gpr_st(s, tcg_rt, clean_addr, size,
+ true, rt, iss_sf, false);
+ } else {
+ do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN,
+ is_extended, true, rt, iss_sf, false);
+ }
+ }
+}
+
+/*
+ * Load/store (unsigned immediate)
+ *
+ * 31 30 29 27 26 25 24 23 22 21 10 9 5
+ * +----+-------+---+-----+-----+------------+-------+------+
+ * |size| 1 1 1 | V | 0 1 | opc | imm12 | Rn | Rt |
+ * +----+-------+---+-----+-----+------------+-------+------+
+ *
+ * For non-vector:
+ * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit
+ * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32
+ * For vector:
+ * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated
+ * opc<0>: 0 -> store, 1 -> load
+ * Rn: base address register (inc SP)
+ * Rt: target register
+ */
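+/*
+ * Decode example (illustrative): LDR X0, [X1, #32] encodes imm12=4;
+ * the byte offset below is imm12 << size = 4 << 3 = 32.
+ */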
+static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn,
+ int opc,
+ int size,
+ int rt,
+ bool is_vector)
+{
+ int rn = extract32(insn, 5, 5);
+ unsigned int imm12 = extract32(insn, 10, 12);
+ unsigned int offset;
+
+ TCGv_i64 clean_addr, dirty_addr;
+
+ bool is_store;
+ bool is_signed = false;
+ bool is_extended = false;
+
+ if (is_vector) {
+ size |= (opc & 2) << 1;
+ if (size > 4) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = !extract32(opc, 0, 1);
+ if (!fp_access_check(s)) {
+ return;
+ }
+ } else {
+ if (size == 3 && opc == 2) {
+ /* PRFM - prefetch */
+ return;
+ }
+ if (opc == 3 && size > 1) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_store = (opc == 0);
+ is_signed = extract32(opc, 1, 1);
+ is_extended = (size < 3) && extract32(opc, 0, 1);
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+ offset = imm12 << size;
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
+ clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size);
+
+ if (is_vector) {
+ if (is_store) {
+ do_fp_st(s, rt, clean_addr, size);
+ } else {
+ do_fp_ld(s, rt, clean_addr, size);
+ }
+ } else {
+ TCGv_i64 tcg_rt = cpu_reg(s, rt);
+ bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc);
+ if (is_store) {
+ do_gpr_st(s, tcg_rt, clean_addr, size,
+ true, rt, iss_sf, false);
+ } else {
+ do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN,
+ is_extended, true, rt, iss_sf, false);
+ }
+ }
+}
+
+/* Atomic memory operations
+ *
+ * 31 30 27 26 24 22 21 16 15 12 10 5 0
+ * +------+-------+---+-----+-----+---+----+----+-----+-----+----+-----+
+ * | size | 1 1 1 | V | 0 0 | A R | 1 | Rs | o3 | opc | 0 0 | Rn | Rt |
+ * +------+-------+---+-----+-----+--------+----+-----+-----+----+-----+
+ *
+ * Rt: the result register
+ * Rn: base address or SP
+ * Rs: the source register for the operation
+ * V: vector flag (always 0 as of v8.3)
+ * A: acquire flag
+ * R: release flag
+ */
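+/*
+ * Decode example (illustrative): LDADD W1, W2, [X0] has size=10, A=0,
+ * R=0 and o3:opc=0000, so fn below is tcg_gen_atomic_fetch_add_i64;
+ * the acquire/release forms differ only in the A/R bits, which the
+ * full-barrier TCG primitives already cover.
+ */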
+static void disas_ldst_atomic(DisasContext *s, uint32_t insn,
+ int size, int rt, bool is_vector)
+{
+ int rs = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int o3_opc = extract32(insn, 12, 4);
+ bool r = extract32(insn, 22, 1);
+ bool a = extract32(insn, 23, 1);
+ TCGv_i64 tcg_rs, tcg_rt, clean_addr;
+ AtomicThreeOpFn *fn = NULL;
+ MemOp mop = s->be_data | size | MO_ALIGN;
+
+ if (is_vector || !dc_isar_feature(aa64_atomics, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ switch (o3_opc) {
+ case 000: /* LDADD */
+ fn = tcg_gen_atomic_fetch_add_i64;
+ break;
+ case 001: /* LDCLR */
+ fn = tcg_gen_atomic_fetch_and_i64;
+ break;
+ case 002: /* LDEOR */
+ fn = tcg_gen_atomic_fetch_xor_i64;
+ break;
+ case 003: /* LDSET */
+ fn = tcg_gen_atomic_fetch_or_i64;
+ break;
+ case 004: /* LDSMAX */
+ fn = tcg_gen_atomic_fetch_smax_i64;
+ mop |= MO_SIGN;
+ break;
+ case 005: /* LDSMIN */
+ fn = tcg_gen_atomic_fetch_smin_i64;
+ mop |= MO_SIGN;
+ break;
+ case 006: /* LDUMAX */
+ fn = tcg_gen_atomic_fetch_umax_i64;
+ break;
+ case 007: /* LDUMIN */
+ fn = tcg_gen_atomic_fetch_umin_i64;
+ break;
+ case 010: /* SWP */
+ fn = tcg_gen_atomic_xchg_i64;
+ break;
+ case 014: /* LDAPR, LDAPRH, LDAPRB */
+ if (!dc_isar_feature(aa64_rcpc_8_3, s) ||
+ rs != 31 || a != 1 || r != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size);
+
+ if (o3_opc == 014) {
+ /*
+ * LDAPR* are a special case because they are a simple load, not a
+ * fetch-and-do-something op.
+ * The architectural consistency requirements here are weaker than
+ * full load-acquire (we only need "load-acquire processor consistent"),
+ * but we choose to implement them as full LDAQ.
+ */
+ do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false,
+ true, rt, disas_ldst_compute_iss_sf(size, false, 0), true);
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ return;
+ }
+
+ tcg_rs = read_cpu_reg(s, rs, true);
+ tcg_rt = cpu_reg(s, rt);
+
+ if (o3_opc == 1) { /* LDCLR */
+ tcg_gen_not_i64(tcg_rs, tcg_rs);
+ }
+
+ /* The tcg atomic primitives are all full barriers. Therefore we
+ * can ignore the Acquire and Release bits of this instruction.
+ */
+ fn(tcg_rt, clean_addr, tcg_rs, get_mem_index(s), mop);
+
+ if ((mop & MO_SIGN) && size != MO_64) {
+ tcg_gen_ext32u_i64(tcg_rt, tcg_rt);
+ }
+}
+
+/*
+ * PAC memory operations
+ *
+ * 31 30 27 26 24 22 21 12 11 10 5 0
+ * +------+-------+---+-----+-----+---+--------+---+---+----+-----+
+ * | size | 1 1 1 | V | 0 0 | M S | 1 | imm9 | W | 1 | Rn | Rt |
+ * +------+-------+---+-----+-----+---+--------+---+---+----+-----+
+ *
+ * Rt: the result register
+ * Rn: base address or SP
+ * V: vector flag (always 0 as of v8.3)
+ * M: clear for key DA, set for key DB
+ * W: pre-indexing flag
+ * S: sign for imm9.
+ */
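+/*
+ * Decode example (illustrative): LDRAA X0, [X1, #-16]! (key DA, with
+ * writeback) has M=0, W=1 and S:imm9 = -2; the 10-bit S:imm9 is
+ * sign-extended and scaled by 8 below, giving the -16 byte offset.
+ */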
+static void disas_ldst_pac(DisasContext *s, uint32_t insn,
+ int size, int rt, bool is_vector)
+{
+ int rn = extract32(insn, 5, 5);
+ bool is_wback = extract32(insn, 11, 1);
+ bool use_key_a = !extract32(insn, 23, 1);
+ int offset;
+ TCGv_i64 clean_addr, dirty_addr, tcg_rt;
+
+ if (size != 3 || is_vector || !dc_isar_feature(aa64_pauth, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+
+ if (s->pauth_active) {
+ if (use_key_a) {
+ gen_helper_autda(dirty_addr, cpu_env, dirty_addr,
+ new_tmp_a64_zero(s));
+ } else {
+ gen_helper_autdb(dirty_addr, cpu_env, dirty_addr,
+ new_tmp_a64_zero(s));
+ }
+ }
+
+ /* Form the 10-bit signed, scaled offset. */
+ offset = (extract32(insn, 22, 1) << 9) | extract32(insn, 12, 9);
+ offset = sextract32(offset << size, 0, 10 + size);
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
+
+ /* Note that "clean" and "dirty" here refer to TBI not PAC. */
+ clean_addr = gen_mte_check1(s, dirty_addr, false,
+ is_wback || rn != 31, size);
+
+ tcg_rt = cpu_reg(s, rt);
+ do_gpr_ld(s, tcg_rt, clean_addr, size,
+ /* extend */ false, /* iss_valid */ !is_wback,
+ /* iss_srt */ rt, /* iss_sf */ true, /* iss_ar */ false);
+
+ if (is_wback) {
+ tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr);
+ }
+}
+
+/*
+ * LDAPR/STLR (unscaled immediate)
+ *
+ * 31 30 24 22 21 12 10 5 0
+ * +------+-------------+-----+---+--------+-----+----+-----+
+ * | size | 0 1 1 0 0 1 | opc | 0 | imm9 | 0 0 | Rn | Rt |
+ * +------+-------------+-----+---+--------+-----+----+-----+
+ *
+ * Rt: source or destination register
+ * Rn: base register
+ * imm9: unscaled immediate offset
+ * opc: 00: STLUR*, 01/10/11: various LDAPUR*
+ * size: size of load/store
+ */
+static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int offset = sextract32(insn, 12, 9);
+ int opc = extract32(insn, 22, 2);
+ int size = extract32(insn, 30, 2);
+ TCGv_i64 clean_addr, dirty_addr;
+ bool is_store = false;
+ bool extend = false;
+ bool iss_sf;
+ MemOp mop;
+
+ if (!dc_isar_feature(aa64_rcpc_8_4, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* TODO: ARMv8.4-LSE SCTLR.nAA */
+ mop = size | MO_ALIGN;
+
+ switch (opc) {
+ case 0: /* STLURB */
+ is_store = true;
+ break;
+ case 1: /* LDAPUR* */
+ break;
+ case 2: /* LDAPURS* 64-bit variant */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ mop |= MO_SIGN;
+ break;
+ case 3: /* LDAPURS* 32-bit variant */
+ if (size > 1) {
+ unallocated_encoding(s);
+ return;
+ }
+ mop |= MO_SIGN;
+ extend = true; /* zero-extend 32->64 after signed load */
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ iss_sf = disas_ldst_compute_iss_sf(size, (mop & MO_SIGN) != 0, opc);
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ dirty_addr = read_cpu_reg_sp(s, rn, 1);
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
+ clean_addr = clean_data_tbi(s, dirty_addr);
+
+ if (is_store) {
+ /* Store-Release semantics */
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ do_gpr_st(s, cpu_reg(s, rt), clean_addr, mop, true, rt, iss_sf, true);
+ } else {
+ /*
+ * Load-AcquirePC semantics; we implement as the slightly more
+ * restrictive Load-Acquire.
+ */
+ do_gpr_ld(s, cpu_reg(s, rt), clean_addr, mop,
+ extend, true, rt, iss_sf, true);
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ }
+}
+
+/* Load/store register (all forms) */
+static void disas_ldst_reg(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int opc = extract32(insn, 22, 2);
+ bool is_vector = extract32(insn, 26, 1);
+ int size = extract32(insn, 30, 2);
+
+ switch (extract32(insn, 24, 2)) {
+ case 0:
+ if (extract32(insn, 21, 1) == 0) {
+ /* Load/store register (unscaled immediate)
+ * Load/store immediate pre/post-indexed
+ * Load/store register unprivileged
+ */
+ disas_ldst_reg_imm9(s, insn, opc, size, rt, is_vector);
+ return;
+ }
+ switch (extract32(insn, 10, 2)) {
+ case 0:
+ disas_ldst_atomic(s, insn, size, rt, is_vector);
+ return;
+ case 2:
+ disas_ldst_reg_roffset(s, insn, opc, size, rt, is_vector);
+ return;
+ default:
+ disas_ldst_pac(s, insn, size, rt, is_vector);
+ return;
+ }
+ break;
+ case 1:
+ disas_ldst_reg_unsigned_imm(s, insn, opc, size, rt, is_vector);
+ return;
+ }
+ unallocated_encoding(s);
+}
+
+/* AdvSIMD load/store multiple structures
+ *
+ * 31 30 29 23 22 21 16 15 12 11 10 9 5 4 0
+ * +---+---+---------------+---+-------------+--------+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size | Rn | Rt |
+ * +---+---+---------------+---+-------------+--------+------+------+------+
+ *
+ * AdvSIMD load/store multiple structures (post-indexed)
+ *
+ * 31 30 29 23 22 21 20 16 15 12 11 10 9 5 4 0
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 | Rm | opcode | size | Rn | Rt |
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
+ *
+ * Rt: first (or only) SIMD&FP register to be transferred
+ * Rn: base address or SP
+ * Rm (post-index only): post-index register (when !31) or size dependent #imm
+ */
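+/*
+ * Decode example (illustrative): LD1 {v0.16b-v3.16b}, [x0] has Q=1,
+ * L=1 and opcode=0010 (four registers, one element per structure), so
+ * rpt=4, selem=1 and total = 4 * 16 = 64 bytes are transferred.
+ */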
+static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 10, 2);
+ int opcode = extract32(insn, 12, 4);
+ bool is_store = !extract32(insn, 22, 1);
+ bool is_postidx = extract32(insn, 23, 1);
+ bool is_q = extract32(insn, 30, 1);
+ TCGv_i64 clean_addr, tcg_rn, tcg_ebytes;
+ MemOp endian, align, mop;
+
+ int total; /* total bytes */
+ int elements; /* elements per vector */
+ int rpt; /* num iterations */
+ int selem; /* structure elements */
+ int r;
+
+ if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!is_postidx && rm != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* From the shared decode logic */
+ switch (opcode) {
+ case 0x0:
+ rpt = 1;
+ selem = 4;
+ break;
+ case 0x2:
+ rpt = 4;
+ selem = 1;
+ break;
+ case 0x4:
+ rpt = 1;
+ selem = 3;
+ break;
+ case 0x6:
+ rpt = 3;
+ selem = 1;
+ break;
+ case 0x7:
+ rpt = 1;
+ selem = 1;
+ break;
+ case 0x8:
+ rpt = 1;
+ selem = 2;
+ break;
+ case 0xa:
+ rpt = 2;
+ selem = 1;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size == 3 && !is_q && selem != 1) {
+ /* reserved */
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ /* For our purposes, bytes are always little-endian. */
+ endian = s->be_data;
+ if (size == 0) {
+ endian = MO_LE;
+ }
+
+ total = rpt * selem * (is_q ? 16 : 8);
+ tcg_rn = cpu_reg_sp(s, rn);
+
+ /*
+ * Issue the MTE check vs the logical repeat count, before we
+ * promote consecutive little-endian elements below.
+ */
+ clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31,
+ total);
+
+ /*
+ * Consecutive little-endian elements from a single register
+ * can be promoted to a larger little-endian operation.
+ */
+ align = MO_ALIGN;
+ if (selem == 1 && endian == MO_LE) {
+ align = pow2_align(size);
+ size = 3;
+ }
+ if (!s->align_mem) {
+ align = 0;
+ }
+ mop = endian | size | align;
+
+ elements = (is_q ? 16 : 8) >> size;
+ tcg_ebytes = tcg_constant_i64(1 << size);
+ for (r = 0; r < rpt; r++) {
+ int e;
+ for (e = 0; e < elements; e++) {
+ int xs;
+ for (xs = 0; xs < selem; xs++) {
+ int tt = (rt + r + xs) % 32;
+ if (is_store) {
+ do_vec_st(s, tt, e, clean_addr, mop);
+ } else {
+ do_vec_ld(s, tt, e, clean_addr, mop);
+ }
+ tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes);
+ }
+ }
+ }
+
+ if (!is_store) {
+ /* For non-quad operations, setting a slice of the low
+ * 64 bits of the register clears the high 64 bits (in
+ * the ARM ARM pseudocode this is implicit in the fact
+ * that 'rval' is a 64 bit wide variable).
+ * For quad operations, we might still need to zero the
+ * high bits of SVE.
+ */
+ for (r = 0; r < rpt * selem; r++) {
+ int tt = (rt + r) % 32;
+ clear_vec_high(s, is_q, tt);
+ }
+ }
+
+ if (is_postidx) {
+ if (rm == 31) {
+ tcg_gen_addi_i64(tcg_rn, tcg_rn, total);
+ } else {
+ tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
+ }
+ }
+}
+
+/* AdvSIMD load/store single structure
+ *
+ * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 1 0 | L R | 0 0 0 0 0 | opc | S | size | Rn | Rt |
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ *
+ * AdvSIMD load/store single structure (post-indexed)
+ *
+ * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 1 1 | L R | Rm | opc | S | size | Rn | Rt |
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ *
+ * Rt: first (or only) SIMD&FP register to be transferred
+ * Rn: base address or SP
+ * Rm (post-index only): post-index register (when !31) or size dependent #imm
+ * index = encoded in Q:S:size dependent on size
+ *
+ * lane_size = encoded in R, opc
+ * transfer width = encoded in opc, S, size
+ */
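+/*
+ * Decode example (illustrative): LD1R {v0.4s}, [x0] has R=0, opc=110,
+ * S=0 and size=10; scale (opc<2:1>) is 3, which selects the replicate
+ * path below with selem=1 and a single 32-bit load broadcast to every
+ * lane.
+ */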
+static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 10, 2);
+ int S = extract32(insn, 12, 1);
+ int opc = extract32(insn, 13, 3);
+ int R = extract32(insn, 21, 1);
+ int is_load = extract32(insn, 22, 1);
+ int is_postidx = extract32(insn, 23, 1);
+ int is_q = extract32(insn, 30, 1);
+
+ int scale = extract32(opc, 1, 2);
+ int selem = (extract32(opc, 0, 1) << 1 | R) + 1;
+ bool replicate = false;
+ int index = is_q << 3 | S << 2 | size;
+ int xs, total;
+ TCGv_i64 clean_addr, tcg_rn, tcg_ebytes;
+ MemOp mop;
+
+ if (extract32(insn, 31, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!is_postidx && rm != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (scale) {
+ case 3:
+ if (!is_load || S) {
+ unallocated_encoding(s);
+ return;
+ }
+ scale = size;
+ replicate = true;
+ break;
+ case 0:
+ break;
+ case 1:
+ if (extract32(size, 0, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+ index >>= 1;
+ break;
+ case 2:
+ if (extract32(size, 1, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!extract32(size, 0, 1)) {
+ index >>= 2;
+ } else {
+ if (S) {
+ unallocated_encoding(s);
+ return;
+ }
+ index >>= 3;
+ scale = 3;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ total = selem << scale;
+ tcg_rn = cpu_reg_sp(s, rn);
+
+ clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31,
+ total);
+ mop = finalize_memop(s, scale);
+
+ tcg_ebytes = tcg_constant_i64(1 << scale);
+ for (xs = 0; xs < selem; xs++) {
+ if (replicate) {
+ /* Load and replicate to all elements */
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ tcg_gen_qemu_ld_i64(tcg_tmp, clean_addr, get_mem_index(s), mop);
+ tcg_gen_gvec_dup_i64(scale, vec_full_reg_offset(s, rt),
+ (is_q + 1) * 8, vec_full_reg_size(s),
+ tcg_tmp);
+ tcg_temp_free_i64(tcg_tmp);
+ } else {
+ /* Load/store one element per register */
+ if (is_load) {
+ do_vec_ld(s, rt, index, clean_addr, mop);
+ } else {
+ do_vec_st(s, rt, index, clean_addr, mop);
+ }
+ }
+ tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes);
+ rt = (rt + 1) % 32;
+ }
+
+ if (is_postidx) {
+ if (rm == 31) {
+ tcg_gen_addi_i64(tcg_rn, tcg_rn, total);
+ } else {
+ tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
+ }
+ }
+}
+
+/*
+ * Load/Store memory tags
+ *
+ * 31 30 29 24 22 21 12 10 5 0
+ * +-----+-------------+-----+---+------+-----+------+------+
+ * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 | Rn | Rt |
+ * +-----+-------------+-----+---+------+-----+------+------+
+ */
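+/*
+ * Decode example (illustrative): STG X0, [X1, #16]! (pre-indexed) has
+ * op1=00, op2=11 and imm9=1; imm9 is scaled by the 16-byte tag
+ * granule, and op2 - 2 yields the post/signed/pre index selector
+ * used below.
+ */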
+static void disas_ldst_tag(DisasContext *s, uint32_t insn)
+{
+ int rt = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE;
+ int op2 = extract32(insn, 10, 2);
+ int op1 = extract32(insn, 22, 2);
+ bool is_load = false, is_pair = false, is_zero = false, is_mult = false;
+ int index = 0;
+ TCGv_i64 addr, clean_addr, tcg_rt;
+
+ /* We checked insn bits [29:24,21] in the caller. */
+ if (extract32(insn, 30, 2) != 3) {
+ goto do_unallocated;
+ }
+
+ /*
+ * @index is a tri-state variable which has 3 states:
+ * < 0 : post-index, writeback
+ * = 0 : signed offset
+ * > 0 : pre-index, writeback
+ */
+ switch (op1) {
+ case 0:
+ if (op2 != 0) {
+ /* STG */
+ index = op2 - 2;
+ } else {
+ /* STZGM */
+ if (s->current_el == 0 || offset != 0) {
+ goto do_unallocated;
+ }
+ is_mult = is_zero = true;
+ }
+ break;
+ case 1:
+ if (op2 != 0) {
+ /* STZG */
+ is_zero = true;
+ index = op2 - 2;
+ } else {
+ /* LDG */
+ is_load = true;
+ }
+ break;
+ case 2:
+ if (op2 != 0) {
+ /* ST2G */
+ is_pair = true;
+ index = op2 - 2;
+ } else {
+ /* STGM */
+ if (s->current_el == 0 || offset != 0) {
+ goto do_unallocated;
+ }
+ is_mult = true;
+ }
+ break;
+ case 3:
+ if (op2 != 0) {
+ /* STZ2G */
+ is_pair = is_zero = true;
+ index = op2 - 2;
+ } else {
+ /* LDGM */
+ if (s->current_el == 0 || offset != 0) {
+ goto do_unallocated;
+ }
+ is_mult = is_load = true;
+ }
+ break;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (is_mult
+ ? !dc_isar_feature(aa64_mte, s)
+ : !dc_isar_feature(aa64_mte_insn_reg, s)) {
+ goto do_unallocated;
+ }
+
+ if (rn == 31) {
+ gen_check_sp_alignment(s);
+ }
+
+ addr = read_cpu_reg_sp(s, rn, true);
+ if (index >= 0) {
+ /* pre-index or signed offset */
+ tcg_gen_addi_i64(addr, addr, offset);
+ }
+
+ if (is_mult) {
+ tcg_rt = cpu_reg(s, rt);
+
+ if (is_zero) {
+ int size = 4 << s->dcz_blocksize;
+
+ if (s->ata) {
+ gen_helper_stzgm_tags(cpu_env, addr, tcg_rt);
+ }
+ /*
+ * The non-tags portion of STZGM is mostly like DC_ZVA,
+ * except the alignment happens before the access.
+ */
+ clean_addr = clean_data_tbi(s, addr);
+ tcg_gen_andi_i64(clean_addr, clean_addr, -size);
+ gen_helper_dc_zva(cpu_env, clean_addr);
+ } else if (s->ata) {
+ if (is_load) {
+ gen_helper_ldgm(tcg_rt, cpu_env, addr);
+ } else {
+ gen_helper_stgm(cpu_env, addr, tcg_rt);
+ }
+ } else {
+ MMUAccessType acc = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
+ int size = 4 << GMID_EL1_BS;
+
+ clean_addr = clean_data_tbi(s, addr);
+ tcg_gen_andi_i64(clean_addr, clean_addr, -size);
+ gen_probe_access(s, clean_addr, acc, size);
+
+ if (is_load) {
+ /* The result tags are zeros. */
+ tcg_gen_movi_i64(tcg_rt, 0);
+ }
+ }
+ return;
+ }
+
+ if (is_load) {
+ tcg_gen_andi_i64(addr, addr, -TAG_GRANULE);
+ tcg_rt = cpu_reg(s, rt);
+ if (s->ata) {
+ gen_helper_ldg(tcg_rt, cpu_env, addr, tcg_rt);
+ } else {
+ clean_addr = clean_data_tbi(s, addr);
+ gen_probe_access(s, clean_addr, MMU_DATA_LOAD, MO_8);
+ gen_address_with_allocation_tag0(tcg_rt, addr);
+ }
+ } else {
+ tcg_rt = cpu_reg_sp(s, rt);
+ if (!s->ata) {
+ /*
+ * For STG and ST2G, we need to check alignment and probe memory.
+ * TODO: For STZG and STZ2G, we could rely on the stores below,
+ * at least for system mode; user-only won't enforce alignment.
+ */
+ if (is_pair) {
+ gen_helper_st2g_stub(cpu_env, addr);
+ } else {
+ gen_helper_stg_stub(cpu_env, addr);
+ }
+ } else if (tb_cflags(s->base.tb) & CF_PARALLEL) {
+ if (is_pair) {
+ gen_helper_st2g_parallel(cpu_env, addr, tcg_rt);
+ } else {
+ gen_helper_stg_parallel(cpu_env, addr, tcg_rt);
+ }
+ } else {
+ if (is_pair) {
+ gen_helper_st2g(cpu_env, addr, tcg_rt);
+ } else {
+ gen_helper_stg(cpu_env, addr, tcg_rt);
+ }
+ }
+ }
+
+ if (is_zero) {
+ TCGv_i64 clean_addr = clean_data_tbi(s, addr);
+ TCGv_i64 tcg_zero = tcg_constant_i64(0);
+ int mem_index = get_mem_index(s);
+ int i, n = (1 + is_pair) << LOG2_TAG_GRANULE;
+
+ tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index,
+ MO_UQ | MO_ALIGN_16);
+ for (i = 8; i < n; i += 8) {
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_UQ);
+ }
+ }
+
+ if (index != 0) {
+ /* pre-index or post-index */
+ if (index < 0) {
+ /* post-index */
+ tcg_gen_addi_i64(addr, addr, offset);
+ }
+ tcg_gen_mov_i64(cpu_reg_sp(s, rn), addr);
+ }
+}
+
+/* Loads and stores */
+static void disas_ldst(DisasContext *s, uint32_t insn)
+{
+ switch (extract32(insn, 24, 6)) {
+ case 0x08: /* Load/store exclusive */
+ disas_ldst_excl(s, insn);
+ break;
+ case 0x18: case 0x1c: /* Load register (literal) */
+ disas_ld_lit(s, insn);
+ break;
+ case 0x28: case 0x29:
+ case 0x2c: case 0x2d: /* Load/store pair (all forms) */
+ disas_ldst_pair(s, insn);
+ break;
+ case 0x38: case 0x39:
+ case 0x3c: case 0x3d: /* Load/store register (all forms) */
+ disas_ldst_reg(s, insn);
+ break;
+ case 0x0c: /* AdvSIMD load/store multiple structures */
+ disas_ldst_multiple_struct(s, insn);
+ break;
+ case 0x0d: /* AdvSIMD load/store single structure */
+ disas_ldst_single_struct(s, insn);
+ break;
+ case 0x19:
+ if (extract32(insn, 21, 1) != 0) {
+ disas_ldst_tag(s, insn);
+ } else if (extract32(insn, 10, 2) == 0) {
+ disas_ldst_ldapr_stlr(s, insn);
+ } else {
+ unallocated_encoding(s);
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* PC-rel. addressing
+ * 31 30 29 28 24 23 5 4 0
+ * +----+-------+-----------+-------------------+------+
+ * | op | immlo | 1 0 0 0 0 | immhi | Rd |
+ * +----+-------+-----------+-------------------+------+
+ */
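+/*
+ * Worked example (illustrative): ADRP X0, <target> shifts the
+ * immediate left by 12 and subtracts the low 12 bits of the current
+ * PC, so the final result is (PC & ~0xfff) + (imm << 12).
+ */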
+static void disas_pc_rel_adr(DisasContext *s, uint32_t insn)
+{
+ unsigned int page, rd;
+ int64_t offset;
+
+ page = extract32(insn, 31, 1);
+ /* SignExtend(immhi:immlo) -> offset */
+ offset = sextract64(insn, 5, 19);
+ offset = offset << 2 | extract32(insn, 29, 2);
+ rd = extract32(insn, 0, 5);
+
+ if (page) {
+ /* ADRP (page based) */
+ offset <<= 12;
+ /* The page offset is ok for TARGET_TB_PCREL. */
+ offset -= s->pc_curr & 0xfff;
+ }
+
+ gen_pc_plus_diff(s, cpu_reg(s, rd), offset);
+}
+
+/*
+ * Add/subtract (immediate)
+ *
+ * 31 30 29 28 23 22 21 10 9 5 4 0
+ * +--+--+--+-------------+--+-------------+-----+-----+
+ * |sf|op| S| 1 0 0 0 1 0 |sh| imm12 | Rn | Rd |
+ * +--+--+--+-------------+--+-------------+-----+-----+
+ *
+ * sf: 0 -> 32bit, 1 -> 64bit
+ * op: 0 -> add , 1 -> sub
+ * S: 1 -> set flags
+ * sh: 1 -> LSL imm by 12
+ */
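+/*
+ * Decode example (illustrative): ADD X0, X1, #1, LSL #12 has sf=1,
+ * op=0, S=0, sh=1 and imm12=1, so the effective immediate below is
+ * 0x1000.
+ */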
+static void disas_add_sub_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ uint64_t imm = extract32(insn, 10, 12);
+ bool shift = extract32(insn, 22, 1);
+ bool setflags = extract32(insn, 29, 1);
+ bool sub_op = extract32(insn, 30, 1);
+ bool is_64bit = extract32(insn, 31, 1);
+
+ TCGv_i64 tcg_rn = cpu_reg_sp(s, rn);
+ TCGv_i64 tcg_rd = setflags ? cpu_reg(s, rd) : cpu_reg_sp(s, rd);
+ TCGv_i64 tcg_result;
+
+ if (shift) {
+ imm <<= 12;
+ }
+
+ tcg_result = tcg_temp_new_i64();
+ if (!setflags) {
+ if (sub_op) {
+ tcg_gen_subi_i64(tcg_result, tcg_rn, imm);
+ } else {
+ tcg_gen_addi_i64(tcg_result, tcg_rn, imm);
+ }
+ } else {
+ TCGv_i64 tcg_imm = tcg_constant_i64(imm);
+ if (sub_op) {
+ gen_sub_CC(is_64bit, tcg_result, tcg_rn, tcg_imm);
+ } else {
+ gen_add_CC(is_64bit, tcg_result, tcg_rn, tcg_imm);
+ }
+ }
+
+ if (is_64bit) {
+ tcg_gen_mov_i64(tcg_rd, tcg_result);
+ } else {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_result);
+ }
+
+ tcg_temp_free_i64(tcg_result);
+}
+
+/*
+ * Add/subtract (immediate, with tags)
+ *
+ * 31 30 29 28 23 22 21 16 14 10 9 5 4 0
+ * +--+--+--+-------------+--+---------+--+-------+-----+-----+
+ * |sf|op| S| 1 0 0 0 1 1 |o2| uimm6 |o3| uimm4 | Rn | Rd |
+ * +--+--+--+-------------+--+---------+--+-------+-----+-----+
+ *
+ * op: 0 -> add, 1 -> sub
+ */
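+/*
+ * Decode example (illustrative): ADDG X0, X1, #32, #3 has uimm6=2 and
+ * uimm4=3; the byte offset below is uimm6 << LOG2_TAG_GRANULE (i.e. 32,
+ * in 16-byte granules) and uimm4 is the tag offset passed to the helper.
+ */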
+static void disas_add_sub_imm_with_tags(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int uimm4 = extract32(insn, 10, 4);
+ int uimm6 = extract32(insn, 16, 6);
+ bool sub_op = extract32(insn, 30, 1);
+ TCGv_i64 tcg_rn, tcg_rd;
+ int imm;
+
+ /* Test all of sf=1, S=0, o2=0, o3=0. */
+ if ((insn & 0xa040c000u) != 0x80000000u ||
+ !dc_isar_feature(aa64_mte_insn_reg, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ imm = uimm6 << LOG2_TAG_GRANULE;
+ if (sub_op) {
+ imm = -imm;
+ }
+
+ tcg_rn = cpu_reg_sp(s, rn);
+ tcg_rd = cpu_reg_sp(s, rd);
+
+ if (s->ata) {
+ gen_helper_addsubg(tcg_rd, cpu_env, tcg_rn,
+ tcg_constant_i32(imm),
+ tcg_constant_i32(uimm4));
+ } else {
+ tcg_gen_addi_i64(tcg_rd, tcg_rn, imm);
+ gen_address_with_allocation_tag0(tcg_rd, tcg_rd);
+ }
+}
+
+/* The input should be a value in the bottom e bits (with higher
+ * bits zero); returns that value replicated into every element
+ * of size e in a 64 bit integer.
+ */
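+/* For example, bitfield_replicate(0x3, 4) returns 0x3333333333333333. */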
+static uint64_t bitfield_replicate(uint64_t mask, unsigned int e)
+{
+ assert(e != 0);
+ while (e < 64) {
+ mask |= mask << e;
+ e *= 2;
+ }
+ return mask;
+}
+
+/* Return a value with the bottom len bits set (where 0 < len <= 64) */
+static inline uint64_t bitmask64(unsigned int length)
+{
+ assert(length > 0 && length <= 64);
+ return ~0ULL >> (64 - length);
+}
+
+/* Simplified variant of pseudocode DecodeBitMasks() for the case where we
+ * only require the wmask. Returns false if the imms/immr/immn are a reserved
+ * value (ie should cause a guest UNDEF exception), and true if they are
+ * valid, in which case the decoded bit pattern is written to result.
+ */
+bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn,
+ unsigned int imms, unsigned int immr)
+{
+ uint64_t mask;
+ unsigned e, levels, s, r;
+ int len;
+
+ assert(immn < 2 && imms < 64 && immr < 64);
+
+ /* The bit patterns we create here are 64 bit patterns which
+ * are vectors of identical elements of size e = 2, 4, 8, 16, 32 or
+ * 64 bits each. Each element contains the same value: a run
+ * of between 1 and e-1 non-zero bits, rotated within the
+ * element by between 0 and e-1 bits.
+ *
+ * The element size and run length are encoded into immn (1 bit)
+ * and imms (6 bits) as follows:
+ * 64 bit elements: immn = 1, imms = <length of run - 1>
+ * 32 bit elements: immn = 0, imms = 0 : <length of run - 1>
+ * 16 bit elements: immn = 0, imms = 10 : <length of run - 1>
+ * 8 bit elements: immn = 0, imms = 110 : <length of run - 1>
+ * 4 bit elements: immn = 0, imms = 1110 : <length of run - 1>
+ * 2 bit elements: immn = 0, imms = 11110 : <length of run - 1>
+ * Notice that immn = 0, imms = 11111x is the only combination
+ * not covered by one of the above options; this is reserved.
+ * Further, <length of run - 1> all-ones is a reserved pattern.
+ *
+ * In all cases the rotation is by immr % e (and immr is 6 bits).
+ */
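+
+ /*
+ * Worked example (illustrative): immn = 0, imms = 0b111100, immr = 0
+ * gives len = 1, e = 2, s = 0, r = 0: each 2-bit element is a single
+ * set bit, so the replicated result is 0x5555555555555555.
+ */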
+
+ /* First determine the element size */
+ len = 31 - clz32((immn << 6) | (~imms & 0x3f));
+ if (len < 1) {
+ /* This is the immn == 0, imms == 11111x (reserved) case */
+ return false;
+ }
+ e = 1 << len;
+
+ levels = e - 1;
+ s = imms & levels;
+ r = immr & levels;
+
+ if (s == levels) {
+ /* <length of run - 1> mustn't be all-ones. */
+ return false;
+ }
+
+ /* Create the value of one element: s+1 set bits rotated
+ * by r within the element (which is e bits wide)...
+ */
+ mask = bitmask64(s + 1);
+ if (r) {
+ mask = (mask >> r) | (mask << (e - r));
+ mask &= bitmask64(e);
+ }
+ /* ...then replicate the element over the whole 64 bit value */
+ mask = bitfield_replicate(mask, e);
+ *result = mask;
+ return true;
+}
+
+/* Logical (immediate)
+ * 31 30 29 28 23 22 21 16 15 10 9 5 4 0
+ * +----+-----+-------------+---+------+------+------+------+
+ * | sf | opc | 1 0 0 1 0 0 | N | immr | imms | Rn | Rd |
+ * +----+-----+-------------+---+------+------+------+------+
+ */
+static void disas_logic_imm(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, opc, is_n, immr, imms, rn, rd;
+ TCGv_i64 tcg_rd, tcg_rn;
+ uint64_t wmask;
+ bool is_and = false;
+
+ sf = extract32(insn, 31, 1);
+ opc = extract32(insn, 29, 2);
+ is_n = extract32(insn, 22, 1);
+ immr = extract32(insn, 16, 6);
+ imms = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ if (!sf && is_n) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (opc == 0x3) { /* ANDS */
+ tcg_rd = cpu_reg(s, rd);
+ } else {
+ tcg_rd = cpu_reg_sp(s, rd);
+ }
+ tcg_rn = cpu_reg(s, rn);
+
+ if (!logic_imm_decode_wmask(&wmask, is_n, imms, immr)) {
+ /* some immediate field values are reserved */
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!sf) {
+ wmask &= 0xffffffff;
+ }
+
+ switch (opc) {
+ case 0x3: /* ANDS */
+ case 0x0: /* AND */
+ tcg_gen_andi_i64(tcg_rd, tcg_rn, wmask);
+ is_and = true;
+ break;
+ case 0x1: /* ORR */
+ tcg_gen_ori_i64(tcg_rd, tcg_rn, wmask);
+ break;
+ case 0x2: /* EOR */
+ tcg_gen_xori_i64(tcg_rd, tcg_rn, wmask);
+ break;
+ default:
+ assert(FALSE); /* must handle all above */
+ break;
+ }
+
+ if (!sf && !is_and) {
+ /* zero extend final result; we know we can skip this for AND
+ * since the immediate had the high 32 bits clear.
+ */
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+
+ if (opc == 3) { /* ANDS */
+ gen_logic_CC(sf, tcg_rd);
+ }
+}
+
+/*
+ * Move wide (immediate)
+ *
+ * 31 30 29 28 23 22 21 20 5 4 0
+ * +--+-----+-------------+-----+----------------+------+
+ * |sf| opc | 1 0 0 1 0 1 | hw | imm16 | Rd |
+ * +--+-----+-------------+-----+----------------+------+
+ *
+ * sf: 0 -> 32 bit, 1 -> 64 bit
+ * opc: 00 -> N, 10 -> Z, 11 -> K
+ * hw: shift/16 (0 or 16; for sf=1 also 32 or 48)
+ */
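+/*
+ * Decode example (illustrative): MOVZ X0, #0x1234, LSL #16 has opc=10,
+ * hw=01 and imm16=0x1234, producing 0x12340000 in X0; MOVK with the
+ * same fields would instead deposit 0x1234 into bits [31:16] and keep
+ * the rest of X0.
+ */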
+static void disas_movw_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ uint64_t imm = extract32(insn, 5, 16);
+ int sf = extract32(insn, 31, 1);
+ int opc = extract32(insn, 29, 2);
+ int pos = extract32(insn, 21, 2) << 4;
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+
+ if (!sf && (pos >= 32)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opc) {
+ case 0: /* MOVN */
+ case 2: /* MOVZ */
+ imm <<= pos;
+ if (opc == 0) {
+ imm = ~imm;
+ }
+ if (!sf) {
+ imm &= 0xffffffffu;
+ }
+ tcg_gen_movi_i64(tcg_rd, imm);
+ break;
+ case 3: /* MOVK */
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_constant_i64(imm), pos, 16);
+ if (!sf) {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* Bitfield
+ * 31 30 29 28 23 22 21 16 15 10 9 5 4 0
+ * +----+-----+-------------+---+------+------+------+------+
+ * | sf | opc | 1 0 0 1 1 0 | N | immr | imms | Rn | Rd |
+ * +----+-----+-------------+---+------+------+------+------+
+ */
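+/*
+ * Decode example (illustrative): UBFX X0, X1, #8, #4 is the UBFM alias
+ * with opc=10, immr=8 and imms=11; since si >= ri the code below
+ * reduces it to tcg_gen_extract_i64 of len=4 bits starting at bit 8.
+ */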
+static void disas_bitfield(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, n, opc, ri, si, rn, rd, bitsize, pos, len;
+ TCGv_i64 tcg_rd, tcg_tmp;
+
+ sf = extract32(insn, 31, 1);
+ opc = extract32(insn, 29, 2);
+ n = extract32(insn, 22, 1);
+ ri = extract32(insn, 16, 6);
+ si = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+ bitsize = sf ? 64 : 32;
+
+ if (sf != n || ri >= bitsize || si >= bitsize || opc > 2) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rd = cpu_reg(s, rd);
+
+ /* Suppress the zero-extend for !sf. Since RI and SI are constrained
+ to be smaller than bitsize, we'll never reference data outside the
+ low 32-bits anyway. */
+ tcg_tmp = read_cpu_reg(s, rn, 1);
+
+ /* Recognize simple(r) extractions. */
+ if (si >= ri) {
+ /* Wd<s-r:0> = Wn<s:r> */
+ len = (si - ri) + 1;
+ if (opc == 0) { /* SBFM: ASR, SBFX, SXTB, SXTH, SXTW */
+ tcg_gen_sextract_i64(tcg_rd, tcg_tmp, ri, len);
+ goto done;
+ } else if (opc == 2) { /* UBFM: UBFX, LSR, UXTB, UXTH */
+ tcg_gen_extract_i64(tcg_rd, tcg_tmp, ri, len);
+ return;
+ }
+ /* opc == 1, BFXIL fall through to deposit */
+ tcg_gen_shri_i64(tcg_tmp, tcg_tmp, ri);
+ pos = 0;
+ } else {
+ /* Handle the ri > si case with a deposit
+ * Wd<32+s-r,32-r> = Wn<s:0>
+ */
+ len = si + 1;
+ pos = (bitsize - ri) & (bitsize - 1);
+ }
+
+ if (opc == 0 && len < ri) {
+ /* SBFM: sign extend the destination field from len to fill
+ the balance of the word. Let the deposit below insert all
+ of those sign bits. */
+ tcg_gen_sextract_i64(tcg_tmp, tcg_tmp, 0, len);
+ len = ri;
+ }
+
+ if (opc == 1) { /* BFM, BFXIL */
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, pos, len);
+ } else {
+ /* SBFM or UBFM: We start with zero, and we haven't modified
+ any bits outside bitsize, therefore the zero-extension
+ below is unneeded. */
+ tcg_gen_deposit_z_i64(tcg_rd, tcg_tmp, pos, len);
+ return;
+ }
+
+ done:
+ if (!sf) { /* zero extend final result */
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+}
+
+/* Extract
+ * 31 30 29 28 23 22 21 20 16 15 10 9 5 4 0
+ * +----+------+-------------+---+----+------+--------+------+------+
+ * | sf | op21 | 1 0 0 1 1 1 | N | o0 | Rm | imms | Rn | Rd |
+ * +----+------+-------------+---+----+------+--------+------+------+
+ */
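+/*
+ * Decode example (illustrative): ROR X0, X1, #8 is an alias of
+ * EXTR X0, X1, X1, #8; with rm == rn the 32-bit path below uses a
+ * plain rotate, while the 64-bit path relies on tcg_gen_extract2_i64,
+ * which specializes to a rotate when both inputs are the same.
+ */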
+static void disas_extract(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, n, rm, imm, rn, rd, bitsize, op21, op0;
+
+ sf = extract32(insn, 31, 1);
+ n = extract32(insn, 22, 1);
+ rm = extract32(insn, 16, 5);
+ imm = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+ op21 = extract32(insn, 29, 2);
+ op0 = extract32(insn, 21, 1);
+ bitsize = sf ? 64 : 32;
+
+ if (sf != n || op21 || op0 || imm >= bitsize) {
+ unallocated_encoding(s);
+ } else {
+ TCGv_i64 tcg_rd, tcg_rm, tcg_rn;
+
+ tcg_rd = cpu_reg(s, rd);
+
+ if (unlikely(imm == 0)) {
+ /* tcg shl_i32/shl_i64 is undefined for 32/64 bit shifts,
+ * so an extract from bit 0 is a special case.
+ */
+ if (sf) {
+ tcg_gen_mov_i64(tcg_rd, cpu_reg(s, rm));
+ } else {
+ tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rm));
+ }
+ } else {
+ tcg_rm = cpu_reg(s, rm);
+ tcg_rn = cpu_reg(s, rn);
+
+ if (sf) {
+ /* Specialization to ROR happens in EXTRACT2. */
+ tcg_gen_extract2_i64(tcg_rd, tcg_rm, tcg_rn, imm);
+ } else {
+ TCGv_i32 t0 = tcg_temp_new_i32();
+
+ tcg_gen_extrl_i64_i32(t0, tcg_rm);
+ if (rm == rn) {
+ tcg_gen_rotri_i32(t0, t0, imm);
+ } else {
+ TCGv_i32 t1 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t1, tcg_rn);
+ tcg_gen_extract2_i32(t0, t0, t1, imm);
+ tcg_temp_free_i32(t1);
+ }
+ tcg_gen_extu_i32_i64(tcg_rd, t0);
+ tcg_temp_free_i32(t0);
+ }
+ }
+ }
+}
+
+/* Data processing - immediate */
+static void disas_data_proc_imm(DisasContext *s, uint32_t insn)
+{
+ switch (extract32(insn, 23, 6)) {
+ case 0x20: case 0x21: /* PC-rel. addressing */
+ disas_pc_rel_adr(s, insn);
+ break;
+ case 0x22: /* Add/subtract (immediate) */
+ disas_add_sub_imm(s, insn);
+ break;
+ case 0x23: /* Add/subtract (immediate, with tags) */
+ disas_add_sub_imm_with_tags(s, insn);
+ break;
+ case 0x24: /* Logical (immediate) */
+ disas_logic_imm(s, insn);
+ break;
+ case 0x25: /* Move wide (immediate) */
+ disas_movw_imm(s, insn);
+ break;
+ case 0x26: /* Bitfield */
+ disas_bitfield(s, insn);
+ break;
+ case 0x27: /* Extract */
+ disas_extract(s, insn);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* Shift a TCGv src by TCGv shift_amount, put result in dst.
+ * Note that it is the caller's responsibility to ensure that the
+ * shift amount is in range (i.e. 0..31 or 0..63) and provide the
+ * ARM-mandated semantics for out-of-range shifts.
+ */
+static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf,
+ enum a64_shift_type shift_type, TCGv_i64 shift_amount)
+{
+ switch (shift_type) {
+ case A64_SHIFT_TYPE_LSL:
+ tcg_gen_shl_i64(dst, src, shift_amount);
+ break;
+ case A64_SHIFT_TYPE_LSR:
+ tcg_gen_shr_i64(dst, src, shift_amount);
+ break;
+ case A64_SHIFT_TYPE_ASR:
+ if (!sf) {
+ tcg_gen_ext32s_i64(dst, src);
+ }
+ tcg_gen_sar_i64(dst, sf ? src : dst, shift_amount);
+ break;
+ case A64_SHIFT_TYPE_ROR:
+ if (sf) {
+ tcg_gen_rotr_i64(dst, src, shift_amount);
+ } else {
+ TCGv_i32 t0, t1;
+ t0 = tcg_temp_new_i32();
+ t1 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t0, src);
+ tcg_gen_extrl_i64_i32(t1, shift_amount);
+ tcg_gen_rotr_i32(t0, t0, t1);
+ tcg_gen_extu_i32_i64(dst, t0);
+ tcg_temp_free_i32(t0);
+ tcg_temp_free_i32(t1);
+ }
+ break;
+ default:
+        g_assert_not_reached(); /* all shift types should be handled */
+ break;
+ }
+
+ if (!sf) { /* zero extend final result */
+ tcg_gen_ext32u_i64(dst, dst);
+ }
+}
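+
+/*
+ * Example of the 32-bit ASR handling above: with sf == 0 and
+ * src = 0x80000000, the ext32s produces 0xffffffff80000000, the 64-bit
+ * arithmetic shift by 4 gives 0xfffffffff8000000, and the final
+ * zero-extension leaves 0xf8000000 in the low 32 bits, matching the
+ * W-register result.  As noted above, shift_amount must already be in
+ * range; handle_shift_reg() below masks it with 31 or 63 first.
+ */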
+
+/* Shift a TCGv src by immediate, put result in dst.
+ * The shift amount must be in range (this should always be true as the
+ * relevant instructions will UNDEF on bad shift immediates).
+ */
+static void shift_reg_imm(TCGv_i64 dst, TCGv_i64 src, int sf,
+ enum a64_shift_type shift_type, unsigned int shift_i)
+{
+ assert(shift_i < (sf ? 64 : 32));
+
+ if (shift_i == 0) {
+ tcg_gen_mov_i64(dst, src);
+ } else {
+ shift_reg(dst, src, sf, shift_type, tcg_constant_i64(shift_i));
+ }
+}
+
+/* Logical (shifted register)
+ * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0
+ * +----+-----+-----------+-------+---+------+--------+------+------+
+ * | sf | opc | 0 1 0 1 0 | shift | N | Rm | imm6 | Rn | Rd |
+ * +----+-----+-----------+-------+---+------+--------+------+------+
+ */
+static void disas_logic_reg(DisasContext *s, uint32_t insn)
+{
+ TCGv_i64 tcg_rd, tcg_rn, tcg_rm;
+ unsigned int sf, opc, shift_type, invert, rm, shift_amount, rn, rd;
+
+ sf = extract32(insn, 31, 1);
+ opc = extract32(insn, 29, 2);
+ shift_type = extract32(insn, 22, 2);
+ invert = extract32(insn, 21, 1);
+ rm = extract32(insn, 16, 5);
+ shift_amount = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ if (!sf && (shift_amount & (1 << 5))) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rd = cpu_reg(s, rd);
+
+ if (opc == 1 && shift_amount == 0 && shift_type == 0 && rn == 31) {
+ /* Unshifted ORR and ORN with WZR/XZR is the standard encoding for
+ * register-register MOV and MVN, so it is worth special casing.
+ */
+ tcg_rm = cpu_reg(s, rm);
+ if (invert) {
+ tcg_gen_not_i64(tcg_rd, tcg_rm);
+ if (!sf) {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+ } else {
+ if (sf) {
+ tcg_gen_mov_i64(tcg_rd, tcg_rm);
+ } else {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rm);
+ }
+ }
+ return;
+ }
+
+ tcg_rm = read_cpu_reg(s, rm, sf);
+
+ if (shift_amount) {
+ shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, shift_amount);
+ }
+
+ tcg_rn = cpu_reg(s, rn);
+
+ switch (opc | (invert << 2)) {
+ case 0: /* AND */
+ case 3: /* ANDS */
+ tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 1: /* ORR */
+ tcg_gen_or_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 2: /* EOR */
+ tcg_gen_xor_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 4: /* BIC */
+ case 7: /* BICS */
+ tcg_gen_andc_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 5: /* ORN */
+ tcg_gen_orc_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 6: /* EON */
+ tcg_gen_eqv_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ default:
+        g_assert_not_reached();
+ break;
+ }
+
+ if (!sf) {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+
+ if (opc == 3) {
+ gen_logic_CC(sf, tcg_rd);
+ }
+}
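+
+/*
+ * The opc | (invert << 2) switch above covers the eight A64 logical
+ * (shifted register) forms: 0 AND, 1 ORR, 2 EOR, 3 ANDS, 4 BIC, 5 ORN,
+ * 6 EON, 7 BICS.  For example, MOV X0, X1 assembles as ORR X0, XZR, X1
+ * (opc == 1, shift == 0, rn == 31) and so takes the special-cased mov
+ * path, while MVN W0, W1 is ORN W0, WZR, W1 and takes the
+ * not-plus-zero-extend path.
+ */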
+
+/*
+ * Add/subtract (extended register)
+ *
+ * 31|30|29|28 24|23 22|21|20 16|15 13|12 10|9 5|4 0|
+ * +--+--+--+-----------+-----+--+-------+------+------+----+----+
+ * |sf|op| S| 0 1 0 1 1 | opt | 1| Rm |option| imm3 | Rn | Rd |
+ * +--+--+--+-----------+-----+--+-------+------+------+----+----+
+ *
+ * sf: 0 -> 32bit, 1 -> 64bit
+ * op: 0 -> add , 1 -> sub
+ * S: 1 -> set flags
+ * opt: 00
+ * option: extension type (see DecodeRegExtend)
+ * imm3: optional shift to Rm
+ *
+ * Rd = Rn + LSL(extend(Rm), amount)
+ */
+static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int imm3 = extract32(insn, 10, 3);
+ int option = extract32(insn, 13, 3);
+ int rm = extract32(insn, 16, 5);
+ int opt = extract32(insn, 22, 2);
+ bool setflags = extract32(insn, 29, 1);
+ bool sub_op = extract32(insn, 30, 1);
+ bool sf = extract32(insn, 31, 1);
+
+ TCGv_i64 tcg_rm, tcg_rn; /* temps */
+ TCGv_i64 tcg_rd;
+ TCGv_i64 tcg_result;
+
+ if (imm3 > 4 || opt != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* non-flag setting ops may use SP */
+ if (!setflags) {
+ tcg_rd = cpu_reg_sp(s, rd);
+ } else {
+ tcg_rd = cpu_reg(s, rd);
+ }
+ tcg_rn = read_cpu_reg_sp(s, rn, sf);
+
+ tcg_rm = read_cpu_reg(s, rm, sf);
+ ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3);
+
+ tcg_result = tcg_temp_new_i64();
+
+ if (!setflags) {
+ if (sub_op) {
+ tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm);
+ } else {
+ tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm);
+ }
+ } else {
+ if (sub_op) {
+ gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm);
+ } else {
+ gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm);
+ }
+ }
+
+ if (sf) {
+ tcg_gen_mov_i64(tcg_rd, tcg_result);
+ } else {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_result);
+ }
+
+ tcg_temp_free_i64(tcg_result);
+}
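+
+/*
+ * Example for the extended-register form above: ADD X0, SP, W1, UXTW #2
+ * has option == 0b010 (UXTW) and imm3 == 2, so ext_and_shift_reg()
+ * zero-extends the low 32 bits of X1 and shifts them left by two before
+ * the add.  Because this encoding reads Rn with read_cpu_reg_sp(), SP is
+ * a valid base here, unlike the shifted-register form below.
+ */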
+
+/*
+ * Add/subtract (shifted register)
+ *
+ * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0
+ * +--+--+--+-----------+-----+--+-------+---------+------+------+
+ * |sf|op| S| 0 1 0 1 1 |shift| 0| Rm | imm6 | Rn | Rd |
+ * +--+--+--+-----------+-----+--+-------+---------+------+------+
+ *
+ * sf: 0 -> 32bit, 1 -> 64bit
+ * op: 0 -> add , 1 -> sub
+ * S: 1 -> set flags
+ * shift: 00 -> LSL, 01 -> LSR, 10 -> ASR, 11 -> RESERVED
+ * imm6: Shift amount to apply to Rm before the add/sub
+ */
+static void disas_add_sub_reg(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int imm6 = extract32(insn, 10, 6);
+ int rm = extract32(insn, 16, 5);
+ int shift_type = extract32(insn, 22, 2);
+ bool setflags = extract32(insn, 29, 1);
+ bool sub_op = extract32(insn, 30, 1);
+ bool sf = extract32(insn, 31, 1);
+
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+ TCGv_i64 tcg_rn, tcg_rm;
+ TCGv_i64 tcg_result;
+
+ if ((shift_type == 3) || (!sf && (imm6 > 31))) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rn = read_cpu_reg(s, rn, sf);
+ tcg_rm = read_cpu_reg(s, rm, sf);
+
+ shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, imm6);
+
+ tcg_result = tcg_temp_new_i64();
+
+ if (!setflags) {
+ if (sub_op) {
+ tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm);
+ } else {
+ tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm);
+ }
+ } else {
+ if (sub_op) {
+ gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm);
+ } else {
+ gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm);
+ }
+ }
+
+ if (sf) {
+ tcg_gen_mov_i64(tcg_rd, tcg_result);
+ } else {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_result);
+ }
+
+ tcg_temp_free_i64(tcg_result);
+}
+
+/* Data-processing (3 source)
+ *
+ * 31 30 29 28 24 23 21 20 16 15 14 10 9 5 4 0
+ * +--+------+-----------+------+------+----+------+------+------+
+ * |sf| op54 | 1 1 0 1 1 | op31 | Rm | o0 | Ra | Rn | Rd |
+ * +--+------+-----------+------+------+----+------+------+------+
+ */
+static void disas_data_proc_3src(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int ra = extract32(insn, 10, 5);
+ int rm = extract32(insn, 16, 5);
+ int op_id = (extract32(insn, 29, 3) << 4) |
+ (extract32(insn, 21, 3) << 1) |
+ extract32(insn, 15, 1);
+ bool sf = extract32(insn, 31, 1);
+ bool is_sub = extract32(op_id, 0, 1);
+ bool is_high = extract32(op_id, 2, 1);
+ bool is_signed = false;
+ TCGv_i64 tcg_op1;
+ TCGv_i64 tcg_op2;
+ TCGv_i64 tcg_tmp;
+
+ /* Note that op_id is sf:op54:op31:o0 so it includes the 32/64 size flag */
+ switch (op_id) {
+ case 0x42: /* SMADDL */
+ case 0x43: /* SMSUBL */
+ case 0x44: /* SMULH */
+ is_signed = true;
+ break;
+ case 0x0: /* MADD (32bit) */
+ case 0x1: /* MSUB (32bit) */
+ case 0x40: /* MADD (64bit) */
+ case 0x41: /* MSUB (64bit) */
+ case 0x4a: /* UMADDL */
+ case 0x4b: /* UMSUBL */
+ case 0x4c: /* UMULH */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (is_high) {
+ TCGv_i64 low_bits = tcg_temp_new_i64(); /* low bits discarded */
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+ TCGv_i64 tcg_rn = cpu_reg(s, rn);
+ TCGv_i64 tcg_rm = cpu_reg(s, rm);
+
+ if (is_signed) {
+ tcg_gen_muls2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm);
+ } else {
+ tcg_gen_mulu2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm);
+ }
+
+ tcg_temp_free_i64(low_bits);
+ return;
+ }
+
+ tcg_op1 = tcg_temp_new_i64();
+ tcg_op2 = tcg_temp_new_i64();
+ tcg_tmp = tcg_temp_new_i64();
+
+ if (op_id < 0x42) {
+ tcg_gen_mov_i64(tcg_op1, cpu_reg(s, rn));
+ tcg_gen_mov_i64(tcg_op2, cpu_reg(s, rm));
+ } else {
+ if (is_signed) {
+ tcg_gen_ext32s_i64(tcg_op1, cpu_reg(s, rn));
+ tcg_gen_ext32s_i64(tcg_op2, cpu_reg(s, rm));
+ } else {
+ tcg_gen_ext32u_i64(tcg_op1, cpu_reg(s, rn));
+ tcg_gen_ext32u_i64(tcg_op2, cpu_reg(s, rm));
+ }
+ }
+
+ if (ra == 31 && !is_sub) {
+ /* Special-case MADD with rA == XZR; it is the standard MUL alias */
+ tcg_gen_mul_i64(cpu_reg(s, rd), tcg_op1, tcg_op2);
+ } else {
+ tcg_gen_mul_i64(tcg_tmp, tcg_op1, tcg_op2);
+ if (is_sub) {
+ tcg_gen_sub_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp);
+ } else {
+ tcg_gen_add_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp);
+ }
+ }
+
+ if (!sf) {
+ tcg_gen_ext32u_i64(cpu_reg(s, rd), cpu_reg(s, rd));
+ }
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_tmp);
+}
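+
+/*
+ * op_id above is laid out as sf:op54:op31:o0.  For example SMULH is
+ * sf = 1, op54 = 00, op31 = 010, o0 = 0, giving 0x44, while
+ * MUL X0, X1, X2 is the alias of MADD X0, X1, X2, XZR (op_id 0x40 with
+ * ra == 31) and takes the single tcg_gen_mul_i64() fast path above.
+ */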
+
+/* Add/subtract (with carry)
+ * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 10 9 5 4 0
+ * +--+--+--+------------------------+------+-------------+------+-----+
+ * |sf|op| S| 1 1 0 1 0 0 0 0 | rm | 0 0 0 0 0 0 | Rn | Rd |
+ * +--+--+--+------------------------+------+-------------+------+-----+
+ */
+
+static void disas_adc_sbc(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, op, setflags, rm, rn, rd;
+ TCGv_i64 tcg_y, tcg_rn, tcg_rd;
+
+ sf = extract32(insn, 31, 1);
+ op = extract32(insn, 30, 1);
+ setflags = extract32(insn, 29, 1);
+ rm = extract32(insn, 16, 5);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ tcg_rd = cpu_reg(s, rd);
+ tcg_rn = cpu_reg(s, rn);
+
+ if (op) {
+ tcg_y = new_tmp_a64(s);
+ tcg_gen_not_i64(tcg_y, cpu_reg(s, rm));
+ } else {
+ tcg_y = cpu_reg(s, rm);
+ }
+
+ if (setflags) {
+ gen_adc_CC(sf, tcg_rd, tcg_rn, tcg_y);
+ } else {
+ gen_adc(sf, tcg_rd, tcg_rn, tcg_y);
+ }
+}
+
+/*
+ * Rotate right into flags
+ * 31 30 29 21 15 10 5 4 0
+ * +--+--+--+-----------------+--------+-----------+------+--+------+
+ * |sf|op| S| 1 1 0 1 0 0 0 0 | imm6 | 0 0 0 0 1 | Rn |o2| mask |
+ * +--+--+--+-----------------+--------+-----------+------+--+------+
+ */
+static void disas_rotate_right_into_flags(DisasContext *s, uint32_t insn)
+{
+ int mask = extract32(insn, 0, 4);
+ int o2 = extract32(insn, 4, 1);
+ int rn = extract32(insn, 5, 5);
+ int imm6 = extract32(insn, 15, 6);
+ int sf_op_s = extract32(insn, 29, 3);
+ TCGv_i64 tcg_rn;
+ TCGv_i32 nzcv;
+
+ if (sf_op_s != 5 || o2 != 0 || !dc_isar_feature(aa64_condm_4, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rn = read_cpu_reg(s, rn, 1);
+ tcg_gen_rotri_i64(tcg_rn, tcg_rn, imm6);
+
+ nzcv = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(nzcv, tcg_rn);
+
+ if (mask & 8) { /* N */
+ tcg_gen_shli_i32(cpu_NF, nzcv, 31 - 3);
+ }
+ if (mask & 4) { /* Z */
+ tcg_gen_not_i32(cpu_ZF, nzcv);
+ tcg_gen_andi_i32(cpu_ZF, cpu_ZF, 4);
+ }
+ if (mask & 2) { /* C */
+ tcg_gen_extract_i32(cpu_CF, nzcv, 1, 1);
+ }
+ if (mask & 1) { /* V */
+ tcg_gen_shli_i32(cpu_VF, nzcv, 31 - 0);
+ }
+
+ tcg_temp_free_i32(nzcv);
+}
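+
+/*
+ * This is the RMIF instruction (FEAT_FlagM / ARMv8.4-CondM): rotate Rn
+ * right by imm6 and copy the low four bits of the result into whichever
+ * of N, Z, C, V are selected by mask.  The flag writes above follow
+ * QEMU's flag representation: N is bit 31 of cpu_NF, Z is set iff
+ * cpu_ZF == 0, C is the value of cpu_CF, and V is bit 31 of cpu_VF,
+ * hence the shifts, the inverted AND for Z and the one-bit extract for C.
+ */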
+
+/*
+ * Evaluate into flags
+ * 31 30 29 21 15 14 10 5 4 0
+ * +--+--+--+-----------------+---------+----+---------+------+--+------+
+ * |sf|op| S| 1 1 0 1 0 0 0 0 | opcode2 | sz | 0 0 1 0 | Rn |o3| mask |
+ * +--+--+--+-----------------+---------+----+---------+------+--+------+
+ */
+static void disas_evaluate_into_flags(DisasContext *s, uint32_t insn)
+{
+ int o3_mask = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int o2 = extract32(insn, 15, 6);
+ int sz = extract32(insn, 14, 1);
+ int sf_op_s = extract32(insn, 29, 3);
+ TCGv_i32 tmp;
+ int shift;
+
+ if (sf_op_s != 1 || o2 != 0 || o3_mask != 0xd ||
+ !dc_isar_feature(aa64_condm_4, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ shift = sz ? 16 : 24; /* SETF16 or SETF8 */
+
+ tmp = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tmp, cpu_reg(s, rn));
+ tcg_gen_shli_i32(cpu_NF, tmp, shift);
+ tcg_gen_shli_i32(cpu_VF, tmp, shift - 1);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_xor_i32(cpu_VF, cpu_VF, cpu_NF);
+ tcg_temp_free_i32(tmp);
+}
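+
+/*
+ * These are SETF8 (sz == 0) and SETF16 (sz == 1), also FEAT_FlagM.  For
+ * SETF8 the shift is 24, so cpu_NF ends up with Rn<7> in bit 31 (N),
+ * cpu_ZF is zero exactly when Rn<7:0> is zero (Z), and the shifted XOR
+ * leaves Rn<8> ^ Rn<7> in bit 31 of cpu_VF (V); C is left unchanged, as
+ * the architecture requires.
+ */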
+
+/* Conditional compare (immediate / register)
+ * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0
+ * +--+--+--+------------------------+--------+------+----+--+------+--+-----+
+ * |sf|op| S| 1 1 0 1 0 0 1 0 |imm5/rm | cond |i/r |o2| Rn |o3|nzcv |
+ * +--+--+--+------------------------+--------+------+----+--+------+--+-----+
+ * [1] y [0] [0]
+ */
+static void disas_cc(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, op, y, cond, rn, nzcv, is_imm;
+ TCGv_i32 tcg_t0, tcg_t1, tcg_t2;
+ TCGv_i64 tcg_tmp, tcg_y, tcg_rn;
+ DisasCompare c;
+
+ if (!extract32(insn, 29, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (insn & (1 << 10 | 1 << 4)) {
+ unallocated_encoding(s);
+ return;
+ }
+ sf = extract32(insn, 31, 1);
+ op = extract32(insn, 30, 1);
+ is_imm = extract32(insn, 11, 1);
+ y = extract32(insn, 16, 5); /* y = rm (reg) or imm5 (imm) */
+ cond = extract32(insn, 12, 4);
+ rn = extract32(insn, 5, 5);
+ nzcv = extract32(insn, 0, 4);
+
+ /* Set T0 = !COND. */
+ tcg_t0 = tcg_temp_new_i32();
+ arm_test_cc(&c, cond);
+ tcg_gen_setcondi_i32(tcg_invert_cond(c.cond), tcg_t0, c.value, 0);
+ arm_free_cc(&c);
+
+ /* Load the arguments for the new comparison. */
+ if (is_imm) {
+ tcg_y = new_tmp_a64(s);
+ tcg_gen_movi_i64(tcg_y, y);
+ } else {
+ tcg_y = cpu_reg(s, y);
+ }
+ tcg_rn = cpu_reg(s, rn);
+
+ /* Set the flags for the new comparison. */
+ tcg_tmp = tcg_temp_new_i64();
+ if (op) {
+ gen_sub_CC(sf, tcg_tmp, tcg_rn, tcg_y);
+ } else {
+ gen_add_CC(sf, tcg_tmp, tcg_rn, tcg_y);
+ }
+ tcg_temp_free_i64(tcg_tmp);
+
+ /* If COND was false, force the flags to #nzcv. Compute two masks
+ * to help with this: T1 = (COND ? 0 : -1), T2 = (COND ? -1 : 0).
+ * For tcg hosts that support ANDC, we can make do with just T1.
+ * In either case, allow the tcg optimizer to delete any unused mask.
+ */
+ tcg_t1 = tcg_temp_new_i32();
+ tcg_t2 = tcg_temp_new_i32();
+ tcg_gen_neg_i32(tcg_t1, tcg_t0);
+ tcg_gen_subi_i32(tcg_t2, tcg_t0, 1);
+
+ if (nzcv & 8) { /* N */
+ tcg_gen_or_i32(cpu_NF, cpu_NF, tcg_t1);
+ } else {
+ if (TCG_TARGET_HAS_andc_i32) {
+ tcg_gen_andc_i32(cpu_NF, cpu_NF, tcg_t1);
+ } else {
+ tcg_gen_and_i32(cpu_NF, cpu_NF, tcg_t2);
+ }
+ }
+ if (nzcv & 4) { /* Z */
+ if (TCG_TARGET_HAS_andc_i32) {
+ tcg_gen_andc_i32(cpu_ZF, cpu_ZF, tcg_t1);
+ } else {
+ tcg_gen_and_i32(cpu_ZF, cpu_ZF, tcg_t2);
+ }
+ } else {
+ tcg_gen_or_i32(cpu_ZF, cpu_ZF, tcg_t0);
+ }
+ if (nzcv & 2) { /* C */
+ tcg_gen_or_i32(cpu_CF, cpu_CF, tcg_t0);
+ } else {
+ if (TCG_TARGET_HAS_andc_i32) {
+ tcg_gen_andc_i32(cpu_CF, cpu_CF, tcg_t1);
+ } else {
+ tcg_gen_and_i32(cpu_CF, cpu_CF, tcg_t2);
+ }
+ }
+ if (nzcv & 1) { /* V */
+ tcg_gen_or_i32(cpu_VF, cpu_VF, tcg_t1);
+ } else {
+ if (TCG_TARGET_HAS_andc_i32) {
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, tcg_t1);
+ } else {
+ tcg_gen_and_i32(cpu_VF, cpu_VF, tcg_t2);
+ }
+ }
+ tcg_temp_free_i32(tcg_t0);
+ tcg_temp_free_i32(tcg_t1);
+ tcg_temp_free_i32(tcg_t2);
+}
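+
+/*
+ * Sketch of the mask trick above: for CCMP X0, X1, #nzcv, EQ the compare
+ * flags are computed unconditionally; T0 is 1 only when EQ does not
+ * hold, making T1 all-ones and T2 zero.  Each or/andc/and below then
+ * either forces the flag to the value requested by #nzcv (condition
+ * false) or leaves the freshly computed compare flags untouched
+ * (condition true), with the T1/T2 pair chosen to suit whether the host
+ * TCG backend has andc.
+ */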
+
+/* Conditional select
+ * 31 30 29 28 21 20 16 15 12 11 10 9 5 4 0
+ * +----+----+---+-----------------+------+------+-----+------+------+
+ * | sf | op | S | 1 1 0 1 0 1 0 0 | Rm | cond | op2 | Rn | Rd |
+ * +----+----+---+-----------------+------+------+-----+------+------+
+ */
+static void disas_cond_select(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, else_inv, rm, cond, else_inc, rn, rd;
+ TCGv_i64 tcg_rd, zero;
+ DisasCompare64 c;
+
+ if (extract32(insn, 29, 1) || extract32(insn, 11, 1)) {
+ /* S == 1 or op2<1> == 1 */
+ unallocated_encoding(s);
+ return;
+ }
+ sf = extract32(insn, 31, 1);
+ else_inv = extract32(insn, 30, 1);
+ rm = extract32(insn, 16, 5);
+ cond = extract32(insn, 12, 4);
+ else_inc = extract32(insn, 10, 1);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ tcg_rd = cpu_reg(s, rd);
+
+ a64_test_cc(&c, cond);
+ zero = tcg_constant_i64(0);
+
+ if (rn == 31 && rm == 31 && (else_inc ^ else_inv)) {
+ /* CSET & CSETM. */
+ tcg_gen_setcond_i64(tcg_invert_cond(c.cond), tcg_rd, c.value, zero);
+ if (else_inv) {
+ tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ }
+ } else {
+ TCGv_i64 t_true = cpu_reg(s, rn);
+ TCGv_i64 t_false = read_cpu_reg(s, rm, 1);
+ if (else_inv && else_inc) {
+ tcg_gen_neg_i64(t_false, t_false);
+ } else if (else_inv) {
+ tcg_gen_not_i64(t_false, t_false);
+ } else if (else_inc) {
+ tcg_gen_addi_i64(t_false, t_false, 1);
+ }
+ tcg_gen_movcond_i64(c.cond, tcg_rd, c.value, zero, t_true, t_false);
+ }
+
+ a64_free_cc(&c);
+
+ if (!sf) {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+}
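+
+/*
+ * The rn == rm == 31 special case above covers the CSET and CSETM
+ * aliases: CSET Wd, <cond> is CSINC Wd, WZR, WZR, invert(<cond>) and
+ * CSETM is the corresponding CSINV, so else_inc ^ else_inv is set and
+ * the inverted setcond yields 1 exactly when the programmer's condition
+ * holds; the extra negate turns that 1 into all-ones for CSETM.
+ */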
+
+static void handle_clz(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_rd, tcg_rn;
+ tcg_rd = cpu_reg(s, rd);
+ tcg_rn = cpu_reg(s, rn);
+
+ if (sf) {
+ tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64);
+ } else {
+ TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
+ tcg_gen_clzi_i32(tcg_tmp32, tcg_tmp32, 32);
+ tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+ tcg_temp_free_i32(tcg_tmp32);
+ }
+}
+
+static void handle_cls(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_rd, tcg_rn;
+ tcg_rd = cpu_reg(s, rd);
+ tcg_rn = cpu_reg(s, rn);
+
+ if (sf) {
+ tcg_gen_clrsb_i64(tcg_rd, tcg_rn);
+ } else {
+ TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
+ tcg_gen_clrsb_i32(tcg_tmp32, tcg_tmp32);
+ tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+ tcg_temp_free_i32(tcg_tmp32);
+ }
+}
+
+static void handle_rbit(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_rd, tcg_rn;
+ tcg_rd = cpu_reg(s, rd);
+ tcg_rn = cpu_reg(s, rn);
+
+ if (sf) {
+ gen_helper_rbit64(tcg_rd, tcg_rn);
+ } else {
+ TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
+ gen_helper_rbit(tcg_tmp32, tcg_tmp32);
+ tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+ tcg_temp_free_i32(tcg_tmp32);
+ }
+}
+
+/* REV with sf==1, opcode==3 ("REV64") */
+static void handle_rev64(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ if (!sf) {
+ unallocated_encoding(s);
+ return;
+ }
+ tcg_gen_bswap64_i64(cpu_reg(s, rd), cpu_reg(s, rn));
+}
+
+/* REV with sf==0, opcode==2
+ * REV32 (sf==1, opcode==2)
+ */
+static void handle_rev32(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+ TCGv_i64 tcg_rn = cpu_reg(s, rn);
+
+ if (sf) {
+ tcg_gen_bswap64_i64(tcg_rd, tcg_rn);
+ tcg_gen_rotri_i64(tcg_rd, tcg_rd, 32);
+ } else {
+ tcg_gen_bswap32_i64(tcg_rd, tcg_rn, TCG_BSWAP_OZ);
+ }
+}
+
+/* REV16 (opcode==1) */
+static void handle_rev16(DisasContext *s, unsigned int sf,
+ unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+ TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
+ TCGv_i64 mask = tcg_constant_i64(sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff);
+
+ tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
+ tcg_gen_and_i64(tcg_rd, tcg_rn, mask);
+ tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask);
+ tcg_gen_shli_i64(tcg_rd, tcg_rd, 8);
+ tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_tmp);
+
+ tcg_temp_free_i64(tcg_tmp);
+}
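+
+/*
+ * REV16 swaps the bytes within each halfword.  Tracing the code above
+ * with the 32-bit value 0xaabbccdd: (x >> 8) & 0x00ff00ff gives
+ * 0x00aa00cc, (x & 0x00ff00ff) << 8 gives 0xbb00dd00, and the final OR
+ * produces 0xbbaaddcc, i.e. each halfword byte-reversed.
+ */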
+
+/* Data-processing (1 source)
+ * 31 30 29 28 21 20 16 15 10 9 5 4 0
+ * +----+---+---+-----------------+---------+--------+------+------+
+ * | sf | 1 | S | 1 1 0 1 0 1 1 0 | opcode2 | opcode | Rn | Rd |
+ * +----+---+---+-----------------+---------+--------+------+------+
+ */
+static void disas_data_proc_1src(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, opcode, opcode2, rn, rd;
+ TCGv_i64 tcg_rd;
+
+ if (extract32(insn, 29, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ sf = extract32(insn, 31, 1);
+ opcode = extract32(insn, 10, 6);
+ opcode2 = extract32(insn, 16, 5);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+#define MAP(SF, O2, O1) ((SF) | (O1 << 1) | (O2 << 7))
+
+ switch (MAP(sf, opcode2, opcode)) {
+ case MAP(0, 0x00, 0x00): /* RBIT */
+ case MAP(1, 0x00, 0x00):
+ handle_rbit(s, sf, rn, rd);
+ break;
+ case MAP(0, 0x00, 0x01): /* REV16 */
+ case MAP(1, 0x00, 0x01):
+ handle_rev16(s, sf, rn, rd);
+ break;
+ case MAP(0, 0x00, 0x02): /* REV/REV32 */
+ case MAP(1, 0x00, 0x02):
+ handle_rev32(s, sf, rn, rd);
+ break;
+ case MAP(1, 0x00, 0x03): /* REV64 */
+ handle_rev64(s, sf, rn, rd);
+ break;
+ case MAP(0, 0x00, 0x04): /* CLZ */
+ case MAP(1, 0x00, 0x04):
+ handle_clz(s, sf, rn, rd);
+ break;
+ case MAP(0, 0x00, 0x05): /* CLS */
+ case MAP(1, 0x00, 0x05):
+ handle_cls(s, sf, rn, rd);
+ break;
+ case MAP(1, 0x01, 0x00): /* PACIA */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x01): /* PACIB */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x02): /* PACDA */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x03): /* PACDB */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x04): /* AUTIA */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x05): /* AUTIB */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x06): /* AUTDA */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x07): /* AUTDB */
+ if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn));
+ } else if (!dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ break;
+ case MAP(1, 0x01, 0x08): /* PACIZA */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x09): /* PACIZB */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0a): /* PACDZA */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0b): /* PACDZB */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0c): /* AUTIZA */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0d): /* AUTIZB */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0e): /* AUTDZA */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x0f): /* AUTDZB */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s));
+ }
+ break;
+ case MAP(1, 0x01, 0x10): /* XPACI */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_xpaci(tcg_rd, cpu_env, tcg_rd);
+ }
+ break;
+ case MAP(1, 0x01, 0x11): /* XPACD */
+ if (!dc_isar_feature(aa64_pauth, s) || rn != 31) {
+ goto do_unallocated;
+ } else if (s->pauth_active) {
+ tcg_rd = cpu_reg(s, rd);
+ gen_helper_xpacd(tcg_rd, cpu_env, tcg_rd);
+ }
+ break;
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ break;
+ }
+
+#undef MAP
+}
+
+static void handle_div(DisasContext *s, bool is_signed, unsigned int sf,
+ unsigned int rm, unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_n, tcg_m, tcg_rd;
+ tcg_rd = cpu_reg(s, rd);
+
+ if (!sf && is_signed) {
+ tcg_n = new_tmp_a64(s);
+ tcg_m = new_tmp_a64(s);
+ tcg_gen_ext32s_i64(tcg_n, cpu_reg(s, rn));
+ tcg_gen_ext32s_i64(tcg_m, cpu_reg(s, rm));
+ } else {
+ tcg_n = read_cpu_reg(s, rn, sf);
+ tcg_m = read_cpu_reg(s, rm, sf);
+ }
+
+ if (is_signed) {
+ gen_helper_sdiv64(tcg_rd, tcg_n, tcg_m);
+ } else {
+ gen_helper_udiv64(tcg_rd, tcg_n, tcg_m);
+ }
+
+ if (!sf) { /* zero extend final result */
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+}
+
+/* LSLV, LSRV, ASRV, RORV */
+static void handle_shift_reg(DisasContext *s,
+ enum a64_shift_type shift_type, unsigned int sf,
+ unsigned int rm, unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_shift = tcg_temp_new_i64();
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+ TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
+
+ tcg_gen_andi_i64(tcg_shift, cpu_reg(s, rm), sf ? 63 : 31);
+ shift_reg(tcg_rd, tcg_rn, sf, shift_type, tcg_shift);
+ tcg_temp_free_i64(tcg_shift);
+}
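+
+/*
+ * The andi above implements the architectural "shift by Rm modulo the
+ * register size" rule for LSLV/LSRV/ASRV/RORV, e.g. LSRV W0, W1, W2 with
+ * W2 == 35 shifts by 35 & 31 == 3; this also keeps the amount within the
+ * range that shift_reg() requires.
+ */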
+
+/* CRC32[BHWX], CRC32C[BHWX] */
+static void handle_crc32(DisasContext *s,
+ unsigned int sf, unsigned int sz, bool crc32c,
+ unsigned int rm, unsigned int rn, unsigned int rd)
+{
+ TCGv_i64 tcg_acc, tcg_val;
+ TCGv_i32 tcg_bytes;
+
+ if (!dc_isar_feature(aa64_crc32, s)
+ || (sf == 1 && sz != 3)
+ || (sf == 0 && sz == 3)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (sz == 3) {
+ tcg_val = cpu_reg(s, rm);
+ } else {
+ uint64_t mask;
+ switch (sz) {
+ case 0:
+ mask = 0xFF;
+ break;
+ case 1:
+ mask = 0xFFFF;
+ break;
+ case 2:
+ mask = 0xFFFFFFFF;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_val = new_tmp_a64(s);
+ tcg_gen_andi_i64(tcg_val, cpu_reg(s, rm), mask);
+ }
+
+ tcg_acc = cpu_reg(s, rn);
+ tcg_bytes = tcg_constant_i32(1 << sz);
+
+ if (crc32c) {
+ gen_helper_crc32c_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes);
+ } else {
+ gen_helper_crc32_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes);
+ }
+}
+
+/* Data-processing (2 source)
+ * 31 30 29 28 21 20 16 15 10 9 5 4 0
+ * +----+---+---+-----------------+------+--------+------+------+
+ * | sf | 0 | S | 1 1 0 1 0 1 1 0 | Rm | opcode | Rn | Rd |
+ * +----+---+---+-----------------+------+--------+------+------+
+ */
+static void disas_data_proc_2src(DisasContext *s, uint32_t insn)
+{
+ unsigned int sf, rm, opcode, rn, rd, setflag;
+ sf = extract32(insn, 31, 1);
+ setflag = extract32(insn, 29, 1);
+ rm = extract32(insn, 16, 5);
+ opcode = extract32(insn, 10, 6);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ if (setflag && opcode != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0: /* SUBP(S) */
+ if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) {
+ goto do_unallocated;
+ } else {
+ TCGv_i64 tcg_n, tcg_m, tcg_d;
+
+ tcg_n = read_cpu_reg_sp(s, rn, true);
+ tcg_m = read_cpu_reg_sp(s, rm, true);
+ tcg_gen_sextract_i64(tcg_n, tcg_n, 0, 56);
+ tcg_gen_sextract_i64(tcg_m, tcg_m, 0, 56);
+ tcg_d = cpu_reg(s, rd);
+
+ if (setflag) {
+ gen_sub_CC(true, tcg_d, tcg_n, tcg_m);
+ } else {
+ tcg_gen_sub_i64(tcg_d, tcg_n, tcg_m);
+ }
+ }
+ break;
+ case 2: /* UDIV */
+ handle_div(s, false, sf, rm, rn, rd);
+ break;
+ case 3: /* SDIV */
+ handle_div(s, true, sf, rm, rn, rd);
+ break;
+ case 4: /* IRG */
+ if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) {
+ goto do_unallocated;
+ }
+ if (s->ata) {
+ gen_helper_irg(cpu_reg_sp(s, rd), cpu_env,
+ cpu_reg_sp(s, rn), cpu_reg(s, rm));
+ } else {
+ gen_address_with_allocation_tag0(cpu_reg_sp(s, rd),
+ cpu_reg_sp(s, rn));
+ }
+ break;
+ case 5: /* GMI */
+ if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) {
+ goto do_unallocated;
+ } else {
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_extract_i64(t, cpu_reg_sp(s, rn), 56, 4);
+ tcg_gen_shl_i64(t, tcg_constant_i64(1), t);
+ tcg_gen_or_i64(cpu_reg(s, rd), cpu_reg(s, rm), t);
+
+ tcg_temp_free_i64(t);
+ }
+ break;
+ case 8: /* LSLV */
+ handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd);
+ break;
+ case 9: /* LSRV */
+ handle_shift_reg(s, A64_SHIFT_TYPE_LSR, sf, rm, rn, rd);
+ break;
+ case 10: /* ASRV */
+ handle_shift_reg(s, A64_SHIFT_TYPE_ASR, sf, rm, rn, rd);
+ break;
+ case 11: /* RORV */
+ handle_shift_reg(s, A64_SHIFT_TYPE_ROR, sf, rm, rn, rd);
+ break;
+ case 12: /* PACGA */
+ if (sf == 0 || !dc_isar_feature(aa64_pauth, s)) {
+ goto do_unallocated;
+ }
+ gen_helper_pacga(cpu_reg(s, rd), cpu_env,
+ cpu_reg(s, rn), cpu_reg_sp(s, rm));
+ break;
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ case 20:
+ case 21:
+ case 22:
+ case 23: /* CRC32 */
+ {
+ int sz = extract32(opcode, 0, 2);
+ bool crc32c = extract32(opcode, 2, 1);
+ handle_crc32(s, sf, sz, crc32c, rm, rn, rd);
+ break;
+ }
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/*
+ * Data processing - register
+ * 31 30 29 28 25 21 20 16 10 0
+ * +--+---+--+---+-------+-----+-------+-------+---------+
+ * | |op0| |op1| 1 0 1 | op2 | | op3 | |
+ * +--+---+--+---+-------+-----+-------+-------+---------+
+ */
+static void disas_data_proc_reg(DisasContext *s, uint32_t insn)
+{
+ int op0 = extract32(insn, 30, 1);
+ int op1 = extract32(insn, 28, 1);
+ int op2 = extract32(insn, 21, 4);
+ int op3 = extract32(insn, 10, 6);
+
+ if (!op1) {
+ if (op2 & 8) {
+ if (op2 & 1) {
+ /* Add/sub (extended register) */
+ disas_add_sub_ext_reg(s, insn);
+ } else {
+ /* Add/sub (shifted register) */
+ disas_add_sub_reg(s, insn);
+ }
+ } else {
+ /* Logical (shifted register) */
+ disas_logic_reg(s, insn);
+ }
+ return;
+ }
+
+ switch (op2) {
+ case 0x0:
+ switch (op3) {
+ case 0x00: /* Add/subtract (with carry) */
+ disas_adc_sbc(s, insn);
+ break;
+
+ case 0x01: /* Rotate right into flags */
+ case 0x21:
+ disas_rotate_right_into_flags(s, insn);
+ break;
+
+ case 0x02: /* Evaluate into flags */
+ case 0x12:
+ case 0x22:
+ case 0x32:
+ disas_evaluate_into_flags(s, insn);
+ break;
+
+ default:
+ goto do_unallocated;
+ }
+ break;
+
+ case 0x2: /* Conditional compare */
+ disas_cc(s, insn); /* both imm and reg forms */
+ break;
+
+ case 0x4: /* Conditional select */
+ disas_cond_select(s, insn);
+ break;
+
+ case 0x6: /* Data-processing */
+ if (op0) { /* (1 source) */
+ disas_data_proc_1src(s, insn);
+ } else { /* (2 source) */
+ disas_data_proc_2src(s, insn);
+ }
+ break;
+ case 0x8 ... 0xf: /* (3 source) */
+ disas_data_proc_3src(s, insn);
+ break;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+static void handle_fp_compare(DisasContext *s, int size,
+ unsigned int rn, unsigned int rm,
+ bool cmp_with_zero, bool signal_all_nans)
+{
+ TCGv_i64 tcg_flags = tcg_temp_new_i64();
+ TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ if (size == MO_64) {
+ TCGv_i64 tcg_vn, tcg_vm;
+
+ tcg_vn = read_fp_dreg(s, rn);
+ if (cmp_with_zero) {
+ tcg_vm = tcg_constant_i64(0);
+ } else {
+ tcg_vm = read_fp_dreg(s, rm);
+ }
+ if (signal_all_nans) {
+ gen_helper_vfp_cmped_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ } else {
+ gen_helper_vfp_cmpd_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ }
+ tcg_temp_free_i64(tcg_vn);
+ tcg_temp_free_i64(tcg_vm);
+ } else {
+ TCGv_i32 tcg_vn = tcg_temp_new_i32();
+ TCGv_i32 tcg_vm = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_vn, rn, 0, size);
+ if (cmp_with_zero) {
+ tcg_gen_movi_i32(tcg_vm, 0);
+ } else {
+ read_vec_element_i32(s, tcg_vm, rm, 0, size);
+ }
+
+ switch (size) {
+ case MO_32:
+ if (signal_all_nans) {
+ gen_helper_vfp_cmpes_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ } else {
+ gen_helper_vfp_cmps_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ }
+ break;
+ case MO_16:
+ if (signal_all_nans) {
+ gen_helper_vfp_cmpeh_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ } else {
+ gen_helper_vfp_cmph_a64(tcg_flags, tcg_vn, tcg_vm, fpst);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_temp_free_i32(tcg_vn);
+ tcg_temp_free_i32(tcg_vm);
+ }
+
+ tcg_temp_free_ptr(fpst);
+
+ gen_set_nzcv(tcg_flags);
+
+ tcg_temp_free_i64(tcg_flags);
+}
+
+/* Floating point compare
+ * 31 30 29 28 24 23 22 21 20 16 15 14 13 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+-----+---------+------+-------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | op | 1 0 0 0 | Rn | op2 |
+ * +---+---+---+-----------+------+---+------+-----+---------+------+-------+
+ */
+static void disas_fp_compare(DisasContext *s, uint32_t insn)
+{
+ unsigned int mos, type, rm, op, rn, opc, op2r;
+ int size;
+
+ mos = extract32(insn, 29, 3);
+ type = extract32(insn, 22, 2);
+ rm = extract32(insn, 16, 5);
+ op = extract32(insn, 14, 2);
+ rn = extract32(insn, 5, 5);
+ opc = extract32(insn, 3, 2);
+ op2r = extract32(insn, 0, 3);
+
+ if (mos || op || op2r) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ size = MO_32;
+ break;
+ case 1:
+ size = MO_64;
+ break;
+ case 3:
+ size = MO_16;
+ if (dc_isar_feature(aa64_fp16, s)) {
+ break;
+ }
+ /* fallthru */
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_fp_compare(s, size, rn, rm, opc & 1, opc & 2);
+}
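+
+/*
+ * opc above selects the four compare variants: bit 0 is the
+ * compare-with-zero form and bit 1 the signalling form, so FCMP S0, S1
+ * has opc == 0, FCMP S0, #0.0 has opc == 1, and FCMPE uses opc == 2 or 3.
+ * Only FCMPE (signal_all_nans) raises Invalid Operation for quiet NaN
+ * inputs; FCMP does so only for signalling NaNs.
+ */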
+
+/* Floating point conditional compare
+ * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0
+ * +---+---+---+-----------+------+---+------+------+-----+------+----+------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 0 1 | Rn | op | nzcv |
+ * +---+---+---+-----------+------+---+------+------+-----+------+----+------+
+ */
+static void disas_fp_ccomp(DisasContext *s, uint32_t insn)
+{
+ unsigned int mos, type, rm, cond, rn, op, nzcv;
+ TCGLabel *label_continue = NULL;
+ int size;
+
+ mos = extract32(insn, 29, 3);
+ type = extract32(insn, 22, 2);
+ rm = extract32(insn, 16, 5);
+ cond = extract32(insn, 12, 4);
+ rn = extract32(insn, 5, 5);
+ op = extract32(insn, 4, 1);
+ nzcv = extract32(insn, 0, 4);
+
+ if (mos) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ size = MO_32;
+ break;
+ case 1:
+ size = MO_64;
+ break;
+ case 3:
+ size = MO_16;
+ if (dc_isar_feature(aa64_fp16, s)) {
+ break;
+ }
+ /* fallthru */
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (cond < 0x0e) { /* not always */
+ TCGLabel *label_match = gen_new_label();
+ label_continue = gen_new_label();
+ arm_gen_test_cc(cond, label_match);
+ /* nomatch: */
+ gen_set_nzcv(tcg_constant_i64(nzcv << 28));
+ tcg_gen_br(label_continue);
+ gen_set_label(label_match);
+ }
+
+ handle_fp_compare(s, size, rn, rm, false, op);
+
+ if (cond < 0x0e) {
+ gen_set_label(label_continue);
+ }
+}
+
+/* Floating point conditional select
+ * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+------+-----+------+------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 1 1 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+------+-----+------+------+
+ */
+static void disas_fp_csel(DisasContext *s, uint32_t insn)
+{
+ unsigned int mos, type, rm, cond, rn, rd;
+ TCGv_i64 t_true, t_false;
+ DisasCompare64 c;
+ MemOp sz;
+
+ mos = extract32(insn, 29, 3);
+ type = extract32(insn, 22, 2);
+ rm = extract32(insn, 16, 5);
+ cond = extract32(insn, 12, 4);
+ rn = extract32(insn, 5, 5);
+ rd = extract32(insn, 0, 5);
+
+ if (mos) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ sz = MO_32;
+ break;
+ case 1:
+ sz = MO_64;
+ break;
+ case 3:
+ sz = MO_16;
+ if (dc_isar_feature(aa64_fp16, s)) {
+ break;
+ }
+ /* fallthru */
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ /* Zero extend sreg & hreg inputs to 64 bits now. */
+ t_true = tcg_temp_new_i64();
+ t_false = tcg_temp_new_i64();
+ read_vec_element(s, t_true, rn, 0, sz);
+ read_vec_element(s, t_false, rm, 0, sz);
+
+ a64_test_cc(&c, cond);
+ tcg_gen_movcond_i64(c.cond, t_true, c.value, tcg_constant_i64(0),
+ t_true, t_false);
+ tcg_temp_free_i64(t_false);
+ a64_free_cc(&c);
+
+ /* Note that sregs & hregs write back zeros to the high bits,
+ and we've already done the zero-extension. */
+ write_fp_dreg(s, rd, t_true);
+ tcg_temp_free_i64(t_true);
+}
+
+/* Floating-point data-processing (1 source) - half precision */
+static void handle_fp_1src_half(DisasContext *s, int opcode, int rd, int rn)
+{
+ TCGv_ptr fpst = NULL;
+ TCGv_i32 tcg_op = read_fp_hreg(s, rn);
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ switch (opcode) {
+ case 0x0: /* FMOV */
+ tcg_gen_mov_i32(tcg_res, tcg_op);
+ break;
+ case 0x1: /* FABS */
+ tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff);
+ break;
+ case 0x2: /* FNEG */
+ tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000);
+ break;
+ case 0x3: /* FSQRT */
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_sqrt_f16(tcg_res, tcg_op, fpst);
+ break;
+ case 0x8: /* FRINTN */
+ case 0x9: /* FRINTP */
+ case 0xa: /* FRINTM */
+ case 0xb: /* FRINTZ */
+ case 0xc: /* FRINTA */
+ {
+ TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(opcode & 7));
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst);
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ break;
+ }
+ case 0xe: /* FRINTX */
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, fpst);
+ break;
+ case 0xf: /* FRINTI */
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ if (fpst) {
+ tcg_temp_free_ptr(fpst);
+ }
+ tcg_temp_free_i32(tcg_op);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating-point data-processing (1 source) - single precision */
+static void handle_fp_1src_single(DisasContext *s, int opcode, int rd, int rn)
+{
+ void (*gen_fpst)(TCGv_i32, TCGv_i32, TCGv_ptr);
+ TCGv_i32 tcg_op, tcg_res;
+ TCGv_ptr fpst;
+ int rmode = -1;
+
+ tcg_op = read_fp_sreg(s, rn);
+ tcg_res = tcg_temp_new_i32();
+
+ switch (opcode) {
+ case 0x0: /* FMOV */
+ tcg_gen_mov_i32(tcg_res, tcg_op);
+ goto done;
+ case 0x1: /* FABS */
+ gen_helper_vfp_abss(tcg_res, tcg_op);
+ goto done;
+ case 0x2: /* FNEG */
+ gen_helper_vfp_negs(tcg_res, tcg_op);
+ goto done;
+ case 0x3: /* FSQRT */
+ gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env);
+ goto done;
+ case 0x6: /* BFCVT */
+ gen_fpst = gen_helper_bfcvt;
+ break;
+ case 0x8: /* FRINTN */
+ case 0x9: /* FRINTP */
+ case 0xa: /* FRINTM */
+ case 0xb: /* FRINTZ */
+ case 0xc: /* FRINTA */
+ rmode = arm_rmode_to_sf(opcode & 7);
+ gen_fpst = gen_helper_rints;
+ break;
+ case 0xe: /* FRINTX */
+ gen_fpst = gen_helper_rints_exact;
+ break;
+ case 0xf: /* FRINTI */
+ gen_fpst = gen_helper_rints;
+ break;
+ case 0x10: /* FRINT32Z */
+ rmode = float_round_to_zero;
+ gen_fpst = gen_helper_frint32_s;
+ break;
+ case 0x11: /* FRINT32X */
+ gen_fpst = gen_helper_frint32_s;
+ break;
+ case 0x12: /* FRINT64Z */
+ rmode = float_round_to_zero;
+ gen_fpst = gen_helper_frint64_s;
+ break;
+ case 0x13: /* FRINT64X */
+ gen_fpst = gen_helper_frint64_s;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ if (rmode >= 0) {
+ TCGv_i32 tcg_rmode = tcg_const_i32(rmode);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_fpst(tcg_res, tcg_op, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ } else {
+ gen_fpst(tcg_res, tcg_op, fpst);
+ }
+ tcg_temp_free_ptr(fpst);
+
+ done:
+ write_fp_sreg(s, rd, tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating-point data-processing (1 source) - double precision */
+static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn)
+{
+ void (*gen_fpst)(TCGv_i64, TCGv_i64, TCGv_ptr);
+ TCGv_i64 tcg_op, tcg_res;
+ TCGv_ptr fpst;
+ int rmode = -1;
+
+ switch (opcode) {
+ case 0x0: /* FMOV */
+ gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0);
+ return;
+ }
+
+ tcg_op = read_fp_dreg(s, rn);
+ tcg_res = tcg_temp_new_i64();
+
+ switch (opcode) {
+ case 0x1: /* FABS */
+ gen_helper_vfp_absd(tcg_res, tcg_op);
+ goto done;
+ case 0x2: /* FNEG */
+ gen_helper_vfp_negd(tcg_res, tcg_op);
+ goto done;
+ case 0x3: /* FSQRT */
+ gen_helper_vfp_sqrtd(tcg_res, tcg_op, cpu_env);
+ goto done;
+ case 0x8: /* FRINTN */
+ case 0x9: /* FRINTP */
+ case 0xa: /* FRINTM */
+ case 0xb: /* FRINTZ */
+ case 0xc: /* FRINTA */
+ rmode = arm_rmode_to_sf(opcode & 7);
+ gen_fpst = gen_helper_rintd;
+ break;
+ case 0xe: /* FRINTX */
+ gen_fpst = gen_helper_rintd_exact;
+ break;
+ case 0xf: /* FRINTI */
+ gen_fpst = gen_helper_rintd;
+ break;
+ case 0x10: /* FRINT32Z */
+ rmode = float_round_to_zero;
+ gen_fpst = gen_helper_frint32_d;
+ break;
+ case 0x11: /* FRINT32X */
+ gen_fpst = gen_helper_frint32_d;
+ break;
+ case 0x12: /* FRINT64Z */
+ rmode = float_round_to_zero;
+ gen_fpst = gen_helper_frint64_d;
+ break;
+ case 0x13: /* FRINT64X */
+ gen_fpst = gen_helper_frint64_d;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ if (rmode >= 0) {
+ TCGv_i32 tcg_rmode = tcg_const_i32(rmode);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_fpst(tcg_res, tcg_op, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ } else {
+ gen_fpst(tcg_res, tcg_op, fpst);
+ }
+ tcg_temp_free_ptr(fpst);
+
+ done:
+ write_fp_dreg(s, rd, tcg_res);
+ tcg_temp_free_i64(tcg_op);
+ tcg_temp_free_i64(tcg_res);
+}
+
+static void handle_fp_fcvt(DisasContext *s, int opcode,
+ int rd, int rn, int dtype, int ntype)
+{
+ switch (ntype) {
+ case 0x0:
+ {
+ TCGv_i32 tcg_rn = read_fp_sreg(s, rn);
+ if (dtype == 1) {
+ /* Single to double */
+ TCGv_i64 tcg_rd = tcg_temp_new_i64();
+ gen_helper_vfp_fcvtds(tcg_rd, tcg_rn, cpu_env);
+ write_fp_dreg(s, rd, tcg_rd);
+ tcg_temp_free_i64(tcg_rd);
+ } else {
+ /* Single to half */
+ TCGv_i32 tcg_rd = tcg_temp_new_i32();
+ TCGv_i32 ahp = get_ahp_flag();
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+
+ gen_helper_vfp_fcvt_f32_to_f16(tcg_rd, tcg_rn, fpst, ahp);
+ /* write_fp_sreg is OK here because top half of tcg_rd is zero */
+ write_fp_sreg(s, rd, tcg_rd);
+ tcg_temp_free_i32(tcg_rd);
+ tcg_temp_free_i32(ahp);
+ tcg_temp_free_ptr(fpst);
+ }
+ tcg_temp_free_i32(tcg_rn);
+ break;
+ }
+ case 0x1:
+ {
+ TCGv_i64 tcg_rn = read_fp_dreg(s, rn);
+ TCGv_i32 tcg_rd = tcg_temp_new_i32();
+ if (dtype == 0) {
+ /* Double to single */
+ gen_helper_vfp_fcvtsd(tcg_rd, tcg_rn, cpu_env);
+ } else {
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ TCGv_i32 ahp = get_ahp_flag();
+ /* Double to half */
+ gen_helper_vfp_fcvt_f64_to_f16(tcg_rd, tcg_rn, fpst, ahp);
+ /* write_fp_sreg is OK here because top half of tcg_rd is zero */
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(ahp);
+ }
+ write_fp_sreg(s, rd, tcg_rd);
+ tcg_temp_free_i32(tcg_rd);
+ tcg_temp_free_i64(tcg_rn);
+ break;
+ }
+ case 0x3:
+ {
+ TCGv_i32 tcg_rn = read_fp_sreg(s, rn);
+ TCGv_ptr tcg_fpst = fpstatus_ptr(FPST_FPCR);
+ TCGv_i32 tcg_ahp = get_ahp_flag();
+ tcg_gen_ext16u_i32(tcg_rn, tcg_rn);
+ if (dtype == 0) {
+ /* Half to single */
+ TCGv_i32 tcg_rd = tcg_temp_new_i32();
+ gen_helper_vfp_fcvt_f16_to_f32(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp);
+ write_fp_sreg(s, rd, tcg_rd);
+ tcg_temp_free_i32(tcg_rd);
+ } else {
+ /* Half to double */
+ TCGv_i64 tcg_rd = tcg_temp_new_i64();
+ gen_helper_vfp_fcvt_f16_to_f64(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp);
+ write_fp_dreg(s, rd, tcg_rd);
+ tcg_temp_free_i64(tcg_rd);
+ }
+ tcg_temp_free_i32(tcg_rn);
+ tcg_temp_free_ptr(tcg_fpst);
+ tcg_temp_free_i32(tcg_ahp);
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+}
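+
+/*
+ * ntype is the source format and dtype the destination.  For example
+ * FCVT D0, S1 decodes with type == 0 (single-precision source) and
+ * opcode == 0x5, so dtype == 1 and the single-to-double path above is
+ * used; the dtype == ntype and type == 2 combinations are rejected by
+ * the caller, disas_fp_1src(), before this function is reached.
+ */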
+
+/* Floating point data-processing (1 source)
+ * 31 30 29 28 24 23 22 21 20 15 14 10 9 5 4 0
+ * +---+---+---+-----------+------+---+--------+-----------+------+------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | opcode | 1 0 0 0 0 | Rn | Rd |
+ * +---+---+---+-----------+------+---+--------+-----------+------+------+
+ */
+static void disas_fp_1src(DisasContext *s, uint32_t insn)
+{
+ int mos = extract32(insn, 29, 3);
+ int type = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 15, 6);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ if (mos) {
+ goto do_unallocated;
+ }
+
+ switch (opcode) {
+ case 0x4: case 0x5: case 0x7:
+ {
+ /* FCVT between half, single and double precision */
+ int dtype = extract32(opcode, 0, 2);
+ if (type == 2 || dtype == type) {
+ goto do_unallocated;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_fp_fcvt(s, opcode, rd, rn, dtype, type);
+ break;
+ }
+
+ case 0x10 ... 0x13: /* FRINT{32,64}{X,Z} */
+ if (type > 1 || !dc_isar_feature(aa64_frint, s)) {
+ goto do_unallocated;
+ }
+ /* fall through */
+ case 0x0 ... 0x3:
+ case 0x8 ... 0xc:
+ case 0xe ... 0xf:
+ /* 32-to-32 and 64-to-64 ops */
+ switch (type) {
+ case 0:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_1src_single(s, opcode, rd, rn);
+ break;
+ case 1:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_1src_double(s, opcode, rd, rn);
+ break;
+ case 3:
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ goto do_unallocated;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_1src_half(s, opcode, rd, rn);
+ break;
+ default:
+ goto do_unallocated;
+ }
+ break;
+
+ case 0x6:
+ switch (type) {
+ case 1: /* BFCVT */
+ if (!dc_isar_feature(aa64_bf16, s)) {
+ goto do_unallocated;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_1src_single(s, opcode, rd, rn);
+ break;
+ default:
+ goto do_unallocated;
+ }
+ break;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* Floating-point data-processing (2 source) - single precision */
+static void handle_fp_2src_single(DisasContext *s, int opcode,
+ int rd, int rn, int rm)
+{
+ TCGv_i32 tcg_op1;
+ TCGv_i32 tcg_op2;
+ TCGv_i32 tcg_res;
+ TCGv_ptr fpst;
+
+ tcg_res = tcg_temp_new_i32();
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tcg_op1 = read_fp_sreg(s, rn);
+ tcg_op2 = read_fp_sreg(s, rm);
+
+ switch (opcode) {
+ case 0x0: /* FMUL */
+ gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1: /* FDIV */
+ gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2: /* FADD */
+ gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3: /* FSUB */
+ gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x4: /* FMAX */
+ gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5: /* FMIN */
+ gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x6: /* FMAXNM */
+ gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7: /* FMINNM */
+ gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x8: /* FNMUL */
+ gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_negs(tcg_res, tcg_res);
+ break;
+ }
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating-point data-processing (2 source) - double precision */
+static void handle_fp_2src_double(DisasContext *s, int opcode,
+ int rd, int rn, int rm)
+{
+ TCGv_i64 tcg_op1;
+ TCGv_i64 tcg_op2;
+ TCGv_i64 tcg_res;
+ TCGv_ptr fpst;
+
+ tcg_res = tcg_temp_new_i64();
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tcg_op1 = read_fp_dreg(s, rn);
+ tcg_op2 = read_fp_dreg(s, rm);
+
+ switch (opcode) {
+ case 0x0: /* FMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1: /* FDIV */
+ gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2: /* FADD */
+ gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3: /* FSUB */
+ gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x4: /* FMAX */
+ gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5: /* FMIN */
+ gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x6: /* FMAXNM */
+ gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7: /* FMINNM */
+ gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x8: /* FNMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_negd(tcg_res, tcg_res);
+ break;
+ }
+
+ write_fp_dreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_res);
+}
+
+/* Floating-point data-processing (2 source) - half precision */
+static void handle_fp_2src_half(DisasContext *s, int opcode,
+ int rd, int rn, int rm)
+{
+ TCGv_i32 tcg_op1;
+ TCGv_i32 tcg_op2;
+ TCGv_i32 tcg_res;
+ TCGv_ptr fpst;
+
+ tcg_res = tcg_temp_new_i32();
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ tcg_op1 = read_fp_hreg(s, rn);
+ tcg_op2 = read_fp_hreg(s, rm);
+
+ switch (opcode) {
+ case 0x0: /* FMUL */
+ gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1: /* FDIV */
+ gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2: /* FADD */
+ gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3: /* FSUB */
+ gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x4: /* FMAX */
+ gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5: /* FMIN */
+ gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x6: /* FMAXNM */
+ gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7: /* FMINNM */
+ gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x8: /* FNMUL */
+ gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);
+ tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating point data-processing (2 source)
+ * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | opcode | 1 0 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ */
+static void disas_fp_2src(DisasContext *s, uint32_t insn)
+{
+ int mos = extract32(insn, 29, 3);
+ int type = extract32(insn, 22, 2);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int opcode = extract32(insn, 12, 4);
+
+ if (opcode > 8 || mos) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_2src_single(s, opcode, rd, rn, rm);
+ break;
+ case 1:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_2src_double(s, opcode, rd, rn, rm);
+ break;
+ case 3:
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_2src_half(s, opcode, rd, rn, rm);
+ break;
+ default:
+ unallocated_encoding(s);
+ }
+}
+
+/* Floating-point data-processing (3 source) - single precision */
+static void handle_fp_3src_single(DisasContext *s, bool o0, bool o1,
+ int rd, int rn, int rm, int ra)
+{
+ TCGv_i32 tcg_op1, tcg_op2, tcg_op3;
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+
+ tcg_op1 = read_fp_sreg(s, rn);
+ tcg_op2 = read_fp_sreg(s, rm);
+ tcg_op3 = read_fp_sreg(s, ra);
+
+ /* These are fused multiply-add, and must be done as one
+ * floating point operation with no rounding between the
+ * multiplication and addition steps.
+ * NB that doing the negations here as separate steps is
+     * correct: an input NaN should come out with its sign bit
+     * flipped if it is a negated input.
+ */
+ if (o1 == true) {
+ gen_helper_vfp_negs(tcg_op3, tcg_op3);
+ }
+
+ if (o0 != o1) {
+ gen_helper_vfp_negs(tcg_op1, tcg_op1);
+ }
+
+ gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst);
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_op3);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating-point data-processing (3 source) - double precision */
+static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1,
+ int rd, int rn, int rm, int ra)
+{
+ TCGv_i64 tcg_op1, tcg_op2, tcg_op3;
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+
+ tcg_op1 = read_fp_dreg(s, rn);
+ tcg_op2 = read_fp_dreg(s, rm);
+ tcg_op3 = read_fp_dreg(s, ra);
+
+ /* These are fused multiply-add, and must be done as one
+ * floating point operation with no rounding between the
+ * multiplication and addition steps.
+ * NB that doing the negations here as separate steps is
+     * correct: an input NaN should come out with its sign bit
+     * flipped if it is a negated input.
+ */
+ if (o1 == true) {
+ gen_helper_vfp_negd(tcg_op3, tcg_op3);
+ }
+
+ if (o0 != o1) {
+ gen_helper_vfp_negd(tcg_op1, tcg_op1);
+ }
+
+ gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst);
+
+ write_fp_dreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_op3);
+ tcg_temp_free_i64(tcg_res);
+}
+
+/* Floating-point data-processing (3 source) - half precision */
+static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1,
+ int rd, int rn, int rm, int ra)
+{
+ TCGv_i32 tcg_op1, tcg_op2, tcg_op3;
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ tcg_op1 = read_fp_hreg(s, rn);
+ tcg_op2 = read_fp_hreg(s, rm);
+ tcg_op3 = read_fp_hreg(s, ra);
+
+ /* These are fused multiply-add, and must be done as one
+ * floating point operation with no rounding between the
+ * multiplication and addition steps.
+ * NB that doing the negations here as separate steps is
+     * correct: an input NaN should come out with its sign bit
+     * flipped if it is a negated input.
+ */
+ if (o1 == true) {
+ tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000);
+ }
+
+ if (o0 != o1) {
+ tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000);
+ }
+
+ gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst);
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_op3);
+ tcg_temp_free_i32(tcg_res);
+}
+
+/* Floating point data-processing (3 source)
+ * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0
+ * +---+---+---+-----------+------+----+------+----+------+------+------+
+ * | M | 0 | S | 1 1 1 1 1 | type | o1 | Rm | o0 | Ra | Rn | Rd |
+ * +---+---+---+-----------+------+----+------+----+------+------+------+
+ */
+static void disas_fp_3src(DisasContext *s, uint32_t insn)
+{
+ int mos = extract32(insn, 29, 3);
+ int type = extract32(insn, 22, 2);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int ra = extract32(insn, 10, 5);
+ int rm = extract32(insn, 16, 5);
+ bool o0 = extract32(insn, 15, 1);
+ bool o1 = extract32(insn, 21, 1);
+
+ if (mos) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_3src_single(s, o0, o1, rd, rn, rm, ra);
+ break;
+ case 1:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra);
+ break;
+ case 3:
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra);
+ break;
+ default:
+ unallocated_encoding(s);
+ }
+}
+
+/* Floating point immediate
+ * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------------+-------+------+------+
+ * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd |
+ * +---+---+---+-----------+------+---+------------+-------+------+------+
+ */
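+/*
+ * The imm8 field is expanded by vfp_expand_imm() below; it can encode
+ * values of the form +/- (16..31)/16 * 2^(-3..4), e.g. imm8 == 0x70
+ * expands to 1.0 for all three sizes.
+ */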
+static void disas_fp_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int imm5 = extract32(insn, 5, 5);
+ int imm8 = extract32(insn, 13, 8);
+ int type = extract32(insn, 22, 2);
+ int mos = extract32(insn, 29, 3);
+ uint64_t imm;
+ MemOp sz;
+
+ if (mos || imm5) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0:
+ sz = MO_32;
+ break;
+ case 1:
+ sz = MO_64;
+ break;
+ case 3:
+ sz = MO_16;
+ if (dc_isar_feature(aa64_fp16, s)) {
+ break;
+ }
+ /* fallthru */
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ imm = vfp_expand_imm(sz, imm8);
+ write_fp_dreg(s, rd, tcg_constant_i64(imm));
+}
+
+/* Handle floating point <=> fixed point conversions. Note that we can
+ * also deal with fp <=> integer conversions as a special case (scale == 64).
+ * OPTME: consider handling that special case specially or at least skipping
+ * the call to scalbn in the helpers for zero shifts.
+ */
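+/*
+ * In the fixed-point encoding the scale field holds 64 - fbits, so the
+ * tcg_shift computed below recovers fbits; passing scale == 64 (as
+ * disas_fp_int_conv does) gives a zero shift, i.e. a plain fp <=> int
+ * conversion.
+ */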
+static void handle_fpfpcvt(DisasContext *s, int rd, int rn, int opcode,
+ bool itof, int rmode, int scale, int sf, int type)
+{
+ bool is_signed = !(opcode & 1);
+ TCGv_ptr tcg_fpstatus;
+ TCGv_i32 tcg_shift, tcg_single;
+ TCGv_i64 tcg_double;
+
+ tcg_fpstatus = fpstatus_ptr(type == 3 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ tcg_shift = tcg_constant_i32(64 - scale);
+
+ if (itof) {
+ TCGv_i64 tcg_int = cpu_reg(s, rn);
+ if (!sf) {
+ TCGv_i64 tcg_extend = new_tmp_a64(s);
+
+ if (is_signed) {
+ tcg_gen_ext32s_i64(tcg_extend, tcg_int);
+ } else {
+ tcg_gen_ext32u_i64(tcg_extend, tcg_int);
+ }
+
+ tcg_int = tcg_extend;
+ }
+
+ switch (type) {
+ case 1: /* float64 */
+ tcg_double = tcg_temp_new_i64();
+ if (is_signed) {
+ gen_helper_vfp_sqtod(tcg_double, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_uqtod(tcg_double, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ }
+ write_fp_dreg(s, rd, tcg_double);
+ tcg_temp_free_i64(tcg_double);
+ break;
+
+ case 0: /* float32 */
+ tcg_single = tcg_temp_new_i32();
+ if (is_signed) {
+ gen_helper_vfp_sqtos(tcg_single, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_uqtos(tcg_single, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ }
+ write_fp_sreg(s, rd, tcg_single);
+ tcg_temp_free_i32(tcg_single);
+ break;
+
+ case 3: /* float16 */
+ tcg_single = tcg_temp_new_i32();
+ if (is_signed) {
+ gen_helper_vfp_sqtoh(tcg_single, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_uqtoh(tcg_single, tcg_int,
+ tcg_shift, tcg_fpstatus);
+ }
+ write_fp_sreg(s, rd, tcg_single);
+ tcg_temp_free_i32(tcg_single);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ TCGv_i64 tcg_int = cpu_reg(s, rd);
+ TCGv_i32 tcg_rmode;
+
+ if (extract32(opcode, 2, 1)) {
+ /* There are too many rounding modes to all fit into rmode,
+ * so FCVTA[US] is a special case.
+ */
+ rmode = FPROUNDING_TIEAWAY;
+ }
+
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+
+ switch (type) {
+ case 1: /* float64 */
+ tcg_double = read_fp_dreg(s, rn);
+ if (is_signed) {
+ if (!sf) {
+ gen_helper_vfp_tosld(tcg_int, tcg_double,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_tosqd(tcg_int, tcg_double,
+ tcg_shift, tcg_fpstatus);
+ }
+ } else {
+ if (!sf) {
+ gen_helper_vfp_tould(tcg_int, tcg_double,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_touqd(tcg_int, tcg_double,
+ tcg_shift, tcg_fpstatus);
+ }
+ }
+ if (!sf) {
+ tcg_gen_ext32u_i64(tcg_int, tcg_int);
+ }
+ tcg_temp_free_i64(tcg_double);
+ break;
+
+ case 0: /* float32 */
+ tcg_single = read_fp_sreg(s, rn);
+ if (sf) {
+ if (is_signed) {
+ gen_helper_vfp_tosqs(tcg_int, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_touqs(tcg_int, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ }
+ } else {
+ TCGv_i32 tcg_dest = tcg_temp_new_i32();
+ if (is_signed) {
+ gen_helper_vfp_tosls(tcg_dest, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_touls(tcg_dest, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ }
+ tcg_gen_extu_i32_i64(tcg_int, tcg_dest);
+ tcg_temp_free_i32(tcg_dest);
+ }
+ tcg_temp_free_i32(tcg_single);
+ break;
+
+ case 3: /* float16 */
+ tcg_single = read_fp_sreg(s, rn);
+ if (sf) {
+ if (is_signed) {
+ gen_helper_vfp_tosqh(tcg_int, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_touqh(tcg_int, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ }
+ } else {
+ TCGv_i32 tcg_dest = tcg_temp_new_i32();
+ if (is_signed) {
+ gen_helper_vfp_toslh(tcg_dest, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_toulh(tcg_dest, tcg_single,
+ tcg_shift, tcg_fpstatus);
+ }
+ tcg_gen_extu_i32_i64(tcg_int, tcg_dest);
+ tcg_temp_free_i32(tcg_dest);
+ }
+ tcg_temp_free_i32(tcg_single);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ tcg_temp_free_i32(tcg_rmode);
+ }
+
+ tcg_temp_free_ptr(tcg_fpstatus);
+}
+
+/* Floating point <-> fixed point conversions
+ * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0
+ * +----+---+---+-----------+------+---+-------+--------+-------+------+------+
+ * | sf | 0 | S | 1 1 1 1 0 | type | 0 | rmode | opcode | scale | Rn | Rd |
+ * +----+---+---+-----------+------+---+-------+--------+-------+------+------+
+ */
+static void disas_fp_fixed_conv(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int scale = extract32(insn, 10, 6);
+ int opcode = extract32(insn, 16, 3);
+ int rmode = extract32(insn, 19, 2);
+ int type = extract32(insn, 22, 2);
+ bool sbit = extract32(insn, 29, 1);
+ bool sf = extract32(insn, 31, 1);
+ bool itof;
+
+ if (sbit || (!sf && scale < 32)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (type) {
+ case 0: /* float32 */
+ case 1: /* float64 */
+ break;
+ case 3: /* float16 */
+ if (dc_isar_feature(aa64_fp16, s)) {
+ break;
+ }
+ /* fallthru */
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch ((rmode << 3) | opcode) {
+ case 0x2: /* SCVTF */
+ case 0x3: /* UCVTF */
+ itof = true;
+ break;
+ case 0x18: /* FCVTZS */
+ case 0x19: /* FCVTZU */
+ itof = false;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_fpfpcvt(s, rd, rn, opcode, itof, FPROUNDING_ZERO, scale, sf, type);
+}
+
+static void handle_fmov(DisasContext *s, int rd, int rn, int type, bool itof)
+{
+ /* FMOV: gpr to or from float, double, or top half of quad fp reg,
+ * without conversion.
+ */
+
+ if (itof) {
+ TCGv_i64 tcg_rn = cpu_reg(s, rn);
+ TCGv_i64 tmp;
+
+ switch (type) {
+ case 0:
+ /* 32 bit */
+ tmp = tcg_temp_new_i64();
+ tcg_gen_ext32u_i64(tmp, tcg_rn);
+ write_fp_dreg(s, rd, tmp);
+ tcg_temp_free_i64(tmp);
+ break;
+ case 1:
+ /* 64 bit */
+ write_fp_dreg(s, rd, tcg_rn);
+ break;
+ case 2:
+ /* 64 bit to top half. */
+ tcg_gen_st_i64(tcg_rn, cpu_env, fp_reg_hi_offset(s, rd));
+ clear_vec_high(s, true, rd);
+ break;
+ case 3:
+ /* 16 bit */
+ tmp = tcg_temp_new_i64();
+ tcg_gen_ext16u_i64(tmp, tcg_rn);
+ write_fp_dreg(s, rd, tmp);
+ tcg_temp_free_i64(tmp);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ TCGv_i64 tcg_rd = cpu_reg(s, rd);
+
+ switch (type) {
+ case 0:
+ /* 32 bit */
+ tcg_gen_ld32u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_32));
+ break;
+ case 1:
+ /* 64 bit */
+ tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_64));
+ break;
+ case 2:
+ /* 64 bits from top half */
+ tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_hi_offset(s, rn));
+ break;
+ case 3:
+ /* 16 bit */
+ tcg_gen_ld16u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_16));
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+}
+
+static void handle_fjcvtzs(DisasContext *s, int rd, int rn)
+{
+ TCGv_i64 t = read_fp_dreg(s, rn);
+ TCGv_ptr fpstatus = fpstatus_ptr(FPST_FPCR);
+
+ gen_helper_fjcvtzs(t, t, fpstatus);
+
+ tcg_temp_free_ptr(fpstatus);
+
+ tcg_gen_ext32u_i64(cpu_reg(s, rd), t);
+ tcg_gen_extrh_i64_i32(cpu_ZF, t);
+ tcg_gen_movi_i32(cpu_CF, 0);
+ tcg_gen_movi_i32(cpu_NF, 0);
+ tcg_gen_movi_i32(cpu_VF, 0);
+
+ tcg_temp_free_i64(t);
+}
+
+/* Floating point <-> integer conversions
+ * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0
+ * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+
+ * | sf | 0 | S | 1 1 1 1 0 | type | 1 | rmode | opc | 0 0 0 0 0 0 | Rn | Rd |
+ * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+
+ */
+static void disas_fp_int_conv(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 16, 3);
+ int rmode = extract32(insn, 19, 2);
+ int type = extract32(insn, 22, 2);
+ bool sbit = extract32(insn, 29, 1);
+ bool sf = extract32(insn, 31, 1);
+ bool itof = false;
+
+ if (sbit) {
+ goto do_unallocated;
+ }
+
+ switch (opcode) {
+ case 2: /* SCVTF */
+ case 3: /* UCVTF */
+ itof = true;
+ /* fallthru */
+ case 4: /* FCVTAS */
+ case 5: /* FCVTAU */
+ if (rmode != 0) {
+ goto do_unallocated;
+ }
+ /* fallthru */
+ case 0: /* FCVT[NPMZ]S */
+ case 1: /* FCVT[NPMZ]U */
+ switch (type) {
+ case 0: /* float32 */
+ case 1: /* float64 */
+ break;
+ case 3: /* float16 */
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ goto do_unallocated;
+ }
+ break;
+ default:
+ goto do_unallocated;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fpfpcvt(s, rd, rn, opcode, itof, rmode, 64, sf, type);
+ break;
+
+ default:
+ switch (sf << 7 | type << 5 | rmode << 3 | opcode) {
+ case 0b01100110: /* FMOV half <-> 32-bit int */
+ case 0b01100111:
+ case 0b11100110: /* FMOV half <-> 64-bit int */
+ case 0b11100111:
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ goto do_unallocated;
+ }
+ /* fallthru */
+ case 0b00000110: /* FMOV 32-bit */
+ case 0b00000111:
+ case 0b10100110: /* FMOV 64-bit */
+ case 0b10100111:
+ case 0b11001110: /* FMOV top half of 128-bit */
+ case 0b11001111:
+ if (!fp_access_check(s)) {
+ return;
+ }
+ itof = opcode & 1;
+ handle_fmov(s, rd, rn, type, itof);
+ break;
+
+ case 0b00111110: /* FJCVTZS */
+ if (!dc_isar_feature(aa64_jscvt, s)) {
+ goto do_unallocated;
+ } else if (fp_access_check(s)) {
+ handle_fjcvtzs(s, rd, rn);
+ }
+ break;
+
+ default:
+ do_unallocated:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+}
+
+/* FP-specific subcases of table C3-6 (SIMD and FP data processing)
+ * 31 30 29 28 25 24 0
+ * +---+---+---+---------+-----------------------------+
+ * | | 0 | | 1 1 1 1 | |
+ * +---+---+---+---------+-----------------------------+
+ */
+static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
+{
+ if (extract32(insn, 24, 1)) {
+ /* Floating point data-processing (3 source) */
+ disas_fp_3src(s, insn);
+ } else if (extract32(insn, 21, 1) == 0) {
+ /* Floating point to fixed point conversions */
+ disas_fp_fixed_conv(s, insn);
+ } else {
+ switch (extract32(insn, 10, 2)) {
+ case 1:
+ /* Floating point conditional compare */
+ disas_fp_ccomp(s, insn);
+ break;
+ case 2:
+ /* Floating point data-processing (2 source) */
+ disas_fp_2src(s, insn);
+ break;
+ case 3:
+ /* Floating point conditional select */
+ disas_fp_csel(s, insn);
+ break;
+ case 0:
+ switch (ctz32(extract32(insn, 12, 4))) {
+ case 0: /* [15:12] == xxx1 */
+ /* Floating point immediate */
+ disas_fp_imm(s, insn);
+ break;
+ case 1: /* [15:12] == xx10 */
+ /* Floating point compare */
+ disas_fp_compare(s, insn);
+ break;
+ case 2: /* [15:12] == x100 */
+ /* Floating point data-processing (1 source) */
+ disas_fp_1src(s, insn);
+ break;
+ case 3: /* [15:12] == 1000 */
+ unallocated_encoding(s);
+ break;
+ default: /* [15:12] == 0000 */
+ /* Floating point <-> integer conversions */
+ disas_fp_int_conv(s, insn);
+ break;
+ }
+ break;
+ }
+ }
+}
+
+static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right,
+ int pos)
+{
+ /* Extract 64 bits from the middle of two concatenated 64 bit
+ * vector register slices left:right. The extracted bits start
+ * at 'pos' bits into the right (least significant) side.
+ * We return the result in tcg_right, and guarantee not to
+ * trash tcg_left.
+ */
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+ assert(pos > 0 && pos < 64);
+
+ tcg_gen_shri_i64(tcg_right, tcg_right, pos);
+ tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos);
+ tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp);
+
+ tcg_temp_free_i64(tcg_tmp);
+}
+
+/* EXT
+ * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0
+ * +---+---+-------------+-----+---+------+---+------+---+------+------+
+ * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd |
+ * +---+---+-------------+-----+---+------+---+------+---+------+------+
+ */
+static void disas_simd_ext(DisasContext *s, uint32_t insn)
+{
+ int is_q = extract32(insn, 30, 1);
+ int op2 = extract32(insn, 22, 2);
+ int imm4 = extract32(insn, 11, 4);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ int pos = imm4 << 3;
+ TCGv_i64 tcg_resl, tcg_resh;
+
+ if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_resh = tcg_temp_new_i64();
+ tcg_resl = tcg_temp_new_i64();
+
+ /* Vd gets bits starting at pos bits into Vm:Vn. This is
+ * either extracting 128 bits from a 128:128 concatenation, or
+ * extracting 64 bits from a 64:64 concatenation.
+ */
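+ /*
+ * E.g. for a 128-bit EXT with imm4 == 3 (pos == 24) the result is
+ * bytes 3..15 of Vn followed by bytes 0..2 of Vm.
+ */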
+ if (!is_q) {
+ read_vec_element(s, tcg_resl, rn, 0, MO_64);
+ if (pos != 0) {
+ read_vec_element(s, tcg_resh, rm, 0, MO_64);
+ do_ext64(s, tcg_resh, tcg_resl, pos);
+ }
+ } else {
+ TCGv_i64 tcg_hh;
+ typedef struct {
+ int reg;
+ int elt;
+ } EltPosns;
+ EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} };
+ EltPosns *elt = eltposns;
+
+ if (pos >= 64) {
+ elt++;
+ pos -= 64;
+ }
+
+ read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
+ elt++;
+ read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
+ elt++;
+ if (pos != 0) {
+ do_ext64(s, tcg_resh, tcg_resl, pos);
+ tcg_hh = tcg_temp_new_i64();
+ read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64);
+ do_ext64(s, tcg_hh, tcg_resh, pos);
+ tcg_temp_free_i64(tcg_hh);
+ }
+ }
+
+ write_vec_element(s, tcg_resl, rd, 0, MO_64);
+ tcg_temp_free_i64(tcg_resl);
+ if (is_q) {
+ write_vec_element(s, tcg_resh, rd, 1, MO_64);
+ }
+ tcg_temp_free_i64(tcg_resh);
+ clear_vec_high(s, is_q, rd);
+}
+
+/* TBL/TBX
+ * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0
+ * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd |
+ * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
+ */
+static void disas_simd_tb(DisasContext *s, uint32_t insn)
+{
+ int op2 = extract32(insn, 22, 2);
+ int is_q = extract32(insn, 30, 1);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ int is_tbx = extract32(insn, 12, 1);
+ int len = (extract32(insn, 13, 2) + 1) * 16;
+
+ if (op2 != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rm), cpu_env,
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ (len << 6) | (is_tbx << 5) | rn,
+ gen_helper_simd_tblx);
+}
+
+/* ZIP/UZP/TRN
+ * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0
+ * +---+---+-------------+------+---+------+---+------------------+------+
+ * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd |
+ * +---+---+-------------+------+---+------+---+------------------+------+
+ */
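+/*
+ * For example, on .8B inputs:
+ * ZIP1 gives { Vn[0], Vm[0], Vn[1], Vm[1], Vn[2], Vm[2], Vn[3], Vm[3] }
+ * UZP1 gives { Vn[0], Vn[2], Vn[4], Vn[6], Vm[0], Vm[2], Vm[4], Vm[6] }
+ * TRN1 gives { Vn[0], Vm[0], Vn[2], Vm[2], Vn[4], Vm[4], Vn[6], Vm[6] }
+ * The "2" variants (part == 1) start from the odd elements or the upper
+ * halves instead.
+ */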
+static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ /* opc field bits [1:0] indicate ZIP/UZP/TRN;
+ * bit 2 indicates 1 vs 2 variant of the insn.
+ */
+ int opcode = extract32(insn, 12, 2);
+ bool part = extract32(insn, 14, 1);
+ bool is_q = extract32(insn, 30, 1);
+ int esize = 8 << size;
+ int i, ofs;
+ int datasize = is_q ? 128 : 64;
+ int elements = datasize / esize;
+ TCGv_i64 tcg_res, tcg_resl, tcg_resh;
+
+ if (opcode == 0 || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_resl = tcg_const_i64(0);
+ tcg_resh = is_q ? tcg_const_i64(0) : NULL;
+ tcg_res = tcg_temp_new_i64();
+
+ for (i = 0; i < elements; i++) {
+ switch (opcode) {
+ case 1: /* UZP1/2 */
+ {
+ int midpoint = elements / 2;
+ if (i < midpoint) {
+ read_vec_element(s, tcg_res, rn, 2 * i + part, size);
+ } else {
+ read_vec_element(s, tcg_res, rm,
+ 2 * (i - midpoint) + part, size);
+ }
+ break;
+ }
+ case 2: /* TRN1/2 */
+ if (i & 1) {
+ read_vec_element(s, tcg_res, rm, (i & ~1) + part, size);
+ } else {
+ read_vec_element(s, tcg_res, rn, (i & ~1) + part, size);
+ }
+ break;
+ case 3: /* ZIP1/2 */
+ {
+ int base = part * elements / 2;
+ if (i & 1) {
+ read_vec_element(s, tcg_res, rm, base + (i >> 1), size);
+ } else {
+ read_vec_element(s, tcg_res, rn, base + (i >> 1), size);
+ }
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ ofs = i * esize;
+ if (ofs < 64) {
+ tcg_gen_shli_i64(tcg_res, tcg_res, ofs);
+ tcg_gen_or_i64(tcg_resl, tcg_resl, tcg_res);
+ } else {
+ tcg_gen_shli_i64(tcg_res, tcg_res, ofs - 64);
+ tcg_gen_or_i64(tcg_resh, tcg_resh, tcg_res);
+ }
+ }
+
+ tcg_temp_free_i64(tcg_res);
+
+ write_vec_element(s, tcg_resl, rd, 0, MO_64);
+ tcg_temp_free_i64(tcg_resl);
+
+ if (is_q) {
+ write_vec_element(s, tcg_resh, rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_resh);
+ }
+ clear_vec_high(s, is_q, rd);
+}
+
+/*
+ * do_reduction_op helper
+ *
+ * This mirrors the Reduce() pseudocode in the ARM ARM. It is
+ * important for correct NaN propagation that we do these
+ * operations in exactly the order specified by the pseudocode.
+ *
+ * This is a recursive function; TCG temps should be freed by the
+ * calling function once it is done with the values.
+ */
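+/*
+ * E.g. for an 8-element half-precision reduction the initial vmap is 0xff;
+ * the first split yields vmap_lo == 0x0f (elements 0-3) and vmap_hi == 0xf0
+ * (elements 4-7), and the recursion bottoms out once a single bit is set.
+ */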
+static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
+ int esize, int size, int vmap, TCGv_ptr fpst)
+{
+ if (esize == size) {
+ int element;
+ MemOp msize = esize == 16 ? MO_16 : MO_32;
+ TCGv_i32 tcg_elem;
+
+ /* We should have one register left here */
+ assert(ctpop8(vmap) == 1);
+ element = ctz32(vmap);
+ assert(element < 8);
+
+ tcg_elem = tcg_temp_new_i32();
+ read_vec_element_i32(s, tcg_elem, rn, element, msize);
+ return tcg_elem;
+ } else {
+ int bits = size / 2;
+ int shift = ctpop8(vmap) / 2;
+ int vmap_lo = (vmap >> shift) & vmap;
+ int vmap_hi = (vmap & ~vmap_lo);
+ TCGv_i32 tcg_hi, tcg_lo, tcg_res;
+
+ tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
+ tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
+ tcg_res = tcg_temp_new_i32();
+
+ switch (fpopcode) {
+ case 0x0c: /* fmaxnmv half-precision */
+ gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x0f: /* fmaxv half-precision */
+ gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x1c: /* fminnmv half-precision */
+ gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x1f: /* fminv half-precision */
+ gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x2c: /* fmaxnmv */
+ gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x2f: /* fmaxv */
+ gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x3c: /* fminnmv */
+ gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x3f: /* fminv */
+ gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_temp_free_i32(tcg_hi);
+ tcg_temp_free_i32(tcg_lo);
+ return tcg_res;
+ }
+}
+
+/* AdvSIMD across lanes
+ * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd |
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 5);
+ bool is_q = extract32(insn, 30, 1);
+ bool is_u = extract32(insn, 29, 1);
+ bool is_fp = false;
+ bool is_min = false;
+ int esize;
+ int elements;
+ int i;
+ TCGv_i64 tcg_res, tcg_elt;
+
+ switch (opcode) {
+ case 0x1b: /* ADDV */
+ if (is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x3: /* SADDLV, UADDLV */
+ case 0xa: /* SMAXV, UMAXV */
+ case 0x1a: /* SMINV, UMINV */
+ if (size == 3 || (size == 2 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0xc: /* FMAXNMV, FMINNMV */
+ case 0xf: /* FMAXV, FMINV */
+ /* Bit 1 of size field encodes min vs max and the actual size
+ * depends on the encoding of the U bit. If not set (and FP16
+ * enabled) then we do half-precision float instead of single
+ * precision.
+ */
+ is_min = extract32(size, 1, 1);
+ is_fp = true;
+ if (!is_u && dc_isar_feature(aa64_fp16, s)) {
+ size = 1;
+ } else if (!is_u || !is_q || extract32(size, 0, 1)) {
+ unallocated_encoding(s);
+ return;
+ } else {
+ size = 2;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ esize = 8 << size;
+ elements = (is_q ? 128 : 64) / esize;
+
+ tcg_res = tcg_temp_new_i64();
+ tcg_elt = tcg_temp_new_i64();
+
+ /* These instructions operate across all lanes of a vector
+ * to produce a single result. We can guarantee that a 64
+ * bit intermediate is sufficient:
+ * + for [US]ADDLV the maximum element size is 32 bits, and
+ * the result type is 64 bits
+ * + for FMAX*V, FMIN*V, ADDV the intermediate type is the
+ * same as the element size, which is 32 bits at most
+ * For the integer operations we can choose to work at 64
+ * or 32 bits and truncate at the end; for simplicity
+ * we use 64 bits always. The floating point
+ * ops do require 32 bit intermediates, though.
+ */
+ if (!is_fp) {
+ read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN));
+
+ for (i = 1; i < elements; i++) {
+ read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN));
+
+ switch (opcode) {
+ case 0x03: /* SADDLV / UADDLV */
+ case 0x1b: /* ADDV */
+ tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt);
+ break;
+ case 0x0a: /* SMAXV / UMAXV */
+ if (is_u) {
+ tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt);
+ } else {
+ tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt);
+ }
+ break;
+ case 0x1a: /* SMINV / UMINV */
+ if (is_u) {
+ tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt);
+ } else {
+ tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ }
+ } else {
+ /* Floating point vector reduction ops which work across 32
+ * bit (single) or 16 bit (half-precision) intermediates.
+ * Note that correct NaN propagation requires that we do these
+ * operations in exactly the order specified by the pseudocode.
+ */
+ TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ int fpopcode = opcode | is_min << 4 | is_u << 5;
+ int vmap = (1 << elements) - 1;
+ TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize,
+ (is_q ? 128 : 64), vmap, fpst);
+ tcg_gen_extu_i32_i64(tcg_res, tcg_res32);
+ tcg_temp_free_i32(tcg_res32);
+ tcg_temp_free_ptr(fpst);
+ }
+
+ tcg_temp_free_i64(tcg_elt);
+
+ /* Now truncate the result to the width required for the final output */
+ if (opcode == 0x03) {
+ /* SADDLV, UADDLV: result is 2*esize */
+ size++;
+ }
+
+ switch (size) {
+ case 0:
+ tcg_gen_ext8u_i64(tcg_res, tcg_res);
+ break;
+ case 1:
+ tcg_gen_ext16u_i64(tcg_res, tcg_res);
+ break;
+ case 2:
+ tcg_gen_ext32u_i64(tcg_res, tcg_res);
+ break;
+ case 3:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_dreg(s, rd, tcg_res);
+ tcg_temp_free_i64(tcg_res);
+}
+
+/* DUP (Element, Vector)
+ *
+ * 31 30 29 21 20 16 15 10 9 5 4 0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ */
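+/*
+ * E.g. DUP Vd.4S, Vn.S[2] encodes imm5 as 0b10100: ctz32() gives
+ * size == 2 (32-bit elements) and imm5 >> (size + 1) gives index == 2.
+ */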
+static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
+ int imm5)
+{
+ int size = ctz32(imm5);
+ int index;
+
+ if (size > 3 || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ index = imm5 >> (size + 1);
+ tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd),
+ vec_reg_offset(s, rn, index, size),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* DUP (element, scalar)
+ * 31 21 20 16 15 10 9 5 4 0
+ * +-----------------------+--------+-------------+------+------+
+ * | 0 1 0 1 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd |
+ * +-----------------------+--------+-------------+------+------+
+ */
+static void handle_simd_dupes(DisasContext *s, int rd, int rn,
+ int imm5)
+{
+ int size = ctz32(imm5);
+ int index;
+ TCGv_i64 tmp;
+
+ if (size > 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ index = imm5 >> (size + 1);
+
+ /* This instruction just extracts the specified element and
+ * zero-extends it into the bottom of the destination register.
+ */
+ tmp = tcg_temp_new_i64();
+ read_vec_element(s, tmp, rn, index, size);
+ write_fp_dreg(s, rd, tmp);
+ tcg_temp_free_i64(tmp);
+}
+
+/* DUP (General)
+ *
+ * 31 30 29 21 20 16 15 10 9 5 4 0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 1 1 | Rn | Rd |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ */
+static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn,
+ int imm5)
+{
+ int size = ctz32(imm5);
+ uint32_t dofs, oprsz, maxsz;
+
+ if (size > 3 || ((size == 3) && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ dofs = vec_full_reg_offset(s, rd);
+ oprsz = is_q ? 16 : 8;
+ maxsz = vec_full_reg_size(s);
+
+ tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn));
+}
+
+/* INS (Element)
+ *
+ * 31 21 20 16 15 14 11 10 9 5 4 0
+ * +-----------------------+--------+------------+---+------+------+
+ * | 0 1 1 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd |
+ * +-----------------------+--------+------------+---+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ * index: encoded in imm5<4:size+1>
+ */
+static void handle_simd_inse(DisasContext *s, int rd, int rn,
+ int imm4, int imm5)
+{
+ int size = ctz32(imm5);
+ int src_index, dst_index;
+ TCGv_i64 tmp;
+
+ if (size > 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ dst_index = extract32(imm5, 1+size, 5);
+ src_index = extract32(imm4, size, 4);
+
+ tmp = tcg_temp_new_i64();
+
+ read_vec_element(s, tmp, rn, src_index, size);
+ write_vec_element(s, tmp, rd, dst_index, size);
+
+ tcg_temp_free_i64(tmp);
+
+ /* INS is considered a 128-bit write for SVE. */
+ clear_vec_high(s, true, rd);
+}
+
+
+/* INS (General)
+ *
+ * 31 21 20 16 15 10 9 5 4 0
+ * +-----------------------+--------+-------------+------+------+
+ * | 0 1 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 1 1 1 | Rn | Rd |
+ * +-----------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ * index: encoded in imm5<4:size+1>
+ */
+static void handle_simd_insg(DisasContext *s, int rd, int rn, int imm5)
+{
+ int size = ctz32(imm5);
+ int idx;
+
+ if (size > 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ idx = extract32(imm5, 1 + size, 4 - size);
+ write_vec_element(s, cpu_reg(s, rn), rd, idx, size);
+
+ /* INS is considered a 128-bit write for SVE. */
+ clear_vec_high(s, true, rd);
+}
+
+/*
+ * UMOV (General)
+ * SMOV (General)
+ *
+ * 31 30 29 21 20 16 15 12 10 9 5 4 0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 1 U 1 1 | Rn | Rd |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * U: unsigned when set
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ */
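+/*
+ * E.g. UMOV Wd, Vn.H[3] encodes imm5 as 0b01110: size == 1 (16-bit
+ * element) and the index in imm5<4:size+1> is 3.
+ */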
+static void handle_simd_umov_smov(DisasContext *s, int is_q, int is_signed,
+ int rn, int rd, int imm5)
+{
+ int size = ctz32(imm5);
+ int element;
+ TCGv_i64 tcg_rd;
+
+ /* Check for UnallocatedEncodings */
+ if (is_signed) {
+ if (size > 2 || (size == 2 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else {
+ if (size > 3
+ || (size < 3 && is_q)
+ || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ element = extract32(imm5, 1+size, 4);
+
+ tcg_rd = cpu_reg(s, rd);
+ read_vec_element(s, tcg_rd, rn, element, size | (is_signed ? MO_SIGN : 0));
+ if (is_signed && !is_q) {
+ tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+ }
+}
+
+/* AdvSIMD copy
+ * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0
+ * +---+---+----+-----------------+------+---+------+---+------+------+
+ * | 0 | Q | op | 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd |
+ * +---+---+----+-----------------+------+---+------+---+------+------+
+ */
+static void disas_simd_copy(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int imm4 = extract32(insn, 11, 4);
+ int op = extract32(insn, 29, 1);
+ int is_q = extract32(insn, 30, 1);
+ int imm5 = extract32(insn, 16, 5);
+
+ if (op) {
+ if (is_q) {
+ /* INS (element) */
+ handle_simd_inse(s, rd, rn, imm4, imm5);
+ } else {
+ unallocated_encoding(s);
+ }
+ } else {
+ switch (imm4) {
+ case 0:
+ /* DUP (element - vector) */
+ handle_simd_dupe(s, is_q, rd, rn, imm5);
+ break;
+ case 1:
+ /* DUP (general) */
+ handle_simd_dupg(s, is_q, rd, rn, imm5);
+ break;
+ case 3:
+ if (is_q) {
+ /* INS (general) */
+ handle_simd_insg(s, rd, rn, imm5);
+ } else {
+ unallocated_encoding(s);
+ }
+ break;
+ case 5:
+ case 7:
+ /* UMOV/SMOV (is_q indicates 32/64; imm4 indicates signedness) */
+ handle_simd_umov_smov(s, is_q, (imm4 == 5), rn, rd, imm5);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+ }
+}
+
+/* AdvSIMD modified immediate
+ * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0
+ * +---+---+----+---------------------+-----+-------+----+---+-------+------+
+ * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd |
+ * +---+---+----+---------------------+-----+-------+----+---+-------+------+
+ *
+ * There are a number of operations that can be carried out here:
+ * MOVI - move (shifted) imm into register
+ * MVNI - move inverted (shifted) imm into register
+ * ORR - bitwise OR of (shifted) imm with register
+ * BIC - bitwise clear of (shifted) imm with register
+ * With ARMv8.2 we also have:
+ * FMOV half-precision
+ */
+static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int cmode = extract32(insn, 12, 4);
+ int o2 = extract32(insn, 11, 1);
+ uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5);
+ bool is_neg = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+ uint64_t imm = 0;
+
+ if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) {
+ /* Check for FMOV (vector, immediate) - half-precision */
+ if (!(dc_isar_feature(aa64_fp16, s) && o2 && cmode == 0xf)) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (cmode == 15 && o2 && !is_neg) {
+ /* FMOV (vector, immediate) - half-precision */
+ imm = vfp_expand_imm(MO_16, abcdefgh);
+ /* now duplicate across the lanes */
+ imm = dup_const(MO_16, imm);
+ } else {
+ imm = asimd_imm_const(abcdefgh, cmode, is_neg);
+ }
+
+ if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
+ /* MOVI or MVNI, with MVNI negation handled above. */
+ tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8,
+ vec_full_reg_size(s), imm);
+ } else {
+ /* ORR or BIC, with BIC negation to AND handled above. */
+ if (is_neg) {
+ gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64);
+ } else {
+ gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64);
+ }
+ }
+}
+
+/* AdvSIMD scalar copy
+ * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0
+ * +-----+----+-----------------+------+---+------+---+------+------+
+ * | 0 1 | op | 1 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd |
+ * +-----+----+-----------------+------+---+------+---+------+------+
+ */
+static void disas_simd_scalar_copy(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int imm4 = extract32(insn, 11, 4);
+ int imm5 = extract32(insn, 16, 5);
+ int op = extract32(insn, 29, 1);
+
+ if (op != 0 || imm4 != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* DUP (element, scalar) */
+ handle_simd_dupes(s, rd, rn, imm5);
+}
+
+/* AdvSIMD scalar pairwise
+ * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd |
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
+{
+ int u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ TCGv_ptr fpst;
+
+ /* For some ops (the FP ones), size[1] is part of the encoding.
+ * For ADDP strictly it is not but size[1] is always 1 for valid
+ * encodings.
+ */
+ opcode |= (extract32(size, 1, 1) << 5);
+
+ switch (opcode) {
+ case 0x3b: /* ADDP */
+ if (u || size != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ fpst = NULL;
+ break;
+ case 0xc: /* FMAXNMP */
+ case 0xd: /* FADDP */
+ case 0xf: /* FMAXP */
+ case 0x2c: /* FMINNMP */
+ case 0x2f: /* FMINP */
+ /* FP op, size[0] is 32 or 64 bit */
+ if (!u) {
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ } else {
+ size = MO_16;
+ }
+ } else {
+ size = extract32(size, 0, 1) ? MO_64 : MO_32;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size == MO_64) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, 0, MO_64);
+ read_vec_element(s, tcg_op2, rn, 1, MO_64);
+
+ switch (opcode) {
+ case 0x3b: /* ADDP */
+ tcg_gen_add_i64(tcg_res, tcg_op1, tcg_op2);
+ break;
+ case 0xc: /* FMAXNMP */
+ gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xd: /* FADDP */
+ gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xf: /* FMAXP */
+ gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2c: /* FMINNMP */
+ gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2f: /* FMINP */
+ gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_dreg(s, rd, tcg_res);
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_res);
+ } else {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op1, rn, 0, size);
+ read_vec_element_i32(s, tcg_op2, rn, 1, size);
+
+ if (size == MO_16) {
+ switch (opcode) {
+ case 0xc: /* FMAXNMP */
+ gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xd: /* FADDP */
+ gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xf: /* FMAXP */
+ gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2c: /* FMINNMP */
+ gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2f: /* FMINP */
+ gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ switch (opcode) {
+ case 0xc: /* FMAXNMP */
+ gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xd: /* FADDP */
+ gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xf: /* FMAXP */
+ gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2c: /* FMINNMP */
+ gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x2f: /* FMINP */
+ gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_res);
+ }
+
+ if (fpst) {
+ tcg_temp_free_ptr(fpst);
+ }
+}
+
+/*
+ * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
+ *
+ * This code handles the common shift logic and is used by both
+ * the vector and scalar code.
+ */
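+/*
+ * E.g. a rounding shift right (SRSHR/URSHR) by 2 adds 1 << 1 before
+ * shifting, so an input of 7 becomes (7 + 2) >> 2 == 2 rather than 1.
+ * For 64-bit elements that addition can overflow, hence the 128-bit
+ * (tcg_src_hi) path below.
+ */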
+static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ TCGv_i64 tcg_rnd, bool accumulate,
+ bool is_u, int size, int shift)
+{
+ bool extended_result = false;
+ bool round = tcg_rnd != NULL;
+ int ext_lshift = 0;
+ TCGv_i64 tcg_src_hi;
+
+ if (round && size == 3) {
+ extended_result = true;
+ ext_lshift = 64 - shift;
+ tcg_src_hi = tcg_temp_new_i64();
+ } else if (shift == 64) {
+ if (!accumulate && is_u) {
+ /* result is zero */
+ tcg_gen_movi_i64(tcg_res, 0);
+ return;
+ }
+ }
+
+ /* Deal with the rounding step */
+ if (round) {
+ if (extended_result) {
+ TCGv_i64 tcg_zero = tcg_constant_i64(0);
+ if (!is_u) {
+ /* take care of sign extending tcg_res */
+ tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_src_hi,
+ tcg_rnd, tcg_zero);
+ } else {
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_zero,
+ tcg_rnd, tcg_zero);
+ }
+ } else {
+ tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+ }
+ }
+
+ /* Now do the shift right */
+ if (round && extended_result) {
+ /* extended case, >64 bit precision required */
+ if (ext_lshift == 0) {
+ /* special case, only high bits matter */
+ tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
+ tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+ }
+ } else {
+ if (is_u) {
+ if (shift == 64) {
+ /* essentially shifting in 64 zeros */
+ tcg_gen_movi_i64(tcg_src, 0);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ }
+ } else {
+ if (shift == 64) {
+ /* effectively extending the sign-bit */
+ tcg_gen_sari_i64(tcg_src, tcg_src, 63);
+ } else {
+ tcg_gen_sari_i64(tcg_src, tcg_src, shift);
+ }
+ }
+ }
+
+ if (accumulate) {
+ tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
+ } else {
+ tcg_gen_mov_i64(tcg_res, tcg_src);
+ }
+
+ if (extended_result) {
+ tcg_temp_free_i64(tcg_src_hi);
+ }
+}
+
+/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
+static void handle_scalar_simd_shri(DisasContext *s,
+ bool is_u, int immh, int immb,
+ int opcode, int rn, int rd)
+{
+ const int size = 3;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
+ bool accumulate = false;
+ bool round = false;
+ bool insert = false;
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rd;
+ TCGv_i64 tcg_round;
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ accumulate = true;
+ break;
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ round = true;
+ break;
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ accumulate = round = true;
+ break;
+ case 0x08: /* SRI */
+ insert = true;
+ break;
+ }
+
+ if (round) {
+ tcg_round = tcg_constant_i64(1ULL << (shift - 1));
+ } else {
+ tcg_round = NULL;
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ if (insert) {
+ /* A shift count equal to the element size is valid but does nothing;
+ * special-case it to avoid a potential shift by 64.
+ */
+ int esize = 8 << size;
+ if (shift != esize) {
+ tcg_gen_shri_i64(tcg_rn, tcg_rn, shift);
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift);
+ }
+ } else {
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
+ }
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+}
+
+/* SHL/SLI - Scalar shift left */
+static void handle_scalar_simd_shli(DisasContext *s, bool insert,
+ int immh, int immb, int opcode,
+ int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rd;
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ if (insert) {
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift);
+ } else {
+ tcg_gen_shli_i64(tcg_rd, tcg_rn, shift);
+ }
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+}
+
+/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with
+ * (signed/unsigned) narrowing */
+static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
+ bool is_u_shift, bool is_u_narrow,
+ int immh, int immb, int opcode,
+ int rn, int rd)
+{
+ int immhb = immh << 3 | immb;
+ int size = 32 - clz32(immh) - 1;
+ int esize = 8 << size;
+ int shift = (2 * esize) - immhb;
+ int elements = is_scalar ? 1 : (64 / esize);
+ bool round = extract32(opcode, 0, 1);
+ MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN);
+ TCGv_i64 tcg_rn, tcg_rd, tcg_round;
+ TCGv_i32 tcg_rd_narrowed;
+ TCGv_i64 tcg_final;
+
+ static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = {
+ { gen_helper_neon_narrow_sat_s8,
+ gen_helper_neon_unarrow_sat8 },
+ { gen_helper_neon_narrow_sat_s16,
+ gen_helper_neon_unarrow_sat16 },
+ { gen_helper_neon_narrow_sat_s32,
+ gen_helper_neon_unarrow_sat32 },
+ { NULL, NULL },
+ };
+ static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = {
+ gen_helper_neon_narrow_sat_u8,
+ gen_helper_neon_narrow_sat_u16,
+ gen_helper_neon_narrow_sat_u32,
+ NULL
+ };
+ NeonGenNarrowEnvFn *narrowfn;
+
+ int i;
+
+ assert(size < 4);
+
+ if (extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (is_u_shift) {
+ narrowfn = unsigned_narrow_fns[size];
+ } else {
+ narrowfn = signed_narrow_fns[size][is_u_narrow ? 1 : 0];
+ }
+
+ tcg_rn = tcg_temp_new_i64();
+ tcg_rd = tcg_temp_new_i64();
+ tcg_rd_narrowed = tcg_temp_new_i32();
+ tcg_final = tcg_const_i64(0);
+
+ if (round) {
+ tcg_round = tcg_constant_i64(1ULL << (shift - 1));
+ } else {
+ tcg_round = NULL;
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, ldop);
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ false, is_u_shift, size+1, shift);
+ narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd);
+ tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed);
+ tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
+ }
+
+ if (!is_q) {
+ write_vec_element(s, tcg_final, rd, 0, MO_64);
+ } else {
+ write_vec_element(s, tcg_final, rd, 1, MO_64);
+ }
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+ tcg_temp_free_i32(tcg_rd_narrowed);
+ tcg_temp_free_i64(tcg_final);
+
+ clear_vec_high(s, is_q, rd);
+}
+
+/* SQSHLU, UQSHL, SQSHL: saturating left shifts */
+static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q,
+ bool src_unsigned, bool dst_unsigned,
+ int immh, int immb, int rn, int rd)
+{
+ int immhb = immh << 3 | immb;
+ int size = 32 - clz32(immh) - 1;
+ int shift = immhb - (8 << size);
+ int pass;
+
+ assert(immh != 0);
+ assert(!(scalar && is_q));
+
+ if (!scalar) {
+ if (!is_q && extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Since we use the variable-shift helpers we must
+ * replicate the shift count into each element of
+ * the tcg_shift value.
+ */
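+ /* E.g. size 0 (bytes) with a shift of 3 becomes 0x03030303. */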
+ switch (size) {
+ case 0:
+ shift |= shift << 8;
+ /* fall through */
+ case 1:
+ shift |= shift << 16;
+ break;
+ case 2:
+ case 3:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (size == 3) {
+ TCGv_i64 tcg_shift = tcg_constant_i64(shift);
+ static NeonGenTwo64OpEnvFn * const fns[2][2] = {
+ { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 },
+ { NULL, gen_helper_neon_qshl_u64 },
+ };
+ NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned];
+ int maxpass = is_q ? 2 : 1;
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ genfn(tcg_op, cpu_env, tcg_op, tcg_shift);
+ write_vec_element(s, tcg_op, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_op);
+ }
+ clear_vec_high(s, is_q, rd);
+ } else {
+ TCGv_i32 tcg_shift = tcg_constant_i32(shift);
+ static NeonGenTwoOpEnvFn * const fns[2][2][3] = {
+ {
+ { gen_helper_neon_qshl_s8,
+ gen_helper_neon_qshl_s16,
+ gen_helper_neon_qshl_s32 },
+ { gen_helper_neon_qshlu_s8,
+ gen_helper_neon_qshlu_s16,
+ gen_helper_neon_qshlu_s32 }
+ }, {
+ { NULL, NULL, NULL },
+ { gen_helper_neon_qshl_u8,
+ gen_helper_neon_qshl_u16,
+ gen_helper_neon_qshl_u32 }
+ }
+ };
+ NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size];
+ MemOp memop = scalar ? size : MO_32;
+ int maxpass = scalar ? 1 : is_q ? 4 : 2;
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, pass, memop);
+ genfn(tcg_op, cpu_env, tcg_op, tcg_shift);
+ if (scalar) {
+ switch (size) {
+ case 0:
+ tcg_gen_ext8u_i32(tcg_op, tcg_op);
+ break;
+ case 1:
+ tcg_gen_ext16u_i32(tcg_op, tcg_op);
+ break;
+ case 2:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ write_fp_sreg(s, rd, tcg_op);
+ } else {
+ write_vec_element_i32(s, tcg_op, rd, pass, MO_32);
+ }
+
+ tcg_temp_free_i32(tcg_op);
+ }
+
+ if (!scalar) {
+ clear_vec_high(s, is_q, rd);
+ }
+ }
+}
+
+/* Common vector code for handling integer to FP conversion */
+static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
+ int elements, int is_signed,
+ int fracbits, int size)
+{
+ TCGv_ptr tcg_fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ TCGv_i32 tcg_shift = NULL;
+
+ MemOp mop = size | (is_signed ? MO_SIGN : 0);
+ int pass;
+
+ if (fracbits || size == MO_64) {
+ tcg_shift = tcg_constant_i32(fracbits);
+ }
+
+ if (size == MO_64) {
+ TCGv_i64 tcg_int64 = tcg_temp_new_i64();
+ TCGv_i64 tcg_double = tcg_temp_new_i64();
+
+ for (pass = 0; pass < elements; pass++) {
+ read_vec_element(s, tcg_int64, rn, pass, mop);
+
+ if (is_signed) {
+ gen_helper_vfp_sqtod(tcg_double, tcg_int64,
+ tcg_shift, tcg_fpst);
+ } else {
+ gen_helper_vfp_uqtod(tcg_double, tcg_int64,
+ tcg_shift, tcg_fpst);
+ }
+ if (elements == 1) {
+ write_fp_dreg(s, rd, tcg_double);
+ } else {
+ write_vec_element(s, tcg_double, rd, pass, MO_64);
+ }
+ }
+
+ tcg_temp_free_i64(tcg_int64);
+ tcg_temp_free_i64(tcg_double);
+
+ } else {
+ TCGv_i32 tcg_int32 = tcg_temp_new_i32();
+ TCGv_i32 tcg_float = tcg_temp_new_i32();
+
+ for (pass = 0; pass < elements; pass++) {
+ read_vec_element_i32(s, tcg_int32, rn, pass, mop);
+
+ switch (size) {
+ case MO_32:
+ if (fracbits) {
+ if (is_signed) {
+ gen_helper_vfp_sltos(tcg_float, tcg_int32,
+ tcg_shift, tcg_fpst);
+ } else {
+ gen_helper_vfp_ultos(tcg_float, tcg_int32,
+ tcg_shift, tcg_fpst);
+ }
+ } else {
+ if (is_signed) {
+ gen_helper_vfp_sitos(tcg_float, tcg_int32, tcg_fpst);
+ } else {
+ gen_helper_vfp_uitos(tcg_float, tcg_int32, tcg_fpst);
+ }
+ }
+ break;
+ case MO_16:
+ if (fracbits) {
+ if (is_signed) {
+ gen_helper_vfp_sltoh(tcg_float, tcg_int32,
+ tcg_shift, tcg_fpst);
+ } else {
+ gen_helper_vfp_ultoh(tcg_float, tcg_int32,
+ tcg_shift, tcg_fpst);
+ }
+ } else {
+ if (is_signed) {
+ gen_helper_vfp_sitoh(tcg_float, tcg_int32, tcg_fpst);
+ } else {
+ gen_helper_vfp_uitoh(tcg_float, tcg_int32, tcg_fpst);
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (elements == 1) {
+ write_fp_sreg(s, rd, tcg_float);
+ } else {
+ write_vec_element_i32(s, tcg_float, rd, pass, size);
+ }
+ }
+
+ tcg_temp_free_i32(tcg_int32);
+ tcg_temp_free_i32(tcg_float);
+ }
+
+ tcg_temp_free_ptr(tcg_fpst);
+
+ clear_vec_high(s, elements << size == 16, rd);
+}
+
+/* UCVTF/SCVTF - Integer to FP conversion */
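+/*
+ * E.g. for the vector form SCVTF Vd.4S, Vn.4S, #8: immh:immb == 56,
+ * so size == MO_32 and fracbits == 64 - 56 == 8; each 32-bit element
+ * is converted and then scaled by 2^-8.
+ */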
+static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar,
+ bool is_q, bool is_u,
+ int immh, int immb, int opcode,
+ int rn, int rd)
+{
+ int size, elements, fracbits;
+ int immhb = immh << 3 | immb;
+
+ if (immh & 8) {
+ size = MO_64;
+ if (!is_scalar && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else if (immh & 4) {
+ size = MO_32;
+ } else if (immh & 2) {
+ size = MO_16;
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else {
+ /* immh == 0 would be a failure of the decode logic */
+ g_assert(immh == 1);
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (is_scalar) {
+ elements = 1;
+ } else {
+ elements = (8 << is_q) >> size;
+ }
+ fracbits = (16 << size) - immhb;
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size);
+}
+
+/* FCVTZS, FCVTZU - FP to fixed-point conversion */
+static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar,
+ bool is_q, bool is_u,
+ int immh, int immb, int rn, int rd)
+{
+ int immhb = immh << 3 | immb;
+ int pass, size, fracbits;
+ TCGv_ptr tcg_fpstatus;
+ TCGv_i32 tcg_rmode, tcg_shift;
+
+ if (immh & 0x8) {
+ size = MO_64;
+ if (!is_scalar && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else if (immh & 0x4) {
+ size = MO_32;
+ } else if (immh & 0x2) {
+ size = MO_16;
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else {
+ /* Should have split out AdvSIMD modified immediate earlier. */
+ assert(immh == 1);
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ assert(!(is_scalar && is_q));
+
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO));
+ tcg_fpstatus = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ fracbits = (16 << size) - immhb;
+ tcg_shift = tcg_constant_i32(fracbits);
+
+ if (size == MO_64) {
+ int maxpass = is_scalar ? 1 : 2;
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ if (is_u) {
+ gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+ } else {
+ gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+ }
+ write_vec_element(s, tcg_op, rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_op);
+ }
+ clear_vec_high(s, is_q, rd);
+ } else {
+ void (*fn)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
+ int maxpass = is_scalar ? 1 : ((8 << is_q) >> size);
+
+ switch (size) {
+ case MO_16:
+ if (is_u) {
+ fn = gen_helper_vfp_touhh;
+ } else {
+ fn = gen_helper_vfp_toshh;
+ }
+ break;
+ case MO_32:
+ if (is_u) {
+ fn = gen_helper_vfp_touls;
+ } else {
+ fn = gen_helper_vfp_tosls;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, pass, size);
+ fn(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+ if (is_scalar) {
+ write_fp_sreg(s, rd, tcg_op);
+ } else {
+ write_vec_element_i32(s, tcg_op, rd, pass, size);
+ }
+ tcg_temp_free_i32(tcg_op);
+ }
+ if (!is_scalar) {
+ clear_vec_high(s, is_q, rd);
+ }
+ }
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ tcg_temp_free_ptr(tcg_fpstatus);
+ tcg_temp_free_i32(tcg_rmode);
+}
+
+/* AdvSIMD scalar shift by immediate
+ * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
+ * +-----+---+-------------+------+------+--------+---+------+------+
+ * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd |
+ * +-----+---+-------------+------+------+--------+---+------+------+
+ *
+ * This is the scalar version, so it works on fixed-size registers
+ */
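+/*
+ * The shift amount is folded into immh:immb: for the right shifts it is
+ * (2 * esize) - immh:immb (e.g. USHR Dd, Dn, #3 has immh:immb == 125),
+ * and for the left shifts it is immh:immb - esize (e.g. SHL Dd, Dn, #3
+ * has immh:immb == 67).
+ */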
+static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int immb = extract32(insn, 16, 3);
+ int immh = extract32(insn, 19, 4);
+ bool is_u = extract32(insn, 29, 1);
+
+ if (immh == 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x08: /* SRI */
+ if (!is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA */
+ case 0x04: /* SRSHR / URSHR */
+ case 0x06: /* SRSRA / URSRA */
+ handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x0a: /* SHL / SLI */
+ handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x1c: /* SCVTF, UCVTF */
+ handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb,
+ opcode, rn, rd);
+ break;
+ case 0x10: /* SQSHRUN, SQSHRUN2 */
+ case 0x11: /* SQRSHRUN, SQRSHRUN2 */
+ if (!is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_vec_simd_sqshrn(s, true, false, false, true,
+ immh, immb, opcode, rn, rd);
+ break;
+ case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */
+ case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */
+ handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
+ immh, immb, opcode, rn, rd);
+ break;
+ case 0xc: /* SQSHLU */
+ if (!is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd);
+ break;
+ case 0xe: /* SQSHL, UQSHL */
+ handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd);
+ break;
+ case 0x1f: /* FCVTZS, FCVTZU */
+ handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd);
+ break;
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* AdvSIMD scalar three different
+ * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
+ * +-----+---+-----------+------+---+------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd |
+ * +-----+---+-----------+------+---+------+--------+-----+------+------+
+ */
+static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
+{
+ bool is_u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 4);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ if (is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x9: /* SQDMLAL, SQDMLAL2 */
+ case 0xb: /* SQDMLSL, SQDMLSL2 */
+ case 0xd: /* SQDMULL, SQDMULL2 */
+ if (size == 0 || size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (size == 2) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN);
+ read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN);
+
+ tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2);
+ gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, tcg_res, tcg_res);
+
+ switch (opcode) {
+ case 0xd: /* SQDMULL, SQDMULL2 */
+ break;
+ case 0xb: /* SQDMLSL, SQDMLSL2 */
+ tcg_gen_neg_i64(tcg_res, tcg_res);
+ /* fall through */
+ case 0x9: /* SQDMLAL, SQDMLAL2 */
+ read_vec_element(s, tcg_op1, rd, 0, MO_64);
+ gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env,
+ tcg_res, tcg_op1);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_dreg(s, rd, tcg_res);
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_res);
+ } else {
+ TCGv_i32 tcg_op1 = read_fp_hreg(s, rn);
+ TCGv_i32 tcg_op2 = read_fp_hreg(s, rm);
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2);
+ gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, tcg_res, tcg_res);
+
+ switch (opcode) {
+ case 0xd: /* SQDMULL, SQDMULL2 */
+ break;
+ case 0xb: /* SQDMLSL, SQDMLSL2 */
+ gen_helper_neon_negl_u32(tcg_res, tcg_res);
+ /* fall through */
+ case 0x9: /* SQDMLAL, SQDMLAL2 */
+ {
+ TCGv_i64 tcg_op3 = tcg_temp_new_i64();
+ read_vec_element(s, tcg_op3, rd, 0, MO_32);
+ gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env,
+ tcg_res, tcg_op3);
+ tcg_temp_free_i64(tcg_op3);
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_gen_ext32u_i64(tcg_res, tcg_res);
+ write_fp_dreg(s, rd, tcg_res);
+
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i64(tcg_res);
+ }
+}
+
+static void handle_3same_64(DisasContext *s, int opcode, bool u,
+ TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm)
+{
+ /* Handle 64x64->64 opcodes which are shared between the scalar
+ * and vector 3-same groups. We cover every opcode where size == 3
+ * is valid in either the three-reg-same (integer, not pairwise)
+ * or scalar-three-reg-same groups.
+ */
+ TCGCond cond;
+
+ switch (opcode) {
+ case 0x1: /* SQADD */
+ if (u) {
+ gen_helper_neon_qadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ } else {
+ gen_helper_neon_qadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0x5: /* SQSUB */
+ if (u) {
+ gen_helper_neon_qsub_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ } else {
+ gen_helper_neon_qsub_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0x6: /* CMGT, CMHI */
+ /* 64 bit integer comparison, result = test ? (2^64 - 1) : 0.
+ * We implement this using setcond (test) and then negating.
+ */
+ cond = u ? TCG_COND_GTU : TCG_COND_GT;
+ do_cmop:
+ tcg_gen_setcond_i64(cond, tcg_rd, tcg_rn, tcg_rm);
+ tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ break;
+ case 0x7: /* CMGE, CMHS */
+ cond = u ? TCG_COND_GEU : TCG_COND_GE;
+ goto do_cmop;
+ case 0x11: /* CMTST, CMEQ */
+ if (u) {
+ cond = TCG_COND_EQ;
+ goto do_cmop;
+ }
+ gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm);
+ break;
+ case 0x8: /* SSHL, USHL */
+ if (u) {
+ gen_ushl_i64(tcg_rd, tcg_rn, tcg_rm);
+ } else {
+ gen_sshl_i64(tcg_rd, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0x9: /* SQSHL, UQSHL */
+ if (u) {
+ gen_helper_neon_qshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ } else {
+ gen_helper_neon_qshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0xa: /* SRSHL, URSHL */
+ if (u) {
+ gen_helper_neon_rshl_u64(tcg_rd, tcg_rn, tcg_rm);
+ } else {
+ gen_helper_neon_rshl_s64(tcg_rd, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0xb: /* SQRSHL, UQRSHL */
+ if (u) {
+ gen_helper_neon_qrshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ } else {
+ gen_helper_neon_qrshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0x10: /* ADD, SUB */
+ if (u) {
+ tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm);
+ } else {
+ tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Handle the 3-same-operands float operations; shared by the scalar
+ * and vector encodings. The caller must filter out any encodings
+ * not allocated for the encoding it is dealing with.
+ */
+static void handle_3same_float(DisasContext *s, int size, int elements,
+ int fpopcode, int rd, int rn, int rm)
+{
+ int pass;
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+
+ for (pass = 0; pass < elements; pass++) {
+ if (size) {
+ /* Double */
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ switch (fpopcode) {
+ case 0x39: /* FMLS */
+ /* As usual for ARM, separate negation for fused multiply-add */
+ gen_helper_vfp_negd(tcg_op1, tcg_op1);
+ /* fall through */
+ case 0x19: /* FMLA */
+ read_vec_element(s, tcg_res, rd, pass, MO_64);
+ gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2,
+ tcg_res, fpst);
+ break;
+ case 0x18: /* FMAXNM */
+ gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1a: /* FADD */
+ gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1b: /* FMULX */
+ gen_helper_vfp_mulxd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1c: /* FCMEQ */
+ gen_helper_neon_ceq_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1e: /* FMAX */
+ gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1f: /* FRECPS */
+ gen_helper_recpsf_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x38: /* FMINNM */
+ gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3a: /* FSUB */
+ gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3e: /* FMIN */
+ gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3f: /* FRSQRTS */
+ gen_helper_rsqrtsf_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5b: /* FMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5c: /* FCMGE */
+ gen_helper_neon_cge_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5d: /* FACGE */
+ gen_helper_neon_acge_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5f: /* FDIV */
+ gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7a: /* FABD */
+ gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_absd(tcg_res, tcg_res);
+ break;
+ case 0x7c: /* FCMGT */
+ gen_helper_neon_cgt_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7d: /* FACGT */
+ gen_helper_neon_acgt_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ } else {
+ /* Single */
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
+
+ switch (fpopcode) {
+ case 0x39: /* FMLS */
+ /* As usual for ARM, separate negation for fused multiply-add */
+ gen_helper_vfp_negs(tcg_op1, tcg_op1);
+ /* fall through */
+ case 0x19: /* FMLA */
+ read_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+ gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2,
+ tcg_res, fpst);
+ break;
+ case 0x1a: /* FADD */
+ gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1b: /* FMULX */
+ gen_helper_vfp_mulxs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1c: /* FCMEQ */
+ gen_helper_neon_ceq_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1e: /* FMAX */
+ gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1f: /* FRECPS */
+ gen_helper_recpsf_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x18: /* FMAXNM */
+ gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x38: /* FMINNM */
+ gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3a: /* FSUB */
+ gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3e: /* FMIN */
+ gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3f: /* FRSQRTS */
+ gen_helper_rsqrtsf_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5b: /* FMUL */
+ gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5c: /* FCMGE */
+ gen_helper_neon_cge_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5d: /* FACGE */
+ gen_helper_neon_acge_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5f: /* FDIV */
+ gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7a: /* FABD */
+ gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_abss(tcg_res, tcg_res);
+ break;
+ case 0x7c: /* FCMGT */
+ gen_helper_neon_cgt_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7d: /* FACGT */
+ gen_helper_neon_acgt_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (elements == 1) {
+ /* scalar single so clear high part */
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
+ write_vec_element(s, tcg_tmp, rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_tmp);
+ } else {
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+ }
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+ }
+
+ tcg_temp_free_ptr(fpst);
+
+ clear_vec_high(s, elements * (size ? 8 : 4) > 8, rd);
+}
+
+/* AdvSIMD scalar three same
+ * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0
+ * +-----+---+-----------+------+---+------+--------+---+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 1 | Rn | Rd |
+ * +-----+---+-----------+------+---+------+--------+---+------+------+
+ */
+static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ TCGv_i64 tcg_rd;
+
+ if (opcode >= 0x18) {
+ /* Floating point: U, size[1] and opcode indicate operation */
+ int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6);
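+ /* For example, FABD has U=1, size=1x and opcode 0x1a, giving fpopcode 0x7a. */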
+ switch (fpopcode) {
+ case 0x1b: /* FMULX */
+ case 0x1f: /* FRECPS */
+ case 0x3f: /* FRSQRTS */
+ case 0x5d: /* FACGE */
+ case 0x7d: /* FACGT */
+ case 0x1c: /* FCMEQ */
+ case 0x5c: /* FCMGE */
+ case 0x7c: /* FCMGT */
+ case 0x7a: /* FABD */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode, rd, rn, rm);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x1: /* SQADD, UQADD */
+ case 0x5: /* SQSUB, UQSUB */
+ case 0x9: /* SQSHL, UQSHL */
+ case 0xb: /* SQRSHL, UQRSHL */
+ break;
+ case 0x8: /* SSHL, USHL */
+ case 0xa: /* SRSHL, URSHL */
+ case 0x6: /* CMGT, CMHI */
+ case 0x7: /* CMGE, CMHS */
+ case 0x11: /* CMTST, CMEQ */
+ case 0x10: /* ADD, SUB (vector) */
+ if (size != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x16: /* SQDMULH, SQRDMULH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_rd = tcg_temp_new_i64();
+
+ if (size == 3) {
+ TCGv_i64 tcg_rn = read_fp_dreg(s, rn);
+ TCGv_i64 tcg_rm = read_fp_dreg(s, rm);
+
+ handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm);
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rm);
+ } else {
+ /* Do a single operation on the lowest element in the vector.
+ * We use the standard Neon helpers and rely on 0 OP 0 == 0 with
+ * no side effects for all these operations.
+ * OPTME: special-purpose helpers would avoid doing some
+ * unnecessary work in the helper for the 8 and 16 bit cases.
+ */
+ NeonGenTwoOpEnvFn *genenvfn;
+ TCGv_i32 tcg_rn = tcg_temp_new_i32();
+ TCGv_i32 tcg_rm = tcg_temp_new_i32();
+ TCGv_i32 tcg_rd32 = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_rn, rn, 0, size);
+ read_vec_element_i32(s, tcg_rm, rm, 0, size);
+
+ switch (opcode) {
+ case 0x1: /* SQADD, UQADD */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qadd_s8, gen_helper_neon_qadd_u8 },
+ { gen_helper_neon_qadd_s16, gen_helper_neon_qadd_u16 },
+ { gen_helper_neon_qadd_s32, gen_helper_neon_qadd_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0x5: /* SQSUB, UQSUB */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qsub_s8, gen_helper_neon_qsub_u8 },
+ { gen_helper_neon_qsub_s16, gen_helper_neon_qsub_u16 },
+ { gen_helper_neon_qsub_s32, gen_helper_neon_qsub_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0x9: /* SQSHL, UQSHL */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 },
+ { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 },
+ { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0xb: /* SQRSHL, UQRSHL */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 },
+ { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 },
+ { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0x16: /* SQDMULH, SQRDMULH */
+ {
+ static NeonGenTwoOpEnvFn * const fns[2][2] = {
+ { gen_helper_neon_qdmulh_s16, gen_helper_neon_qrdmulh_s16 },
+ { gen_helper_neon_qdmulh_s32, gen_helper_neon_qrdmulh_s32 },
+ };
+ assert(size == 1 || size == 2);
+ genenvfn = fns[size - 1][u];
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ genenvfn(tcg_rd32, cpu_env, tcg_rn, tcg_rm);
+ tcg_gen_extu_i32_i64(tcg_rd, tcg_rd32);
+ tcg_temp_free_i32(tcg_rd32);
+ tcg_temp_free_i32(tcg_rn);
+ tcg_temp_free_i32(tcg_rm);
+ }
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rd);
+}
+
+/* AdvSIMD scalar three same FP16
+ * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0
+ * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+
+ * | 0 1 | U | 1 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd |
+ * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+
+ * v: 0101 1110 0100 0000 0000 0100 0000 0000 => 5e400400
+ * m: 1101 1111 0110 0000 1100 0100 0000 0000 => df60c400
+ */
+static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s,
+ uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 3);
+ int rm = extract32(insn, 16, 5);
+ bool u = extract32(insn, 29, 1);
+ bool a = extract32(insn, 23, 1);
+ int fpopcode = opcode | (a << 3) | (u << 4);
+ TCGv_ptr fpst;
+ TCGv_i32 tcg_op1;
+ TCGv_i32 tcg_op2;
+ TCGv_i32 tcg_res;
+
+ switch (fpopcode) {
+ case 0x03: /* FMULX */
+ case 0x04: /* FCMEQ (reg) */
+ case 0x07: /* FRECPS */
+ case 0x0f: /* FRSQRTS */
+ case 0x14: /* FCMGE (reg) */
+ case 0x15: /* FACGE */
+ case 0x1a: /* FABD */
+ case 0x1c: /* FCMGT (reg) */
+ case 0x1d: /* FACGT */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ tcg_op1 = read_fp_hreg(s, rn);
+ tcg_op2 = read_fp_hreg(s, rm);
+ tcg_res = tcg_temp_new_i32();
+
+ switch (fpopcode) {
+ case 0x03: /* FMULX */
+ gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x04: /* FCMEQ (reg) */
+ gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x07: /* FRECPS */
+ gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x0f: /* FRSQRTS */
+ gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x14: /* FCMGE (reg) */
+ gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x15: /* FACGE */
+ gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1a: /* FABD */
+ gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);
+ tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff);
+ break;
+ case 0x1c: /* FCMGT (reg) */
+ gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1d: /* FACGT */
+ gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_ptr(fpst);
+}
+
+/* AdvSIMD scalar three same extra
+ * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0
+ * +-----+---+-----------+------+---+------+---+--------+---+----+----+
+ * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd |
+ * +-----+---+-----------+------+---+------+---+--------+---+----+----+
+ */
+static void disas_simd_scalar_three_reg_same_extra(DisasContext *s,
+ uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 4);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ TCGv_i32 ele1, ele2, ele3;
+ TCGv_i64 res;
+ bool feature;
+
+ switch (u * 16 + opcode) {
+ case 0x10: /* SQRDMLAH (vector) */
+ case 0x11: /* SQRDMLSH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_rdm, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ /* Do a single operation on the lowest element in the vector.
+ * We use the standard Neon helpers and rely on 0 OP 0 == 0
+ * with no side effects for all these operations.
+ * OPTME: special-purpose helpers would avoid doing some
+ * unnecessary work in the helper for the 16 bit cases.
+ */
+ ele1 = tcg_temp_new_i32();
+ ele2 = tcg_temp_new_i32();
+ ele3 = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, ele1, rn, 0, size);
+ read_vec_element_i32(s, ele2, rm, 0, size);
+ read_vec_element_i32(s, ele3, rd, 0, size);
+
+ switch (opcode) {
+ case 0x0: /* SQRDMLAH */
+ if (size == 1) {
+ gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3);
+ } else {
+ gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3);
+ }
+ break;
+ case 0x1: /* SQRDMLSH */
+ if (size == 1) {
+ gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3);
+ } else {
+ gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i32(ele1);
+ tcg_temp_free_i32(ele2);
+
+ res = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(res, ele3);
+ tcg_temp_free_i32(ele3);
+
+ write_fp_dreg(s, rd, res);
+ tcg_temp_free_i64(res);
+}
+
+static void handle_2misc_64(DisasContext *s, int opcode, bool u,
+ TCGv_i64 tcg_rd, TCGv_i64 tcg_rn,
+ TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus)
+{
+ /* Handle 64->64 opcodes which are shared between the scalar and
+ * vector 2-reg-misc groups. We cover every integer opcode where size == 3
+ * is valid in either group and also the double-precision fp ops.
+ * The caller need only provide tcg_rmode and tcg_fpstatus if the op
+ * requires them.
+ */
+ TCGCond cond;
+
+ switch (opcode) {
+ case 0x4: /* CLS, CLZ */
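+ /* Passing 64 as the final argument makes CLZ of a zero input return 64. */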
+ if (u) {
+ tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64);
+ } else {
+ tcg_gen_clrsb_i64(tcg_rd, tcg_rn);
+ }
+ break;
+ case 0x5: /* NOT */
+ /* This opcode is shared with CNT and RBIT but we have earlier
+ * enforced that size == 3 if and only if this is the NOT insn.
+ */
+ tcg_gen_not_i64(tcg_rd, tcg_rn);
+ break;
+ case 0x7: /* SQABS, SQNEG */
+ if (u) {
+ gen_helper_neon_qneg_s64(tcg_rd, cpu_env, tcg_rn);
+ } else {
+ gen_helper_neon_qabs_s64(tcg_rd, cpu_env, tcg_rn);
+ }
+ break;
+ case 0xa: /* CMLT */
+ /* 64 bit integer comparison against zero, result is
+ * test ? (2^64 - 1) : 0. We implement this using setcond (test)
+ * and then negating.
+ */
+ cond = TCG_COND_LT;
+ do_cmop:
+ tcg_gen_setcondi_i64(cond, tcg_rd, tcg_rn, 0);
+ tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ break;
+ case 0x8: /* CMGT, CMGE */
+ cond = u ? TCG_COND_GE : TCG_COND_GT;
+ goto do_cmop;
+ case 0x9: /* CMEQ, CMLE */
+ cond = u ? TCG_COND_LE : TCG_COND_EQ;
+ goto do_cmop;
+ case 0xb: /* ABS, NEG */
+ if (u) {
+ tcg_gen_neg_i64(tcg_rd, tcg_rn);
+ } else {
+ tcg_gen_abs_i64(tcg_rd, tcg_rn);
+ }
+ break;
+ case 0x2f: /* FABS */
+ gen_helper_vfp_absd(tcg_rd, tcg_rn);
+ break;
+ case 0x6f: /* FNEG */
+ gen_helper_vfp_negd(tcg_rd, tcg_rn);
+ break;
+ case 0x7f: /* FSQRT */
+ gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env);
+ break;
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x1c: /* FCVTAS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus);
+ break;
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x5c: /* FCVTAU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus);
+ break;
+ case 0x18: /* FRINTN */
+ case 0x19: /* FRINTM */
+ case 0x38: /* FRINTP */
+ case 0x39: /* FRINTZ */
+ case 0x58: /* FRINTA */
+ case 0x79: /* FRINTI */
+ gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus);
+ break;
+ case 0x59: /* FRINTX */
+ gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus);
+ break;
+ case 0x1e: /* FRINT32Z */
+ case 0x5e: /* FRINT32X */
+ gen_helper_frint32_d(tcg_rd, tcg_rn, tcg_fpstatus);
+ break;
+ case 0x1f: /* FRINT64Z */
+ case 0x5f: /* FRINT64X */
+ gen_helper_frint64_d(tcg_rd, tcg_rn, tcg_fpstatus);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void handle_2misc_fcmp_zero(DisasContext *s, int opcode,
+ bool is_scalar, bool is_u, bool is_q,
+ int size, int rn, int rd)
+{
+ bool is_double = (size == MO_64);
+ TCGv_ptr fpst;
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ if (is_double) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ TCGv_i64 tcg_zero = tcg_constant_i64(0);
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+ NeonGenTwoDoubleOpFn *genfn;
+ bool swap = false;
+ int pass;
+
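+ /* FCMLT and FCMLE are implemented by swapping the operands of the
+ * GT/GE comparison against zero.
+ */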
+ switch (opcode) {
+ case 0x2e: /* FCMLT (zero) */
+ swap = true;
+ /* fallthrough */
+ case 0x2c: /* FCMGT (zero) */
+ genfn = gen_helper_neon_cgt_f64;
+ break;
+ case 0x2d: /* FCMEQ (zero) */
+ genfn = gen_helper_neon_ceq_f64;
+ break;
+ case 0x6d: /* FCMLE (zero) */
+ swap = true;
+ /* fall through */
+ case 0x6c: /* FCMGE (zero) */
+ genfn = gen_helper_neon_cge_f64;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ if (swap) {
+ genfn(tcg_res, tcg_zero, tcg_op, fpst);
+ } else {
+ genfn(tcg_res, tcg_op, tcg_zero, fpst);
+ }
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+ }
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op);
+
+ clear_vec_high(s, !is_scalar, rd);
+ } else {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i32 tcg_zero = tcg_constant_i32(0);
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ NeonGenTwoSingleOpFn *genfn;
+ bool swap = false;
+ int pass, maxpasses;
+
+ if (size == MO_16) {
+ switch (opcode) {
+ case 0x2e: /* FCMLT (zero) */
+ swap = true;
+ /* fall through */
+ case 0x2c: /* FCMGT (zero) */
+ genfn = gen_helper_advsimd_cgt_f16;
+ break;
+ case 0x2d: /* FCMEQ (zero) */
+ genfn = gen_helper_advsimd_ceq_f16;
+ break;
+ case 0x6d: /* FCMLE (zero) */
+ swap = true;
+ /* fall through */
+ case 0x6c: /* FCMGE (zero) */
+ genfn = gen_helper_advsimd_cge_f16;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ switch (opcode) {
+ case 0x2e: /* FCMLT (zero) */
+ swap = true;
+ /* fall through */
+ case 0x2c: /* FCMGT (zero) */
+ genfn = gen_helper_neon_cgt_f32;
+ break;
+ case 0x2d: /* FCMEQ (zero) */
+ genfn = gen_helper_neon_ceq_f32;
+ break;
+ case 0x6d: /* FCMLE (zero) */
+ swap = true;
+ /* fall through */
+ case 0x6c: /* FCMGE (zero) */
+ genfn = gen_helper_neon_cge_f32;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ if (is_scalar) {
+ maxpasses = 1;
+ } else {
+ int vector_size = 8 << is_q;
+ maxpasses = vector_size >> size;
+ }
+
+ for (pass = 0; pass < maxpasses; pass++) {
+ read_vec_element_i32(s, tcg_op, rn, pass, size);
+ if (swap) {
+ genfn(tcg_res, tcg_zero, tcg_op, fpst);
+ } else {
+ genfn(tcg_res, tcg_op, tcg_zero, fpst);
+ }
+ if (is_scalar) {
+ write_fp_sreg(s, rd, tcg_res);
+ } else {
+ write_vec_element_i32(s, tcg_res, rd, pass, size);
+ }
+ }
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ if (!is_scalar) {
+ clear_vec_high(s, is_q, rd);
+ }
+ }
+
+ tcg_temp_free_ptr(fpst);
+}
+
+static void handle_2misc_reciprocal(DisasContext *s, int opcode,
+ bool is_scalar, bool is_u, bool is_q,
+ int size, int rn, int rd)
+{
+ bool is_double = (size == 3);
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+
+ if (is_double) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+ int pass;
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ switch (opcode) {
+ case 0x3d: /* FRECPE */
+ gen_helper_recpe_f64(tcg_res, tcg_op, fpst);
+ break;
+ case 0x3f: /* FRECPX */
+ gen_helper_frecpx_f64(tcg_res, tcg_op, fpst);
+ break;
+ case 0x7d: /* FRSQRTE */
+ gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+ }
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op);
+ clear_vec_high(s, !is_scalar, rd);
+ } else {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ int pass, maxpasses;
+
+ if (is_scalar) {
+ maxpasses = 1;
+ } else {
+ maxpasses = is_q ? 4 : 2;
+ }
+
+ for (pass = 0; pass < maxpasses; pass++) {
+ read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
+
+ switch (opcode) {
+ case 0x3c: /* URECPE */
+ gen_helper_recpe_u32(tcg_res, tcg_op);
+ break;
+ case 0x3d: /* FRECPE */
+ gen_helper_recpe_f32(tcg_res, tcg_op, fpst);
+ break;
+ case 0x3f: /* FRECPX */
+ gen_helper_frecpx_f32(tcg_res, tcg_op, fpst);
+ break;
+ case 0x7d: /* FRSQRTE */
+ gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (is_scalar) {
+ write_fp_sreg(s, rd, tcg_res);
+ } else {
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+ }
+ }
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ if (!is_scalar) {
+ clear_vec_high(s, is_q, rd);
+ }
+ }
+ tcg_temp_free_ptr(fpst);
+}
+
+static void handle_2misc_narrow(DisasContext *s, bool scalar,
+ int opcode, bool u, bool is_q,
+ int size, int rn, int rd)
+{
+ /* Handle 2-reg-misc ops which are narrowing (so each 2*size element
+ * in the source becomes a size element in the destination).
+ */
+ int pass;
+ TCGv_i32 tcg_res[2];
+ int destelt = is_q ? 2 : 0;
+ int passes = scalar ? 1 : 2;
+
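+ /* The write-out loop below always stores two 32-bit results, so for
+ * the scalar case the unused high half must be zero.
+ */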
+ if (scalar) {
+ tcg_res[1] = tcg_constant_i32(0);
+ }
+
+ for (pass = 0; pass < passes; pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ NeonGenNarrowFn *genfn = NULL;
+ NeonGenNarrowEnvFn *genenvfn = NULL;
+
+ if (scalar) {
+ read_vec_element(s, tcg_op, rn, pass, size + 1);
+ } else {
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ }
+ tcg_res[pass] = tcg_temp_new_i32();
+
+ switch (opcode) {
+ case 0x12: /* XTN, SQXTUN */
+ {
+ static NeonGenNarrowFn * const xtnfns[3] = {
+ gen_helper_neon_narrow_u8,
+ gen_helper_neon_narrow_u16,
+ tcg_gen_extrl_i64_i32,
+ };
+ static NeonGenNarrowEnvFn * const sqxtunfns[3] = {
+ gen_helper_neon_unarrow_sat8,
+ gen_helper_neon_unarrow_sat16,
+ gen_helper_neon_unarrow_sat32,
+ };
+ if (u) {
+ genenvfn = sqxtunfns[size];
+ } else {
+ genfn = xtnfns[size];
+ }
+ break;
+ }
+ case 0x14: /* SQXTN, UQXTN */
+ {
+ static NeonGenNarrowEnvFn * const fns[3][2] = {
+ { gen_helper_neon_narrow_sat_s8,
+ gen_helper_neon_narrow_sat_u8 },
+ { gen_helper_neon_narrow_sat_s16,
+ gen_helper_neon_narrow_sat_u16 },
+ { gen_helper_neon_narrow_sat_s32,
+ gen_helper_neon_narrow_sat_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0x16: /* FCVTN, FCVTN2 */
+ /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */
+ if (size == 2) {
+ gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env);
+ } else {
+ TCGv_i32 tcg_lo = tcg_temp_new_i32();
+ TCGv_i32 tcg_hi = tcg_temp_new_i32();
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ TCGv_i32 ahp = get_ahp_flag();
+
+ tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op);
+ gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp);
+ gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp);
+ tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
+ tcg_temp_free_i32(tcg_lo);
+ tcg_temp_free_i32(tcg_hi);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(ahp);
+ }
+ break;
+ case 0x36: /* BFCVTN, BFCVTN2 */
+ {
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst);
+ tcg_temp_free_ptr(fpst);
+ }
+ break;
+ case 0x56: /* FCVTXN, FCVTXN2 */
+ /* 64 bit to 32 bit float conversion
+ * with von Neumann rounding (round to odd)
+ */
+ assert(size == 2);
+ gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (genfn) {
+ genfn(tcg_res[pass], tcg_op);
+ } else if (genenvfn) {
+ genenvfn(tcg_res[pass], cpu_env, tcg_op);
+ }
+
+ tcg_temp_free_i64(tcg_op);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32);
+ tcg_temp_free_i32(tcg_res[pass]);
+ }
+ clear_vec_high(s, is_q, rd);
+}
+
+/* Remaining saturating accumulating ops */
+static void handle_2misc_satacc(DisasContext *s, bool is_scalar, bool is_u,
+ bool is_q, int size, int rn, int rd)
+{
+ bool is_double = (size == 3);
+
+ if (is_double) {
+ TCGv_i64 tcg_rn = tcg_temp_new_i64();
+ TCGv_i64 tcg_rd = tcg_temp_new_i64();
+ int pass;
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ read_vec_element(s, tcg_rn, rn, pass, MO_64);
+ read_vec_element(s, tcg_rd, rd, pass, MO_64);
+
+ if (is_u) { /* USQADD */
+ gen_helper_neon_uqadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ } else { /* SUQADD */
+ gen_helper_neon_sqadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ }
+ write_vec_element(s, tcg_rd, rd, pass, MO_64);
+ }
+ tcg_temp_free_i64(tcg_rd);
+ tcg_temp_free_i64(tcg_rn);
+ clear_vec_high(s, !is_scalar, rd);
+ } else {
+ TCGv_i32 tcg_rn = tcg_temp_new_i32();
+ TCGv_i32 tcg_rd = tcg_temp_new_i32();
+ int pass, maxpasses;
+
+ if (is_scalar) {
+ maxpasses = 1;
+ } else {
+ maxpasses = is_q ? 4 : 2;
+ }
+
+ for (pass = 0; pass < maxpasses; pass++) {
+ if (is_scalar) {
+ read_vec_element_i32(s, tcg_rn, rn, pass, size);
+ read_vec_element_i32(s, tcg_rd, rd, pass, size);
+ } else {
+ read_vec_element_i32(s, tcg_rn, rn, pass, MO_32);
+ read_vec_element_i32(s, tcg_rd, rd, pass, MO_32);
+ }
+
+ if (is_u) { /* USQADD */
+ switch (size) {
+ case 0:
+ gen_helper_neon_uqadd_s8(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ case 1:
+ gen_helper_neon_uqadd_s16(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ case 2:
+ gen_helper_neon_uqadd_s32(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else { /* SUQADD */
+ switch (size) {
+ case 0:
+ gen_helper_neon_sqadd_u8(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ case 1:
+ gen_helper_neon_sqadd_u16(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ case 2:
+ gen_helper_neon_sqadd_u32(tcg_rd, cpu_env, tcg_rn, tcg_rd);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
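+ /* For the scalar form, zero the whole low 64 bits first so that
+ * everything above the written result reads as zero.
+ */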
+ if (is_scalar) {
+ write_vec_element(s, tcg_constant_i64(0), rd, 0, MO_64);
+ }
+ write_vec_element_i32(s, tcg_rd, rd, pass, MO_32);
+ }
+ tcg_temp_free_i32(tcg_rd);
+ tcg_temp_free_i32(tcg_rn);
+ clear_vec_high(s, is_q, rd);
+ }
+}
+
+/* AdvSIMD scalar two reg misc
+ * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd |
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 12, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ bool is_fcvt = false;
+ int rmode;
+ TCGv_i32 tcg_rmode;
+ TCGv_ptr tcg_fpstatus;
+
+ switch (opcode) {
+ case 0x3: /* USQADD / SUQADD*/
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_satacc(s, true, u, false, size, rn, rd);
+ return;
+ case 0x7: /* SQABS / SQNEG */
+ break;
+ case 0xa: /* CMLT */
+ if (u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x8: /* CMGT, CMGE */
+ case 0x9: /* CMEQ, CMLE */
+ case 0xb: /* ABS, NEG */
+ if (size != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x12: /* SQXTUN */
+ if (!u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x14: /* SQXTN, UQXTN */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd);
+ return;
+ case 0xc ... 0xf:
+ case 0x16 ... 0x1d:
+ case 0x1f:
+ /* Floating point: U, size[1] and opcode indicate operation;
+ * size[0] indicates single or double precision.
+ */
+ opcode |= (extract32(size, 1, 1) << 5) | (u << 6);
+ size = extract32(size, 0, 1) ? 3 : 2;
+ switch (opcode) {
+ case 0x2c: /* FCMGT (zero) */
+ case 0x2d: /* FCMEQ (zero) */
+ case 0x2e: /* FCMLT (zero) */
+ case 0x6c: /* FCMGE (zero) */
+ case 0x6d: /* FCMLE (zero) */
+ handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd);
+ return;
+ case 0x1d: /* SCVTF */
+ case 0x5d: /* UCVTF */
+ {
+ bool is_signed = (opcode == 0x1d);
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size);
+ return;
+ }
+ case 0x3d: /* FRECPE */
+ case 0x3f: /* FRECPX */
+ case 0x7d: /* FRSQRTE */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd);
+ return;
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ is_fcvt = true;
+ rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
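+ /* rmode is 0/1/2/3 for the N/P/M/Z flavours, matching
+ * FPROUNDING_TIEEVEN/POSINF/NEGINF/ZERO.
+ */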
+ break;
+ case 0x1c: /* FCVTAS */
+ case 0x5c: /* FCVTAU */
+ /* TIEAWAY doesn't fit in the usual rounding mode encoding */
+ is_fcvt = true;
+ rmode = FPROUNDING_TIEAWAY;
+ break;
+ case 0x56: /* FCVTXN, FCVTXN2 */
+ if (size == 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd);
+ return;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (is_fcvt) {
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+ tcg_fpstatus = fpstatus_ptr(FPST_FPCR);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ } else {
+ tcg_rmode = NULL;
+ tcg_fpstatus = NULL;
+ }
+
+ if (size == 3) {
+ TCGv_i64 tcg_rn = read_fp_dreg(s, rn);
+ TCGv_i64 tcg_rd = tcg_temp_new_i64();
+
+ handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus);
+ write_fp_dreg(s, rd, tcg_rd);
+ tcg_temp_free_i64(tcg_rd);
+ tcg_temp_free_i64(tcg_rn);
+ } else {
+ TCGv_i32 tcg_rn = tcg_temp_new_i32();
+ TCGv_i32 tcg_rd = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_rn, rn, 0, size);
+
+ switch (opcode) {
+ case 0x7: /* SQABS, SQNEG */
+ {
+ NeonGenOneOpEnvFn *genfn;
+ static NeonGenOneOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 },
+ { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 },
+ { gen_helper_neon_qabs_s32, gen_helper_neon_qneg_s32 },
+ };
+ genfn = fns[size][u];
+ genfn(tcg_rd, cpu_env, tcg_rn);
+ break;
+ }
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x1c: /* FCVTAS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_constant_i32(0),
+ tcg_fpstatus);
+ break;
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x5c: /* FCVTAU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_constant_i32(0),
+ tcg_fpstatus);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_fp_sreg(s, rd, tcg_rd);
+ tcg_temp_free_i32(tcg_rd);
+ tcg_temp_free_i32(tcg_rn);
+ }
+
+ if (is_fcvt) {
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ tcg_temp_free_i32(tcg_rmode);
+ tcg_temp_free_ptr(tcg_fpstatus);
+ }
+}
+
+/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
+static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
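+ /* e.g. for 8-bit elements immh is 0001, immh:immb runs 8..15 and shift runs 8..1 */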
+ GVecGen2iFn *gvec_fn;
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ tcg_debug_assert(size <= 3);
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra;
+ break;
+
+ case 0x08: /* SRI */
+ gvec_fn = gen_gvec_sri;
+ break;
+
+ case 0x00: /* SSHR / USHR */
+ if (is_u) {
+ if (shift == 8 << size) {
+ /* Shift count the same size as element size produces zero. */
+ tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd),
+ is_q ? 16 : 8, vec_full_reg_size(s), 0);
+ return;
+ }
+ gvec_fn = tcg_gen_gvec_shri;
+ } else {
+ /* Shift count the same size as element size produces all sign. */
+ if (shift == 8 << size) {
+ shift -= 1;
+ }
+ gvec_fn = tcg_gen_gvec_sari;
+ }
+ break;
+
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr;
+ break;
+
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra;
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size);
+}
+
+/* SHL/SLI - Vector shift left */
+static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
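+ /* The left shift amount is immh:immb minus the element size, i.e. 0..esize-1. */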
+
+ /* Range of size is limited by decode: immh is a non-zero 4 bit field */
+ assert(size >= 0 && size <= 3);
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (insert) {
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size);
+ } else {
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size);
+ }
+}
+
+/* USHLL/SHLL - Vector shift left with widening */
+static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ int dsize = 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ int i;
+
+ if (size >= 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ /* For the LL variants the store is larger than the load,
+ * so if rd == rn we would overwrite parts of our input.
+ * So load everything right now and use shifts in the main loop.
+ */
+ read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
+
+ for (i = 0; i < elements; i++) {
+ tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
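+ /* size | (!is_u << 2) picks the appropriately signed extension of the
+ * low 8, 16 or 32 bits before the widened value is shifted left.
+ */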
+ ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
+ tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
+ write_vec_element(s, tcg_rd, rd, i, size + 1);
+ }
+}
+
+/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */
+static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int immhb = immh << 3 | immb;
+ int size = 32 - clz32(immh) - 1;
+ int dsize = 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ int shift = (2 * esize) - immhb;
+ bool round = extract32(opcode, 0, 1);
+ TCGv_i64 tcg_rn, tcg_rd, tcg_final;
+ TCGv_i64 tcg_round;
+ int i;
+
+ if (extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ tcg_rn = tcg_temp_new_i64();
+ tcg_rd = tcg_temp_new_i64();
+ tcg_final = tcg_temp_new_i64();
+ read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64);
+
+ if (round) {
+ tcg_round = tcg_constant_i64(1ULL << (shift - 1));
+ } else {
+ tcg_round = NULL;
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, size+1);
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ false, true, size+1, shift);
+
+ tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
+ }
+
+ if (!is_q) {
+ write_vec_element(s, tcg_final, rd, 0, MO_64);
+ } else {
+ write_vec_element(s, tcg_final, rd, 1, MO_64);
+ }
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+ tcg_temp_free_i64(tcg_final);
+
+ clear_vec_high(s, is_q, rd);
+}
+
+/* AdvSIMD shift by immediate
+ * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
+ * +---+---+---+-------------+------+------+--------+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd |
+ * +---+---+---+-------------+------+------+--------+---+------+------+
+ */
+static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int immb = extract32(insn, 16, 3);
+ int immh = extract32(insn, 19, 4);
+ bool is_u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+
+ /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. */
+ assert(immh != 0);
+
+ switch (opcode) {
+ case 0x08: /* SRI */
+ if (!is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA (accumulate) */
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x0a: /* SHL / SLI */
+ handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x10: /* SHRN */
+ case 0x11: /* RSHRN / SQRSHRUN */
+ if (is_u) {
+ handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb,
+ opcode, rn, rd);
+ } else {
+ handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd);
+ }
+ break;
+ case 0x12: /* SQSHRN / UQSHRN */
+ case 0x13: /* SQRSHRN / UQRSHRN */
+ handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb,
+ opcode, rn, rd);
+ break;
+ case 0x14: /* SSHLL / USHLL */
+ handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x1c: /* SCVTF / UCVTF */
+ handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
+ opcode, rn, rd);
+ break;
+ case 0xc: /* SQSHLU */
+ if (!is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd);
+ break;
+ case 0xe: /* SQSHL, UQSHL */
+ handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd);
+ break;
+ case 0x1f: /* FCVTZS/ FCVTZU */
+ handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd);
+ return;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+}
+
+/* Generate code to do a "long" addition or subtraction, ie one done in
+ * TCGv_i64 on vector lanes twice the width specified by size.
+ */
+static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res,
+ TCGv_i64 tcg_op1, TCGv_i64 tcg_op2)
+{
+ static NeonGenTwo64OpFn * const fns[3][2] = {
+ { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 },
+ { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 },
+ { tcg_gen_add_i64, tcg_gen_sub_i64 },
+ };
+ NeonGenTwo64OpFn *genfn;
+ assert(size < 3);
+
+ genfn = fns[size][is_sub];
+ genfn(tcg_res, tcg_op1, tcg_op2);
+}
+
+static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
+ int opcode, int rd, int rn, int rm)
+{
+ /* 3-reg-different widening insns: 64 x 64 -> 128 */
+ TCGv_i64 tcg_res[2];
+ int pass, accop;
+
+ tcg_res[0] = tcg_temp_new_i64();
+ tcg_res[1] = tcg_temp_new_i64();
+
+ /* Does this op do an adding accumulate, a subtracting accumulate,
+ * or no accumulate at all?
+ */
+ switch (opcode) {
+ case 5:
+ case 8:
+ case 9:
+ accop = 1;
+ break;
+ case 10:
+ case 11:
+ accop = -1;
+ break;
+ default:
+ accop = 0;
+ break;
+ }
+
+ if (accop != 0) {
+ read_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ read_vec_element(s, tcg_res[1], rd, 1, MO_64);
+ }
+
+ /* size == 2 means two 32x32->64 operations; this is worth special
+ * casing because we can generally handle it inline.
+ */
+ if (size == 2) {
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_passres;
+ MemOp memop = MO_32 | (is_u ? 0 : MO_SIGN);
+
+ int elt = pass + is_q * 2;
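+ /* The '2' (second-half) variants take their inputs from the upper elements. */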
+
+ read_vec_element(s, tcg_op1, rn, elt, memop);
+ read_vec_element(s, tcg_op2, rm, elt, memop);
+
+ if (accop == 0) {
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ switch (opcode) {
+ case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+ tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2);
+ break;
+ case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+ tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2);
+ break;
+ case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+ case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+ {
+ TCGv_i64 tcg_tmp1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_tmp2 = tcg_temp_new_i64();
+
+ tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2);
+ tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1);
+ tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
+ tcg_passres,
+ tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2);
+ tcg_temp_free_i64(tcg_tmp1);
+ tcg_temp_free_i64(tcg_tmp2);
+ break;
+ }
+ case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+ tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+ break;
+ case 9: /* SQDMLAL, SQDMLAL2 */
+ case 11: /* SQDMLSL, SQDMLSL2 */
+ case 13: /* SQDMULL, SQDMULL2 */
+ tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+ gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env,
+ tcg_passres, tcg_passres);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (opcode == 9 || opcode == 11) {
+ /* saturating accumulate ops */
+ if (accop < 0) {
+ tcg_gen_neg_i64(tcg_passres, tcg_passres);
+ }
+ gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env,
+ tcg_res[pass], tcg_passres);
+ } else if (accop > 0) {
+ tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ } else if (accop < 0) {
+ tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ }
+
+ if (accop != 0) {
+ tcg_temp_free_i64(tcg_passres);
+ }
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+ } else {
+ /* size 0 or 1, generally helper functions */
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i64 tcg_passres;
+ int elt = pass + is_q * 2;
+
+ read_vec_element_i32(s, tcg_op1, rn, elt, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, elt, MO_32);
+
+ if (accop == 0) {
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ switch (opcode) {
+ case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+ case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+ {
+ TCGv_i64 tcg_op2_64 = tcg_temp_new_i64();
+ static NeonGenWidenFn * const widenfns[2][2] = {
+ { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
+ { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
+ };
+ NeonGenWidenFn *widenfn = widenfns[size][is_u];
+
+ widenfn(tcg_op2_64, tcg_op2);
+ widenfn(tcg_passres, tcg_op1);
+ gen_neon_addl(size, (opcode == 2), tcg_passres,
+ tcg_passres, tcg_op2_64);
+ tcg_temp_free_i64(tcg_op2_64);
+ break;
+ }
+ case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+ case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+ if (size == 0) {
+ if (is_u) {
+ gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2);
+ }
+ } else {
+ if (is_u) {
+ gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2);
+ }
+ }
+ break;
+ case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+ if (size == 0) {
+ if (is_u) {
+ gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2);
+ }
+ } else {
+ if (is_u) {
+ gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
+ }
+ }
+ break;
+ case 9: /* SQDMLAL, SQDMLAL2 */
+ case 11: /* SQDMLSL, SQDMLSL2 */
+ case 13: /* SQDMULL, SQDMULL2 */
+ assert(size == 1);
+ gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
+ gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
+ tcg_passres, tcg_passres);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+
+ if (accop != 0) {
+ if (opcode == 9 || opcode == 11) {
+ /* saturating accumulate ops */
+ if (accop < 0) {
+ gen_helper_neon_negl_u32(tcg_passres, tcg_passres);
+ }
+ gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env,
+ tcg_res[pass],
+ tcg_passres);
+ } else {
+ gen_neon_addl(size, (accop < 0), tcg_res[pass],
+ tcg_res[pass], tcg_passres);
+ }
+ tcg_temp_free_i64(tcg_passres);
+ }
+ }
+ }
+
+ write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_res[0]);
+ tcg_temp_free_i64(tcg_res[1]);
+}
+
+static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size,
+ int opcode, int rd, int rn, int rm)
+{
+ TCGv_i64 tcg_res[2];
+ int part = is_q ? 2 : 0;
+ int pass;
+
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i64 tcg_op2_wide = tcg_temp_new_i64();
+ static NeonGenWidenFn * const widenfns[3][2] = {
+ { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
+ { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
+ { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 },
+ };
+ NeonGenWidenFn *widenfn = widenfns[size][is_u];
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32);
+ widenfn(tcg_op2_wide, tcg_op2);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_res[pass] = tcg_temp_new_i64();
+ gen_neon_addl(size, (opcode == 3),
+ tcg_res[pass], tcg_op1, tcg_op2_wide);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2_wide);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+}
+
+static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in)
+{
+ tcg_gen_addi_i64(in, in, 1U << 31);
+ tcg_gen_extrh_i64_i32(res, in);
+}
+
+static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,
+ int opcode, int rd, int rn, int rm)
+{
+ TCGv_i32 tcg_res[2];
+ int part = is_q ? 2 : 0;
+ int pass;
+
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_wideres = tcg_temp_new_i64();
+ static NeonGenNarrowFn * const narrowfns[3][2] = {
+ { gen_helper_neon_narrow_high_u8,
+ gen_helper_neon_narrow_round_high_u8 },
+ { gen_helper_neon_narrow_high_u16,
+ gen_helper_neon_narrow_round_high_u16 },
+ { tcg_gen_extrh_i64_i32, do_narrow_round_high_u32 },
+ };
+ NeonGenNarrowFn *gennarrow = narrowfns[size][is_u];
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2);
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+
+ tcg_res[pass] = tcg_temp_new_i32();
+ gennarrow(tcg_res[pass], tcg_wideres);
+ tcg_temp_free_i64(tcg_wideres);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32);
+ tcg_temp_free_i32(tcg_res[pass]);
+ }
+ clear_vec_high(s, is_q, rd);
+}
+
+/* AdvSIMD three different
+ * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ */
+static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
+{
+ /* Instructions in this group fall into three basic classes
+ * (in each case with the operation working on each element in
+ * the input vectors):
+ * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra
+ * 128 bit input)
+ * (2) wide 64 x 128 -> 128
+ * (3) narrowing 128 x 128 -> 64
+ * Here we do initial decode, catch unallocated cases and
+ * dispatch to separate functions for each class.
+ */
+ int is_q = extract32(insn, 30, 1);
+ int is_u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 4);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ switch (opcode) {
+ case 1: /* SADDW, SADDW2, UADDW, UADDW2 */
+ case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */
+ /* 64 x 128 -> 128 */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm);
+ break;
+ case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */
+ case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */
+ /* 128 x 128 -> 64 */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
+ break;
+ case 14: /* PMULL, PMULL2 */
+ if (is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ switch (size) {
+ case 0: /* PMULL.P8 */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ /* The Q field specifies lo/hi half input for this insn. */
+ gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
+ gen_helper_neon_pmull_h);
+ break;
+
+ case 3: /* PMULL.P64 */
+ if (!dc_isar_feature(aa64_pmull, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ /* The Q field specifies lo/hi half input for this insn. */
+ gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
+ gen_helper_gvec_pmull_q);
+ break;
+
+ default:
+ unallocated_encoding(s);
+ break;
+ }
+ return;
+ case 9: /* SQDMLAL, SQDMLAL2 */
+ case 11: /* SQDMLSL, SQDMLSL2 */
+ case 13: /* SQDMULL, SQDMULL2 */
+ if (is_u || size == 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+ case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+ case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+ case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+ case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ case 12: /* SMULL, SMULL2, UMULL, UMULL2 */
+ /* 64 x 64 -> 128 */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm);
+ break;
+ default:
+ /* opcode 15 not allocated */
+ unallocated_encoding(s);
+ break;
+ }
+}
+
+/* Logic op (opcode == 3) subgroup of C3.6.16. */
+static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool is_u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (size + 4 * is_u) {
+ case 0: /* AND */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0);
+ return;
+ case 1: /* BIC */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0);
+ return;
+ case 2: /* ORR */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0);
+ return;
+ case 3: /* ORN */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0);
+ return;
+ case 4: /* EOR */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0);
+ return;
+
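+ /* BSL, BIT and BIF are all bit-selects; they differ only in which
+ * operand provides the mask and which provide the data.
+ */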
+ case 5: /* BSL bitwise select */
+ gen_gvec_fn4(s, is_q, rd, rd, rn, rm, tcg_gen_gvec_bitsel, 0);
+ return;
+ case 6: /* BIT, bitwise insert if true */
+ gen_gvec_fn4(s, is_q, rd, rm, rn, rd, tcg_gen_gvec_bitsel, 0);
+ return;
+ case 7: /* BIF, bitwise insert if false */
+ gen_gvec_fn4(s, is_q, rd, rm, rd, rn, tcg_gen_gvec_bitsel, 0);
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Pairwise op subgroup of C3.6.16.
+ *
+ * This is called directly or via the handle_3same_float for float pairwise
+ * operations where the opcode and size are calculated differently.
+ */
+static void handle_simd_3same_pair(DisasContext *s, int is_q, int u, int opcode,
+ int size, int rn, int rm, int rd)
+{
+ TCGv_ptr fpst;
+ int pass;
+
+ /* Floating point operations need fpst */
+ if (opcode >= 0x58) {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ } else {
+ fpst = NULL;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ /* These operations work on the concatenated rm:rn, with each pair of
+ * adjacent elements being operated on to produce an element in the result.
+ */
+ if (size == 3) {
+ TCGv_i64 tcg_res[2];
+
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ int passreg = (pass == 0) ? rn : rm;
+
+ read_vec_element(s, tcg_op1, passreg, 0, MO_64);
+ read_vec_element(s, tcg_op2, passreg, 1, MO_64);
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ switch (opcode) {
+ case 0x17: /* ADDP */
+ tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ case 0x58: /* FMAXNMP */
+ gen_helper_vfp_maxnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5a: /* FADDP */
+ gen_helper_vfp_addd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5e: /* FMAXP */
+ gen_helper_vfp_maxd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x78: /* FMINNMP */
+ gen_helper_vfp_minnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7e: /* FMINP */
+ gen_helper_vfp_mind(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+ } else {
+ int maxpass = is_q ? 4 : 2;
+ TCGv_i32 tcg_res[4];
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ NeonGenTwoOpFn *genfn = NULL;
+ int passreg = pass < (maxpass / 2) ? rn : rm;
+ int passelt = (is_q && (pass & 1)) ? 2 : 0;
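+            /* The first half of the passes reads from rn, the second
+             * from rm; passelt selects the low or high pair of 32-bit
+             * elements within that source register.
+             */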
+
+ read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_32);
+ read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_32);
+ tcg_res[pass] = tcg_temp_new_i32();
+
+ switch (opcode) {
+ case 0x17: /* ADDP */
+ {
+ static NeonGenTwoOpFn * const fns[3] = {
+ gen_helper_neon_padd_u8,
+ gen_helper_neon_padd_u16,
+ tcg_gen_add_i32,
+ };
+ genfn = fns[size];
+ break;
+ }
+ case 0x14: /* SMAXP, UMAXP */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_pmax_s8, gen_helper_neon_pmax_u8 },
+ { gen_helper_neon_pmax_s16, gen_helper_neon_pmax_u16 },
+ { tcg_gen_smax_i32, tcg_gen_umax_i32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x15: /* SMINP, UMINP */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_pmin_s8, gen_helper_neon_pmin_u8 },
+ { gen_helper_neon_pmin_s16, gen_helper_neon_pmin_u16 },
+ { tcg_gen_smin_i32, tcg_gen_umin_i32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ /* The FP operations are all on single floats (32 bit) */
+ case 0x58: /* FMAXNMP */
+ gen_helper_vfp_maxnums(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5a: /* FADDP */
+ gen_helper_vfp_adds(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5e: /* FMAXP */
+ gen_helper_vfp_maxs(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x78: /* FMINNMP */
+ gen_helper_vfp_minnums(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7e: /* FMINP */
+ gen_helper_vfp_mins(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+            /* The FP ops above were emitted inline; the integer ops set
+             * genfn, so call it now.
+             */
+ if (genfn) {
+ genfn(tcg_res[pass], tcg_op1, tcg_op2);
+ }
+
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+
+ for (pass = 0; pass < maxpass; pass++) {
+ write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32);
+ tcg_temp_free_i32(tcg_res[pass]);
+ }
+ clear_vec_high(s, is_q, rd);
+ }
+
+ if (fpst) {
+ tcg_temp_free_ptr(fpst);
+ }
+}
+
+/* Floating point op subgroup of C3.6.16. */
+static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
+{
+ /* For floating point ops, the U, size[1] and opcode bits
+ * together indicate the operation. size[0] indicates single
+ * or double.
+ */
+ int fpopcode = extract32(insn, 11, 5)
+ | (extract32(insn, 23, 1) << 5)
+ | (extract32(insn, 29, 1) << 6);
+ int is_q = extract32(insn, 30, 1);
+ int size = extract32(insn, 22, 1);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ int datasize = is_q ? 128 : 64;
+ int esize = 32 << size;
+ int elements = datasize / esize;
+
+ if (size == 1 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
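+    /* fpopcode has opcode in bits [4:0], size[1] in bit 5 and U in
+     * bit 6, which is the numbering used by the case labels below.
+     */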
+ switch (fpopcode) {
+ case 0x58: /* FMAXNMP */
+ case 0x5a: /* FADDP */
+ case 0x5e: /* FMAXP */
+ case 0x78: /* FMINNMP */
+ case 0x7e: /* FMINP */
+ if (size && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_simd_3same_pair(s, is_q, 0, fpopcode, size ? MO_64 : MO_32,
+ rn, rm, rd);
+ return;
+ case 0x1b: /* FMULX */
+ case 0x1f: /* FRECPS */
+ case 0x3f: /* FRSQRTS */
+ case 0x5d: /* FACGE */
+ case 0x7d: /* FACGT */
+ case 0x19: /* FMLA */
+ case 0x39: /* FMLS */
+ case 0x18: /* FMAXNM */
+ case 0x1a: /* FADD */
+ case 0x1c: /* FCMEQ */
+ case 0x1e: /* FMAX */
+ case 0x38: /* FMINNM */
+ case 0x3a: /* FSUB */
+ case 0x3e: /* FMIN */
+ case 0x5b: /* FMUL */
+ case 0x5c: /* FCMGE */
+ case 0x5f: /* FDIV */
+ case 0x7a: /* FABD */
+ case 0x7c: /* FCMGT */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
+ return;
+
+ case 0x1d: /* FMLAL */
+ case 0x3d: /* FMLSL */
+ case 0x59: /* FMLAL2 */
+ case 0x79: /* FMLSL2 */
+ if (size & 1 || !dc_isar_feature(aa64_fhm, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (fp_access_check(s)) {
+ int is_s = extract32(insn, 23, 1);
+ int is_2 = extract32(insn, 29, 1);
+ int data = (is_2 << 1) | is_s;
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), cpu_env,
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ data, gen_helper_gvec_fmlal_a64);
+ }
+ return;
+
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+}
+
+/* Integer op subgroup of C3.6.16. */
+static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
+{
+ int is_q = extract32(insn, 30, 1);
+ int u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 11, 5);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ int pass;
+ TCGCond cond;
+
+ switch (opcode) {
+ case 0x13: /* MUL, PMUL */
+ if (u && size != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x0: /* SHADD, UHADD */
+ case 0x2: /* SRHADD, URHADD */
+ case 0x4: /* SHSUB, UHSUB */
+ case 0xc: /* SMAX, UMAX */
+ case 0xd: /* SMIN, UMIN */
+ case 0xe: /* SABD, UABD */
+ case 0xf: /* SABA, UABA */
+ case 0x12: /* MLA, MLS */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x16: /* SQDMULH, SQRDMULH */
+ if (size == 0 || size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
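+    /* Ops with a gvec expansion are emitted from this switch and return
+     * directly; anything left falls through to the per-element loops.
+     */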
+ switch (opcode) {
+ case 0x01: /* SQADD, UQADD */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqadd_qc, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqadd_qc, size);
+ }
+ return;
+ case 0x05: /* SQSUB, UQSUB */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqsub_qc, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqsub_qc, size);
+ }
+ return;
+ case 0x08: /* SSHL, USHL */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_ushl, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sshl, size);
+ }
+ return;
+ case 0x0c: /* SMAX, UMAX */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umax, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smax, size);
+ }
+ return;
+ case 0x0d: /* SMIN, UMIN */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umin, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smin, size);
+ }
+ return;
+ case 0xe: /* SABD, UABD */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uabd, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sabd, size);
+ }
+ return;
+ case 0xf: /* SABA, UABA */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uaba, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_saba, size);
+ }
+ return;
+ case 0x10: /* ADD, SUB */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size);
+ }
+ return;
+ case 0x13: /* MUL, PMUL */
+ if (!u) { /* MUL */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size);
+ } else { /* PMUL */
+ gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0, gen_helper_gvec_pmul_b);
+ }
+ return;
+ case 0x12: /* MLA, MLS */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mls, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mla, size);
+ }
+ return;
+ case 0x16: /* SQDMULH, SQRDMULH */
+ {
+ static gen_helper_gvec_3_ptr * const fns[2][2] = {
+ { gen_helper_neon_sqdmulh_h, gen_helper_neon_sqrdmulh_h },
+ { gen_helper_neon_sqdmulh_s, gen_helper_neon_sqrdmulh_s },
+ };
+ gen_gvec_op3_qc(s, is_q, rd, rn, rm, fns[size - 1][u]);
+ }
+ return;
+ case 0x11:
+ if (!u) { /* CMTST */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_cmtst, size);
+ return;
+ }
+ /* else CMEQ */
+ cond = TCG_COND_EQ;
+ goto do_gvec_cmp;
+ case 0x06: /* CMGT, CMHI */
+ cond = u ? TCG_COND_GTU : TCG_COND_GT;
+ goto do_gvec_cmp;
+ case 0x07: /* CMGE, CMHS */
+ cond = u ? TCG_COND_GEU : TCG_COND_GE;
+ do_gvec_cmp:
+ tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+ return;
+ }
+
+ if (size == 3) {
+ assert(is_q);
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2);
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+ } else {
+ for (pass = 0; pass < (is_q ? 4 : 2); pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ NeonGenTwoOpFn *genfn = NULL;
+ NeonGenTwoOpEnvFn *genenvfn = NULL;
+
+ read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
+
+ switch (opcode) {
+ case 0x0: /* SHADD, UHADD */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_hadd_s8, gen_helper_neon_hadd_u8 },
+ { gen_helper_neon_hadd_s16, gen_helper_neon_hadd_u16 },
+ { gen_helper_neon_hadd_s32, gen_helper_neon_hadd_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x2: /* SRHADD, URHADD */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_rhadd_s8, gen_helper_neon_rhadd_u8 },
+ { gen_helper_neon_rhadd_s16, gen_helper_neon_rhadd_u16 },
+ { gen_helper_neon_rhadd_s32, gen_helper_neon_rhadd_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x4: /* SHSUB, UHSUB */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_hsub_s8, gen_helper_neon_hsub_u8 },
+ { gen_helper_neon_hsub_s16, gen_helper_neon_hsub_u16 },
+ { gen_helper_neon_hsub_s32, gen_helper_neon_hsub_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x9: /* SQSHL, UQSHL */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 },
+ { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 },
+ { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ case 0xa: /* SRSHL, URSHL */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_rshl_s8, gen_helper_neon_rshl_u8 },
+ { gen_helper_neon_rshl_s16, gen_helper_neon_rshl_u16 },
+ { gen_helper_neon_rshl_s32, gen_helper_neon_rshl_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0xb: /* SQRSHL, UQRSHL */
+ {
+ static NeonGenTwoOpEnvFn * const fns[3][2] = {
+ { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 },
+ { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 },
+ { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 },
+ };
+ genenvfn = fns[size][u];
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ if (genenvfn) {
+ genenvfn(tcg_res, cpu_env, tcg_op1, tcg_op2);
+ } else {
+ genfn(tcg_res, tcg_op1, tcg_op2);
+ }
+
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+ }
+ clear_vec_high(s, is_q, rd);
+}
+
+/* AdvSIMD three same
+ * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+--------+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 1 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+--------+---+------+------+
+ */
+static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
+{
+ int opcode = extract32(insn, 11, 5);
+
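+    /* Only the opcode is needed to route to a subgroup; the subgroups
+     * re-extract the remaining fields themselves (the pairwise case is
+     * decoded inline below).
+     */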
+ switch (opcode) {
+ case 0x3: /* logic ops */
+ disas_simd_3same_logic(s, insn);
+ break;
+ case 0x17: /* ADDP */
+ case 0x14: /* SMAXP, UMAXP */
+ case 0x15: /* SMINP, UMINP */
+ {
+ /* Pairwise operations */
+ int is_q = extract32(insn, 30, 1);
+ int u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ if (opcode == 0x17) {
+ if (u || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ } else {
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+ handle_simd_3same_pair(s, is_q, u, opcode, size, rn, rm, rd);
+ break;
+ }
+ case 0x18 ... 0x31:
+ /* floating point ops, sz[1] and U are part of opcode */
+ disas_simd_3same_float(s, insn);
+ break;
+ default:
+ disas_simd_3same_int(s, insn);
+ break;
+ }
+}
+
+/*
+ * Advanced SIMD three same (ARMv8.2 FP16 variants)
+ *
+ * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0
+ * +---+---+---+-----------+---------+------+-----+--------+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd |
+ * +---+---+---+-----------+---------+------+-----+--------+---+------+------+
+ *
+ * This includes FMULX, FCMEQ (register), FRECPS, FRSQRTS, FCMGE
+ * (register), FACGE, FABD, FCMGT (register) and FACGT.
+ *
+ */
+static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
+{
+ int opcode = extract32(insn, 11, 3);
+ int u = extract32(insn, 29, 1);
+ int a = extract32(insn, 23, 1);
+ int is_q = extract32(insn, 30, 1);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ /*
+ * For these floating point ops, the U, a and opcode bits
+ * together indicate the operation.
+ */
+ int fpopcode = opcode | (a << 3) | (u << 4);
+ int datasize = is_q ? 128 : 64;
+ int elements = datasize / 16;
+ bool pairwise;
+ TCGv_ptr fpst;
+ int pass;
+
+ switch (fpopcode) {
+ case 0x0: /* FMAXNM */
+ case 0x1: /* FMLA */
+ case 0x2: /* FADD */
+ case 0x3: /* FMULX */
+ case 0x4: /* FCMEQ */
+ case 0x6: /* FMAX */
+ case 0x7: /* FRECPS */
+ case 0x8: /* FMINNM */
+ case 0x9: /* FMLS */
+ case 0xa: /* FSUB */
+ case 0xe: /* FMIN */
+ case 0xf: /* FRSQRTS */
+ case 0x13: /* FMUL */
+ case 0x14: /* FCMGE */
+ case 0x15: /* FACGE */
+ case 0x17: /* FDIV */
+ case 0x1a: /* FABD */
+ case 0x1c: /* FCMGT */
+ case 0x1d: /* FACGT */
+ pairwise = false;
+ break;
+ case 0x10: /* FMAXNMP */
+ case 0x12: /* FADDP */
+ case 0x16: /* FMAXP */
+ case 0x18: /* FMINNMP */
+ case 0x1e: /* FMINP */
+ pairwise = true;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ if (pairwise) {
+ int maxpass = is_q ? 8 : 4;
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res[8];
+
+ for (pass = 0; pass < maxpass; pass++) {
+ int passreg = pass < (maxpass / 2) ? rn : rm;
+ int passelt = (pass << 1) & (maxpass - 1);
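+            /* As for the single/double pairwise ops: the first half of
+             * the passes reads rn, the second half rm, with passelt
+             * selecting adjacent pairs of 16-bit elements.
+             */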
+
+ read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_16);
+ read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_16);
+ tcg_res[pass] = tcg_temp_new_i32();
+
+ switch (fpopcode) {
+ case 0x10: /* FMAXNMP */
+ gen_helper_advsimd_maxnumh(tcg_res[pass], tcg_op1, tcg_op2,
+ fpst);
+ break;
+ case 0x12: /* FADDP */
+ gen_helper_advsimd_addh(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x16: /* FMAXP */
+ gen_helper_advsimd_maxh(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x18: /* FMINNMP */
+ gen_helper_advsimd_minnumh(tcg_res[pass], tcg_op1, tcg_op2,
+ fpst);
+ break;
+ case 0x1e: /* FMINP */
+ gen_helper_advsimd_minh(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ for (pass = 0; pass < maxpass; pass++) {
+ write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_16);
+ tcg_temp_free_i32(tcg_res[pass]);
+ }
+
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+
+ } else {
+ for (pass = 0; pass < elements; pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op1, rn, pass, MO_16);
+ read_vec_element_i32(s, tcg_op2, rm, pass, MO_16);
+
+ switch (fpopcode) {
+ case 0x0: /* FMAXNM */
+ gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1: /* FMLA */
+ read_vec_element_i32(s, tcg_res, rd, pass, MO_16);
+ gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res,
+ fpst);
+ break;
+ case 0x2: /* FADD */
+ gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3: /* FMULX */
+ gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x4: /* FCMEQ */
+ gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x6: /* FMAX */
+ gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7: /* FRECPS */
+ gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x8: /* FMINNM */
+ gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x9: /* FMLS */
+ /* As usual for ARM, separate negation for fused multiply-add */
+ tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000);
+ read_vec_element_i32(s, tcg_res, rd, pass, MO_16);
+ gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res,
+ fpst);
+ break;
+ case 0xa: /* FSUB */
+ gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xe: /* FMIN */
+ gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0xf: /* FRSQRTS */
+ gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x13: /* FMUL */
+ gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x14: /* FCMGE */
+ gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x15: /* FACGE */
+ gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x17: /* FDIV */
+ gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1a: /* FABD */
+ gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);
+ tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff);
+ break;
+ case 0x1c: /* FCMGT */
+ gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1d: /* FACGT */
+ gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_16);
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+ }
+
+ tcg_temp_free_ptr(fpst);
+
+ clear_vec_high(s, is_q, rd);
+}
+
+/* AdvSIMD three same extra
+ * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+---+--------+---+----+----+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+---+--------+---+----+----+
+ */
+static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 4);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+ bool feature;
+ int rot;
+
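+    /* The first switch validates the size and records the required ISA
+     * feature; the feature and FP access checks are then made once
+     * before the second switch emits the operation.
+     */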
+ switch (u * 16 + opcode) {
+ case 0x10: /* SQRDMLAH (vector) */
+ case 0x11: /* SQRDMLSH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_rdm, s);
+ break;
+ case 0x02: /* SDOT (vector) */
+ case 0x12: /* UDOT (vector) */
+ if (size != MO_32) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_dp, s);
+ break;
+ case 0x03: /* USDOT */
+ if (size != MO_32) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_i8mm, s);
+ break;
+ case 0x04: /* SMMLA */
+ case 0x14: /* UMMLA */
+ case 0x05: /* USMMLA */
+ if (!is_q || size != MO_32) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_i8mm, s);
+ break;
+ case 0x18: /* FCMLA, #0 */
+ case 0x19: /* FCMLA, #90 */
+ case 0x1a: /* FCMLA, #180 */
+ case 0x1b: /* FCMLA, #270 */
+ case 0x1c: /* FCADD, #90 */
+ case 0x1e: /* FCADD, #270 */
+ if (size == 0
+ || (size == 1 && !dc_isar_feature(aa64_fp16, s))
+ || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_fcma, s);
+ break;
+ case 0x1d: /* BFMMLA */
+ if (size != MO_16 || !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_bf16, s);
+ break;
+ case 0x1f:
+ switch (size) {
+ case 1: /* BFDOT */
+ case 3: /* BFMLAL{B,T} */
+ feature = dc_isar_feature(aa64_bf16, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (opcode) {
+ case 0x0: /* SQRDMLAH (vector) */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlah_qc, size);
+ return;
+
+ case 0x1: /* SQRDMLSH (vector) */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlsh_qc, size);
+ return;
+
+ case 0x2: /* SDOT / UDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0,
+ u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b);
+ return;
+
+ case 0x3: /* USDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_usdot_b);
+ return;
+
+ case 0x04: /* SMMLA, UMMLA */
+ gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0,
+ u ? gen_helper_gvec_ummla_b
+ : gen_helper_gvec_smmla_b);
+ return;
+ case 0x05: /* USMMLA */
+ gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, gen_helper_gvec_usmmla_b);
+ return;
+
+ case 0x8: /* FCMLA, #0 */
+ case 0x9: /* FCMLA, #90 */
+ case 0xa: /* FCMLA, #180 */
+ case 0xb: /* FCMLA, #270 */
+ rot = extract32(opcode, 0, 2);
+ switch (size) {
+ case 1:
+ gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, true, rot,
+ gen_helper_gvec_fcmlah);
+ break;
+ case 2:
+ gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot,
+ gen_helper_gvec_fcmlas);
+ break;
+ case 3:
+ gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot,
+ gen_helper_gvec_fcmlad);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ case 0xc: /* FCADD, #90 */
+ case 0xe: /* FCADD, #270 */
+ rot = extract32(opcode, 1, 1);
+ switch (size) {
+ case 1:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcaddh);
+ break;
+ case 2:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcadds);
+ break;
+ case 3:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcaddd);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ case 0xd: /* BFMMLA */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla);
+ return;
+ case 0xf:
+ switch (size) {
+ case 1: /* BFDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot);
+ break;
+ case 3: /* BFMLAL{B,T} */
+ gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, false, is_q,
+ gen_helper_gvec_bfmlal);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q,
+ int size, int rn, int rd)
+{
+ /* Handle 2-reg-misc ops which are widening (so each size element
+ * in the source becomes a 2*size element in the destination.
+ * The only instruction like this is FCVTL.
+ */
+ int pass;
+
+ if (size == 3) {
+ /* 32 -> 64 bit fp conversion */
+ TCGv_i64 tcg_res[2];
+ int srcelt = is_q ? 2 : 0;
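+        /* FCVTL reads the low half of the source; FCVTL2 (Q=1) the high. */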
+
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32);
+ gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env);
+ tcg_temp_free_i32(tcg_op);
+ }
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+ } else {
+ /* 16 -> 32 bit fp conversion */
+ int srcelt = is_q ? 4 : 0;
+ TCGv_i32 tcg_res[4];
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ TCGv_i32 ahp = get_ahp_flag();
+
+ for (pass = 0; pass < 4; pass++) {
+ tcg_res[pass] = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16);
+ gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass],
+ fpst, ahp);
+ }
+ for (pass = 0; pass < 4; pass++) {
+ write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32);
+ tcg_temp_free_i32(tcg_res[pass]);
+ }
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(ahp);
+ }
+}
+
+static void handle_rev(DisasContext *s, int opcode, bool u,
+ bool is_q, int size, int rn, int rd)
+{
+ int op = (opcode << 1) | u;
+ int opsz = op + size;
+ int grp_size = 3 - opsz;
+ int dsize = is_q ? 128 : 64;
+ int i;
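+    /* grp_size is log2 of the number of elements per reversal group;
+     * the group itself is 8 bytes wide for REV64, 4 for REV32 and 2 for
+     * REV16. opsz >= 3 would give a group no wider than one element,
+     * which is an unallocated encoding.
+     */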
+
+ if (opsz >= 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (size == 0) {
+ /* Special case bytes, use bswap op on each group of elements */
+ int groups = dsize / (8 << grp_size);
+
+ for (i = 0; i < groups; i++) {
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_tmp, rn, i, grp_size);
+ switch (grp_size) {
+ case MO_16:
+ tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ);
+ break;
+ case MO_32:
+ tcg_gen_bswap32_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ);
+ break;
+ case MO_64:
+ tcg_gen_bswap64_i64(tcg_tmp, tcg_tmp);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ write_vec_element(s, tcg_tmp, rd, i, grp_size);
+ tcg_temp_free_i64(tcg_tmp);
+ }
+ clear_vec_high(s, is_q, rd);
+ } else {
+ int revmask = (1 << grp_size) - 1;
+ int esize = 8 << size;
+ int elements = dsize / esize;
+ TCGv_i64 tcg_rn = tcg_temp_new_i64();
+ TCGv_i64 tcg_rd = tcg_const_i64(0);
+ TCGv_i64 tcg_rd_hi = tcg_const_i64(0);
+
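+        /* XORing the element index with revmask reverses its position
+         * within its group; the deposits then scatter each element into
+         * the low or high 64-bit word of the result.
+         */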
+ for (i = 0; i < elements; i++) {
+ int e_rev = (i & 0xf) ^ revmask;
+ int off = e_rev * esize;
+ read_vec_element(s, tcg_rn, rn, i, size);
+ if (off >= 64) {
+ tcg_gen_deposit_i64(tcg_rd_hi, tcg_rd_hi,
+ tcg_rn, off - 64, esize);
+ } else {
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, off, esize);
+ }
+ }
+ write_vec_element(s, tcg_rd, rd, 0, MO_64);
+ write_vec_element(s, tcg_rd_hi, rd, 1, MO_64);
+
+ tcg_temp_free_i64(tcg_rd_hi);
+ tcg_temp_free_i64(tcg_rd);
+ tcg_temp_free_i64(tcg_rn);
+ }
+}
+
+static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u,
+ bool is_q, int size, int rn, int rd)
+{
+ /* Implement the pairwise operations from 2-misc:
+ * SADDLP, UADDLP, SADALP, UADALP.
+ * These all add pairs of elements in the input to produce a
+ * double-width result element in the output (possibly accumulating).
+ */
+ bool accum = (opcode == 0x6);
+ int maxpass = is_q ? 2 : 1;
+ int pass;
+ TCGv_i64 tcg_res[2];
+
+ if (size == 2) {
+ /* 32 + 32 -> 64 op */
+ MemOp memop = size + (u ? 0 : MO_SIGN);
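+        /* The signed forms (U=0) sign-extend the 32-bit inputs; the
+         * widening add itself is then an ordinary 64-bit add.
+         */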
+
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, pass * 2, memop);
+ read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop);
+ tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ if (accum) {
+ read_vec_element(s, tcg_op1, rd, pass, MO_64);
+ tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+ }
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+ } else {
+ for (pass = 0; pass < maxpass; pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ NeonGenOne64OpFn *genfn;
+ static NeonGenOne64OpFn * const fns[2][2] = {
+ { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 },
+ { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 },
+ };
+
+ genfn = fns[size][u];
+
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+ genfn(tcg_res[pass], tcg_op);
+
+ if (accum) {
+ read_vec_element(s, tcg_op, rd, pass, MO_64);
+ if (size == 0) {
+ gen_helper_neon_addl_u16(tcg_res[pass],
+ tcg_res[pass], tcg_op);
+ } else {
+ gen_helper_neon_addl_u32(tcg_res[pass],
+ tcg_res[pass], tcg_op);
+ }
+ }
+ tcg_temp_free_i64(tcg_op);
+ }
+ }
+ if (!is_q) {
+ tcg_res[1] = tcg_constant_i64(0);
+ }
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+}
+
+static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd)
+{
+ /* Implement SHLL and SHLL2 */
+ int pass;
+ int part = is_q ? 2 : 0;
+ TCGv_i64 tcg_res[2];
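+    /* SHLL2 (Q=1) takes its inputs from the high half of the source;
+     * each element is widened and then shifted left by its own width.
+     */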
+
+ for (pass = 0; pass < 2; pass++) {
+ static NeonGenWidenFn * const widenfns[3] = {
+ gen_helper_neon_widen_u8,
+ gen_helper_neon_widen_u16,
+ tcg_gen_extu_i32_i64,
+ };
+ NeonGenWidenFn *widenfn = widenfns[size];
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32);
+ tcg_res[pass] = tcg_temp_new_i64();
+ widenfn(tcg_res[pass], tcg_op);
+ tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size);
+
+ tcg_temp_free_i32(tcg_op);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+}
+
+/* AdvSIMD two reg misc
+ * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd |
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
+{
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 5);
+ bool u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ bool need_fpstatus = false;
+ bool need_rmode = false;
+ int rmode = -1;
+ TCGv_i32 tcg_rmode;
+ TCGv_ptr tcg_fpstatus;
+
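+    /* Many opcodes are handled entirely by the helpers called from this
+     * switch; the rest fall through to the shared element loops below,
+     * with need_fpstatus/need_rmode recording any FP state to set up.
+     */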
+ switch (opcode) {
+ case 0x0: /* REV64, REV32 */
+ case 0x1: /* REV16 */
+ handle_rev(s, opcode, u, is_q, size, rn, rd);
+ return;
+ case 0x5: /* CNT, NOT, RBIT */
+ if (u && size == 0) {
+ /* NOT */
+ break;
+ } else if (u && size == 1) {
+ /* RBIT */
+ break;
+ } else if (!u && size == 0) {
+ /* CNT */
+ break;
+ }
+ unallocated_encoding(s);
+ return;
+ case 0x12: /* XTN, XTN2, SQXTUN, SQXTUN2 */
+ case 0x14: /* SQXTN, SQXTN2, UQXTN, UQXTN2 */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ handle_2misc_narrow(s, false, opcode, u, is_q, size, rn, rd);
+ return;
+ case 0x4: /* CLS, CLZ */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x2: /* SADDLP, UADDLP */
+ case 0x6: /* SADALP, UADALP */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd);
+ return;
+ case 0x13: /* SHLL, SHLL2 */
+ if (u == 0 || size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_shll(s, is_q, size, rn, rd);
+ return;
+ case 0xa: /* CMLT */
+ if (u == 1) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x8: /* CMGT, CMGE */
+ case 0x9: /* CMEQ, CMLE */
+ case 0xb: /* ABS, NEG */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x3: /* SUQADD, USQADD */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_satacc(s, false, u, is_q, size, rn, rd);
+ return;
+ case 0x7: /* SQABS, SQNEG */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0xc ... 0xf:
+ case 0x16 ... 0x1f:
+ {
+ /* Floating point: U, size[1] and opcode indicate operation;
+ * size[0] indicates single or double precision.
+ */
+ int is_double = extract32(size, 0, 1);
+ opcode |= (extract32(size, 1, 1) << 5) | (u << 6);
+ size = is_double ? 3 : 2;
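+        /* From here on opcode has U in bit 6 and size[1] in bit 5, to
+         * match the FP case labels below, and size is the MemOp element
+         * size (MO_32 or MO_64).
+         */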
+ switch (opcode) {
+ case 0x2f: /* FABS */
+ case 0x6f: /* FNEG */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x1d: /* SCVTF */
+ case 0x5d: /* UCVTF */
+ {
+            bool is_signed = (opcode == 0x1d);
+ int elements = is_double ? 2 : is_q ? 4 : 2;
+ if (is_double && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size);
+ return;
+ }
+ case 0x2c: /* FCMGT (zero) */
+ case 0x2d: /* FCMEQ (zero) */
+ case 0x2e: /* FCMLT (zero) */
+ case 0x6c: /* FCMGE (zero) */
+ case 0x6d: /* FCMLE (zero) */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd);
+ return;
+ case 0x7f: /* FSQRT */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ need_fpstatus = true;
+ need_rmode = true;
+ rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x5c: /* FCVTAU */
+ case 0x1c: /* FCVTAS */
+ need_fpstatus = true;
+ need_rmode = true;
+ rmode = FPROUNDING_TIEAWAY;
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x3c: /* URECPE */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x3d: /* FRECPE */
+ case 0x7d: /* FRSQRTE */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd);
+ return;
+ case 0x56: /* FCVTXN, FCVTXN2 */
+ if (size == 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x16: /* FCVTN, FCVTN2 */
+ /* handle_2misc_narrow does a 2*size -> size operation, but these
+ * instructions encode the source size rather than dest size.
+ */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd);
+ return;
+ case 0x36: /* BFCVTN, BFCVTN2 */
+ if (!dc_isar_feature(aa64_bf16, s) || size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd);
+ return;
+ case 0x17: /* FCVTL, FCVTL2 */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_widening(s, opcode, is_q, size, rn, rd);
+ return;
+ case 0x18: /* FRINTN */
+ case 0x19: /* FRINTM */
+ case 0x38: /* FRINTP */
+ case 0x39: /* FRINTZ */
+ need_rmode = true;
+ rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
+ /* fall through */
+ case 0x59: /* FRINTX */
+ case 0x79: /* FRINTI */
+ need_fpstatus = true;
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x58: /* FRINTA */
+ need_rmode = true;
+ rmode = FPROUNDING_TIEAWAY;
+ need_fpstatus = true;
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x7c: /* URSQRTE */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x1e: /* FRINT32Z */
+ case 0x1f: /* FRINT64Z */
+ need_rmode = true;
+ rmode = FPROUNDING_ZERO;
+ /* fall through */
+ case 0x5e: /* FRINT32X */
+ case 0x5f: /* FRINT64X */
+ need_fpstatus = true;
+ if ((size == 3 && !is_q) || !dc_isar_feature(aa64_frint, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (need_fpstatus || need_rmode) {
+ tcg_fpstatus = fpstatus_ptr(FPST_FPCR);
+ } else {
+ tcg_fpstatus = NULL;
+ }
+ if (need_rmode) {
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ } else {
+ tcg_rmode = NULL;
+ }
+
+ switch (opcode) {
+ case 0x5:
+ if (u && size == 0) { /* NOT */
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0);
+ return;
+ }
+ break;
+ case 0x8: /* CMGT, CMGE */
+ if (u) {
+ gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cge0, size);
+ } else {
+ gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cgt0, size);
+ }
+ return;
+ case 0x9: /* CMEQ, CMLE */
+ if (u) {
+ gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cle0, size);
+ } else {
+ gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_ceq0, size);
+ }
+ return;
+ case 0xa: /* CMLT */
+ gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_clt0, size);
+ return;
+ case 0xb:
+ if (u) { /* ABS, NEG */
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size);
+ } else {
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_abs, size);
+ }
+ return;
+ }
+
+ if (size == 3) {
+ /* All 64-bit element operations can be shared with scalar 2misc */
+ int pass;
+
+        /* Coverity's analysis shows that (size == 3 && !is_q) has been
+         * eliminated on all paths leading here, so assert it rather
+         * than re-checking.
+         */
+ tcg_debug_assert(is_q);
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+
+ handle_2misc_64(s, opcode, u, tcg_res, tcg_op,
+ tcg_rmode, tcg_fpstatus);
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op);
+ }
+ } else {
+ int pass;
+
+ for (pass = 0; pass < (is_q ? 4 : 2); pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
+
+ if (size == 2) {
+ /* Special cases for 32 bit elements */
+ switch (opcode) {
+ case 0x4: /* CLS */
+ if (u) {
+ tcg_gen_clzi_i32(tcg_res, tcg_op, 32);
+ } else {
+ tcg_gen_clrsb_i32(tcg_res, tcg_op);
+ }
+ break;
+ case 0x7: /* SQABS, SQNEG */
+ if (u) {
+ gen_helper_neon_qneg_s32(tcg_res, cpu_env, tcg_op);
+ } else {
+ gen_helper_neon_qabs_s32(tcg_res, cpu_env, tcg_op);
+ }
+ break;
+ case 0x2f: /* FABS */
+ gen_helper_vfp_abss(tcg_res, tcg_op);
+ break;
+ case 0x6f: /* FNEG */
+ gen_helper_vfp_negs(tcg_res, tcg_op);
+ break;
+ case 0x7f: /* FSQRT */
+ gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env);
+ break;
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x1c: /* FCVTAS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ gen_helper_vfp_tosls(tcg_res, tcg_op,
+ tcg_constant_i32(0), tcg_fpstatus);
+ break;
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x5c: /* FCVTAU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ gen_helper_vfp_touls(tcg_res, tcg_op,
+ tcg_constant_i32(0), tcg_fpstatus);
+ break;
+ case 0x18: /* FRINTN */
+ case 0x19: /* FRINTM */
+ case 0x38: /* FRINTP */
+ case 0x39: /* FRINTZ */
+ case 0x58: /* FRINTA */
+ case 0x79: /* FRINTI */
+ gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x59: /* FRINTX */
+ gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x7c: /* URSQRTE */
+ gen_helper_rsqrte_u32(tcg_res, tcg_op);
+ break;
+ case 0x1e: /* FRINT32Z */
+ case 0x5e: /* FRINT32X */
+ gen_helper_frint32_s(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x1f: /* FRINT64Z */
+ case 0x5f: /* FRINT64X */
+ gen_helper_frint64_s(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ /* Use helpers for 8 and 16 bit elements */
+ switch (opcode) {
+ case 0x5: /* CNT, RBIT */
+ /* For these two insns size is part of the opcode specifier
+ * (handled earlier); they always operate on byte elements.
+ */
+ if (u) {
+ gen_helper_neon_rbit_u8(tcg_res, tcg_op);
+ } else {
+ gen_helper_neon_cnt_u8(tcg_res, tcg_op);
+ }
+ break;
+ case 0x7: /* SQABS, SQNEG */
+ {
+ NeonGenOneOpEnvFn *genfn;
+ static NeonGenOneOpEnvFn * const fns[2][2] = {
+ { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 },
+ { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 },
+ };
+ genfn = fns[size][u];
+ genfn(tcg_res, cpu_env, tcg_op);
+ break;
+ }
+ case 0x4: /* CLS, CLZ */
+ if (u) {
+ if (size == 0) {
+ gen_helper_neon_clz_u8(tcg_res, tcg_op);
+ } else {
+ gen_helper_neon_clz_u16(tcg_res, tcg_op);
+ }
+ } else {
+ if (size == 0) {
+ gen_helper_neon_cls_s8(tcg_res, tcg_op);
+ } else {
+ gen_helper_neon_cls_s16(tcg_res, tcg_op);
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ }
+ }
+ clear_vec_high(s, is_q, rd);
+
+ if (need_rmode) {
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ tcg_temp_free_i32(tcg_rmode);
+ }
+ if (need_fpstatus) {
+ tcg_temp_free_ptr(tcg_fpstatus);
+ }
+}
+
+/* AdvSIMD [scalar] two register miscellaneous (FP16)
+ *
+ * 31 30 29 28 27 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +---+---+---+---+---------+---+-------------+--------+-----+------+------+
+ * | 0 | Q | U | S | 1 1 1 0 | a | 1 1 1 1 0 0 | opcode | 1 0 | Rn | Rd |
+ * +---+---+---+---+---------+---+-------------+--------+-----+------+------+
+ * mask: 1000 1111 0111 1110 0000 1100 0000 0000 0x8f7e 0c00
+ * val: 0000 1110 0111 1000 0000 1000 0000 0000 0x0e78 0800
+ *
+ * This actually covers two groups where scalar access is governed by
+ * bit 28. Several of the instructions (the float-to-integral ones) exist
+ * only in the vector form and are unallocated in the scalar decode. Also,
+ * in the scalar decode Q is always 1.
+ */
+static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn)
+{
+ int fpop, opcode, a, u;
+ int rn, rd;
+ bool is_q;
+ bool is_scalar;
+ bool only_in_vector = false;
+
+ int pass;
+ TCGv_i32 tcg_rmode = NULL;
+ TCGv_ptr tcg_fpstatus = NULL;
+ bool need_rmode = false;
+ bool need_fpst = true;
+ int rmode;
+
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ rd = extract32(insn, 0, 5);
+ rn = extract32(insn, 5, 5);
+
+ a = extract32(insn, 23, 1);
+ u = extract32(insn, 29, 1);
+ is_scalar = extract32(insn, 28, 1);
+ is_q = extract32(insn, 30, 1);
+
+ opcode = extract32(insn, 12, 5);
+ fpop = deposit32(opcode, 5, 1, a);
+ fpop = deposit32(fpop, 6, 1, u);
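+    /* fpop mirrors the selector used for the single/double path:
+     * opcode in bits [4:0], the 'a' bit in bit 5, U in bit 6.
+     */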
+
+ switch (fpop) {
+ case 0x1d: /* SCVTF */
+ case 0x5d: /* UCVTF */
+ {
+ int elements;
+
+ if (is_scalar) {
+ elements = 1;
+ } else {
+ elements = (is_q ? 8 : 4);
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_simd_intfp_conv(s, rd, rn, elements, !u, 0, MO_16);
+ return;
+ }
+ break;
+ case 0x2c: /* FCMGT (zero) */
+ case 0x2d: /* FCMEQ (zero) */
+ case 0x2e: /* FCMLT (zero) */
+ case 0x6c: /* FCMGE (zero) */
+ case 0x6d: /* FCMLE (zero) */
+ handle_2misc_fcmp_zero(s, fpop, is_scalar, 0, is_q, MO_16, rn, rd);
+ return;
+ case 0x3d: /* FRECPE */
+ case 0x3f: /* FRECPX */
+ break;
+ case 0x18: /* FRINTN */
+ need_rmode = true;
+ only_in_vector = true;
+ rmode = FPROUNDING_TIEEVEN;
+ break;
+ case 0x19: /* FRINTM */
+ need_rmode = true;
+ only_in_vector = true;
+ rmode = FPROUNDING_NEGINF;
+ break;
+ case 0x38: /* FRINTP */
+ need_rmode = true;
+ only_in_vector = true;
+ rmode = FPROUNDING_POSINF;
+ break;
+ case 0x39: /* FRINTZ */
+ need_rmode = true;
+ only_in_vector = true;
+ rmode = FPROUNDING_ZERO;
+ break;
+ case 0x58: /* FRINTA */
+ need_rmode = true;
+ only_in_vector = true;
+ rmode = FPROUNDING_TIEAWAY;
+ break;
+ case 0x59: /* FRINTX */
+ case 0x79: /* FRINTI */
+ only_in_vector = true;
+ /* current rounding mode */
+ break;
+ case 0x1a: /* FCVTNS */
+ need_rmode = true;
+ rmode = FPROUNDING_TIEEVEN;
+ break;
+ case 0x1b: /* FCVTMS */
+ need_rmode = true;
+ rmode = FPROUNDING_NEGINF;
+ break;
+ case 0x1c: /* FCVTAS */
+ need_rmode = true;
+ rmode = FPROUNDING_TIEAWAY;
+ break;
+ case 0x3a: /* FCVTPS */
+ need_rmode = true;
+ rmode = FPROUNDING_POSINF;
+ break;
+ case 0x3b: /* FCVTZS */
+ need_rmode = true;
+ rmode = FPROUNDING_ZERO;
+ break;
+ case 0x5a: /* FCVTNU */
+ need_rmode = true;
+ rmode = FPROUNDING_TIEEVEN;
+ break;
+ case 0x5b: /* FCVTMU */
+ need_rmode = true;
+ rmode = FPROUNDING_NEGINF;
+ break;
+ case 0x5c: /* FCVTAU */
+ need_rmode = true;
+ rmode = FPROUNDING_TIEAWAY;
+ break;
+ case 0x7a: /* FCVTPU */
+ need_rmode = true;
+ rmode = FPROUNDING_POSINF;
+ break;
+ case 0x7b: /* FCVTZU */
+ need_rmode = true;
+ rmode = FPROUNDING_ZERO;
+ break;
+ case 0x2f: /* FABS */
+ case 0x6f: /* FNEG */
+ need_fpst = false;
+ break;
+ case 0x7d: /* FRSQRTE */
+ case 0x7f: /* FSQRT (vector) */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Check additional constraints for the scalar encoding */
+ if (is_scalar) {
+ if (!is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* FRINTxx is only in the vector form */
+ if (only_in_vector) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (need_rmode || need_fpst) {
+ tcg_fpstatus = fpstatus_ptr(FPST_FPCR_F16);
+ }
+
+ if (need_rmode) {
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ }
+
+ if (is_scalar) {
+ TCGv_i32 tcg_op = read_fp_hreg(s, rn);
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ switch (fpop) {
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x1c: /* FCVTAS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x3d: /* FRECPE */
+ gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x3f: /* FRECPX */
+ gen_helper_frecpx_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x5c: /* FCVTAU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x6f: /* FNEG */
+ tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000);
+ break;
+ case 0x7d: /* FRSQRTE */
+ gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ /* limit any sign extension going on */
+ tcg_gen_andi_i32(tcg_res, tcg_res, 0xffff);
+ write_fp_sreg(s, rd, tcg_res);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ } else {
+ for (pass = 0; pass < (is_q ? 8 : 4); pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, pass, MO_16);
+
+ switch (fpop) {
+ case 0x1a: /* FCVTNS */
+ case 0x1b: /* FCVTMS */
+ case 0x1c: /* FCVTAS */
+ case 0x3a: /* FCVTPS */
+ case 0x3b: /* FCVTZS */
+ gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x3d: /* FRECPE */
+ gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x5a: /* FCVTNU */
+ case 0x5b: /* FCVTMU */
+ case 0x5c: /* FCVTAU */
+ case 0x7a: /* FCVTPU */
+ case 0x7b: /* FCVTZU */
+ gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x18: /* FRINTN */
+ case 0x19: /* FRINTM */
+ case 0x38: /* FRINTP */
+ case 0x39: /* FRINTZ */
+ case 0x58: /* FRINTA */
+ case 0x79: /* FRINTI */
+ gen_helper_advsimd_rinth(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x59: /* FRINTX */
+ gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x2f: /* FABS */
+ tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff);
+ break;
+ case 0x6f: /* FNEG */
+ tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000);
+ break;
+ case 0x7d: /* FRSQRTE */
+ gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ case 0x7f: /* FSQRT */
+ gen_helper_sqrt_f16(tcg_res, tcg_op, tcg_fpstatus);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_16);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op);
+ }
+
+ clear_vec_high(s, is_q, rd);
+ }
+
+ if (tcg_rmode) {
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus);
+ tcg_temp_free_i32(tcg_rmode);
+ }
+
+ if (tcg_fpstatus) {
+ tcg_temp_free_ptr(tcg_fpstatus);
+ }
+}
+
+/* AdvSIMD scalar x indexed element
+ * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * | 0 1 | U | 1 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd |
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * AdvSIMD vector x indexed element
+ * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd |
+ * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
+ */
+static void disas_simd_indexed(DisasContext *s, uint32_t insn)
+{
+ /* This encoding has two kinds of instruction:
+ * normal, where we perform elt x idxelt => elt for each
+ * element in the vector
+ * long, where we perform elt x idxelt and generate a result of
+ * double the width of the input element
+     * The long ops have a 'part' specifier (i.e. they come in INSN,
+     * INSN2 pairs).
+ */
+ bool is_scalar = extract32(insn, 28, 1);
+ bool is_q = extract32(insn, 30, 1);
+ bool u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int l = extract32(insn, 21, 1);
+ int m = extract32(insn, 20, 1);
+ /* Note that the Rm field here is only 4 bits, not 5 as it usually is */
+ int rm = extract32(insn, 16, 4);
+ int opcode = extract32(insn, 12, 4);
+ int h = extract32(insn, 11, 1);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ bool is_long = false;
+ int is_fp = 0;
+ bool is_fp16 = false;
+ int index;
+ TCGv_ptr fpst;
+
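+    /* The first switch classifies the operation (integer, long, fp or
+     * complex fp) and applies the feature checks; the FP forms then
+     * convert size to a MemOp before the element index and the full Rm
+     * are derived from it.
+     */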
+ switch (16 * u + opcode) {
+ case 0x08: /* MUL */
+ case 0x10: /* MLA */
+ case 0x14: /* MLS */
+ if (is_scalar) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x02: /* SMLAL, SMLAL2 */
+ case 0x12: /* UMLAL, UMLAL2 */
+ case 0x06: /* SMLSL, SMLSL2 */
+ case 0x16: /* UMLSL, UMLSL2 */
+ case 0x0a: /* SMULL, SMULL2 */
+ case 0x1a: /* UMULL, UMULL2 */
+ if (is_scalar) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_long = true;
+ break;
+ case 0x03: /* SQDMLAL, SQDMLAL2 */
+ case 0x07: /* SQDMLSL, SQDMLSL2 */
+ case 0x0b: /* SQDMULL, SQDMULL2 */
+ is_long = true;
+ break;
+ case 0x0c: /* SQDMULH */
+ case 0x0d: /* SQRDMULH */
+ break;
+ case 0x01: /* FMLA */
+ case 0x05: /* FMLS */
+ case 0x09: /* FMUL */
+ case 0x19: /* FMULX */
+ is_fp = 1;
+ break;
+ case 0x1d: /* SQRDMLAH */
+ case 0x1f: /* SQRDMLSH */
+ if (!dc_isar_feature(aa64_rdm, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x0e: /* SDOT */
+ case 0x1e: /* UDOT */
+ if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_dp, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x0f:
+ switch (size) {
+ case 0: /* SUDOT */
+ case 2: /* USDOT */
+ if (is_scalar || !dc_isar_feature(aa64_i8mm, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ size = MO_32;
+ break;
+ case 1: /* BFDOT */
+ if (is_scalar || !dc_isar_feature(aa64_bf16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ size = MO_32;
+ break;
+ case 3: /* BFMLAL{B,T} */
+ if (is_scalar || !dc_isar_feature(aa64_bf16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* can't set is_fp without other incorrect size checks */
+ size = MO_16;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x11: /* FCMLA #0 */
+ case 0x13: /* FCMLA #90 */
+ case 0x15: /* FCMLA #180 */
+ case 0x17: /* FCMLA #270 */
+ if (is_scalar || !dc_isar_feature(aa64_fcma, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_fp = 2;
+ break;
+ case 0x00: /* FMLAL */
+ case 0x04: /* FMLSL */
+ case 0x18: /* FMLAL2 */
+ case 0x1c: /* FMLSL2 */
+ if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ size = MO_16;
+ /* is_fp, but we pass cpu_env not fp_status. */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (is_fp) {
+ case 1: /* normal fp */
+ /* convert insn encoded size to MemOp size */
+ switch (size) {
+ case 0: /* half-precision */
+ size = MO_16;
+ is_fp16 = true;
+ break;
+ case MO_32: /* single precision */
+ case MO_64: /* double precision */
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+
+ case 2: /* complex fp */
+ /* Each indexable element is a complex pair. */
+ size += 1;
+ switch (size) {
+ case MO_32:
+ if (h && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_fp16 = true;
+ break;
+ case MO_64:
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+
+ default: /* integer */
+ switch (size) {
+ case MO_8:
+ case MO_64:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+ if (is_fp16 && !dc_isar_feature(aa64_fp16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Given MemOp size, adjust register and indexing. */
+ switch (size) {
+ case MO_16:
+ index = h << 2 | l << 1 | m;
+ break;
+ case MO_32:
+ index = h << 1 | l;
+ rm |= m << 4;
+ break;
+ case MO_64:
+ if (l || !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ index = h;
+ rm |= m << 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (is_fp) {
+ fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR);
+ } else {
+ fpst = NULL;
+ }
+
+ switch (16 * u + opcode) {
+ case 0x0e: /* SDOT */
+ case 0x1e: /* UDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ u ? gen_helper_gvec_udot_idx_b
+ : gen_helper_gvec_sdot_idx_b);
+ return;
+ case 0x0f:
+ switch (extract32(insn, 22, 2)) {
+ case 0: /* SUDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_sudot_idx_b);
+ return;
+ case 1: /* BFDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_bfdot_idx);
+ return;
+ case 2: /* USDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_usdot_idx_b);
+ return;
+ case 3: /* BFMLAL{B,T} */
+ gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, 0, (index << 1) | is_q,
+ gen_helper_gvec_bfmlal_idx);
+ return;
+ }
+ g_assert_not_reached();
+ case 0x11: /* FCMLA #0 */
+ case 0x13: /* FCMLA #90 */
+ case 0x15: /* FCMLA #180 */
+ case 0x17: /* FCMLA #270 */
+ {
+ int rot = extract32(insn, 13, 2);
+ int data = (index << 2) | rot;
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, rd), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s), data,
+ size == MO_64
+ ? gen_helper_gvec_fcmlas_idx
+ : gen_helper_gvec_fcmlah_idx);
+ tcg_temp_free_ptr(fpst);
+ }
+ return;
+
+ case 0x00: /* FMLAL */
+ case 0x04: /* FMLSL */
+ case 0x18: /* FMLAL2 */
+ case 0x1c: /* FMLSL2 */
+ {
+ int is_s = extract32(opcode, 2, 1);
+ int is_2 = u;
+ int data = (index << 2) | (is_2 << 1) | is_s;
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), cpu_env,
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ data, gen_helper_gvec_fmlal_idx_a64);
+ }
+ return;
+
+ case 0x08: /* MUL */
+ if (!is_long && !is_scalar) {
+ static gen_helper_gvec_3 * const fns[3] = {
+ gen_helper_gvec_mul_idx_h,
+ gen_helper_gvec_mul_idx_s,
+ gen_helper_gvec_mul_idx_d,
+ };
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ index, fns[size - 1]);
+ return;
+ }
+ break;
+
+ case 0x10: /* MLA */
+ if (!is_long && !is_scalar) {
+ static gen_helper_gvec_4 * const fns[3] = {
+ gen_helper_gvec_mla_idx_h,
+ gen_helper_gvec_mla_idx_s,
+ gen_helper_gvec_mla_idx_d,
+ };
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, rd),
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ index, fns[size - 1]);
+ return;
+ }
+ break;
+
+ case 0x14: /* MLS */
+ if (!is_long && !is_scalar) {
+ static gen_helper_gvec_4 * const fns[3] = {
+ gen_helper_gvec_mls_idx_h,
+ gen_helper_gvec_mls_idx_s,
+ gen_helper_gvec_mls_idx_d,
+ };
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, rd),
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ index, fns[size - 1]);
+ return;
+ }
+ break;
+ }
+
+ if (size == 3) {
+ TCGv_i64 tcg_idx = tcg_temp_new_i64();
+ int pass;
+
+ assert(is_fp && is_q && !is_long);
+
+ read_vec_element(s, tcg_idx, rm, index, MO_64);
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op, rn, pass, MO_64);
+
+ switch (16 * u + opcode) {
+ case 0x05: /* FMLS */
+ /* As usual for ARM, separate negation for fused multiply-add */
+ gen_helper_vfp_negd(tcg_op, tcg_op);
+ /* fall through */
+ case 0x01: /* FMLA */
+ read_vec_element(s, tcg_res, rd, pass, MO_64);
+ gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst);
+ break;
+ case 0x09: /* FMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ case 0x19: /* FMULX */
+ gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_op);
+ tcg_temp_free_i64(tcg_res);
+ }
+
+ tcg_temp_free_i64(tcg_idx);
+ clear_vec_high(s, !is_scalar, rd);
+ } else if (!is_long) {
+ /* 32 bit floating point, or 16 or 32 bit integer.
+ * For the 16 bit scalar case we use the usual Neon helpers and
+ * rely on the fact that 0 op 0 == 0 with no side effects.
+ */
+ TCGv_i32 tcg_idx = tcg_temp_new_i32();
+ int pass, maxpasses;
+
+ if (is_scalar) {
+ maxpasses = 1;
+ } else {
+ maxpasses = is_q ? 4 : 2;
+ }
+
+ read_vec_element_i32(s, tcg_idx, rm, index, size);
+
+ if (size == 1 && !is_scalar) {
+ /* The simplest way to handle the 16x16 indexed ops is to duplicate
+ * the index into both halves of the 32 bit tcg_idx and then use
+ * the usual Neon helpers.
+ */
+ tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16);
+ }
+
+ for (pass = 0; pass < maxpasses; pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32);
+
+ switch (16 * u + opcode) {
+ case 0x08: /* MUL */
+ case 0x10: /* MLA */
+ case 0x14: /* MLS */
+ {
+ static NeonGenTwoOpFn * const fns[2][2] = {
+ { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
+ { tcg_gen_add_i32, tcg_gen_sub_i32 },
+ };
+ NeonGenTwoOpFn *genfn;
+ bool is_sub = opcode == 0x4;
+
+ if (size == 1) {
+ gen_helper_neon_mul_u16(tcg_res, tcg_op, tcg_idx);
+ } else {
+ tcg_gen_mul_i32(tcg_res, tcg_op, tcg_idx);
+ }
+ if (opcode == 0x8) {
+ break;
+ }
+ read_vec_element_i32(s, tcg_op, rd, pass, MO_32);
+ genfn = fns[size - 1][is_sub];
+ genfn(tcg_res, tcg_op, tcg_res);
+ break;
+ }
+ case 0x05: /* FMLS */
+ case 0x01: /* FMLA */
+ read_vec_element_i32(s, tcg_res, rd, pass,
+ is_scalar ? size : MO_32);
+ switch (size) {
+ case 1:
+ if (opcode == 0x5) {
+ /* As usual for ARM, separate negation for fused
+ * multiply-add */
+ tcg_gen_xori_i32(tcg_op, tcg_op, 0x80008000);
+ }
+ if (is_scalar) {
+ gen_helper_advsimd_muladdh(tcg_res, tcg_op, tcg_idx,
+ tcg_res, fpst);
+ } else {
+ gen_helper_advsimd_muladd2h(tcg_res, tcg_op, tcg_idx,
+ tcg_res, fpst);
+ }
+ break;
+ case 2:
+ if (opcode == 0x5) {
+ /* As usual for ARM, separate negation for
+ * fused multiply-add */
+ tcg_gen_xori_i32(tcg_op, tcg_op, 0x80000000);
+ }
+ gen_helper_vfp_muladds(tcg_res, tcg_op, tcg_idx,
+ tcg_res, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+ case 0x09: /* FMUL */
+ switch (size) {
+ case 1:
+ if (is_scalar) {
+ gen_helper_advsimd_mulh(tcg_res, tcg_op,
+ tcg_idx, fpst);
+ } else {
+ gen_helper_advsimd_mul2h(tcg_res, tcg_op,
+ tcg_idx, fpst);
+ }
+ break;
+ case 2:
+ gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+ case 0x19: /* FMULX */
+ switch (size) {
+ case 1:
+ if (is_scalar) {
+ gen_helper_advsimd_mulxh(tcg_res, tcg_op,
+ tcg_idx, fpst);
+ } else {
+ gen_helper_advsimd_mulx2h(tcg_res, tcg_op,
+ tcg_idx, fpst);
+ }
+ break;
+ case 2:
+ gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+ case 0x0c: /* SQDMULH */
+ if (size == 1) {
+ gen_helper_neon_qdmulh_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx);
+ } else {
+ gen_helper_neon_qdmulh_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx);
+ }
+ break;
+ case 0x0d: /* SQRDMULH */
+ if (size == 1) {
+ gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx);
+ } else {
+ gen_helper_neon_qrdmulh_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx);
+ }
+ break;
+ case 0x1d: /* SQRDMLAH */
+ read_vec_element_i32(s, tcg_res, rd, pass,
+ is_scalar ? size : MO_32);
+ if (size == 1) {
+ gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ } else {
+ gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ }
+ break;
+ case 0x1f: /* SQRDMLSH */
+ read_vec_element_i32(s, tcg_res, rd, pass,
+ is_scalar ? size : MO_32);
+ if (size == 1) {
+ gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ } else {
+ gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (is_scalar) {
+ write_fp_sreg(s, rd, tcg_res);
+ } else {
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+ }
+
+ tcg_temp_free_i32(tcg_op);
+ tcg_temp_free_i32(tcg_res);
+ }
+
+ tcg_temp_free_i32(tcg_idx);
+ clear_vec_high(s, is_q, rd);
+ } else {
+ /* long ops: 16x16->32 or 32x32->64 */
+ TCGv_i64 tcg_res[2];
+ int pass;
+ bool satop = extract32(opcode, 0, 1);
+ MemOp memop = MO_32;
+
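+ /* The source elements are signed except for the unsigned U* ops;
+ * the saturating-doubling SQDM* ops are always signed.
+ */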
+ if (satop || !u) {
+ memop |= MO_SIGN;
+ }
+
+ if (size == 2) {
+ TCGv_i64 tcg_idx = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_idx, rm, index, memop);
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ TCGv_i64 tcg_op = tcg_temp_new_i64();
+ TCGv_i64 tcg_passres;
+ int passelt;
+
+ if (is_scalar) {
+ passelt = 0;
+ } else {
+ passelt = pass + (is_q * 2);
+ }
+
+ read_vec_element(s, tcg_op, rn, passelt, memop);
+
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ if (opcode == 0xa || opcode == 0xb) {
+ /* Non-accumulating ops */
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx);
+ tcg_temp_free_i64(tcg_op);
+
+ if (satop) {
+ /* saturating, doubling */
+ gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env,
+ tcg_passres, tcg_passres);
+ }
+
+ if (opcode == 0xa || opcode == 0xb) {
+ continue;
+ }
+
+ /* Accumulating op: handle accumulate step */
+ read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+
+ switch (opcode) {
+ case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ break;
+ case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ break;
+ case 0x7: /* SQDMLSL, SQDMLSL2 */
+ tcg_gen_neg_i64(tcg_passres, tcg_passres);
+ /* fall through */
+ case 0x3: /* SQDMLAL, SQDMLAL2 */
+ gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env,
+ tcg_res[pass],
+ tcg_passres);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i64(tcg_passres);
+ }
+ tcg_temp_free_i64(tcg_idx);
+
+ clear_vec_high(s, !is_scalar, rd);
+ } else {
+ TCGv_i32 tcg_idx = tcg_temp_new_i32();
+
+ assert(size == 1);
+ read_vec_element_i32(s, tcg_idx, rm, index, size);
+
+ if (!is_scalar) {
+ /* The simplest way to handle the 16x16 indexed ops is to
+ * duplicate the index into both halves of the 32 bit tcg_idx
+ * and then use the usual Neon helpers.
+ */
+ tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16);
+ }
+
+ for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+ TCGv_i32 tcg_op = tcg_temp_new_i32();
+ TCGv_i64 tcg_passres;
+
+ if (is_scalar) {
+ read_vec_element_i32(s, tcg_op, rn, pass, size);
+ } else {
+ read_vec_element_i32(s, tcg_op, rn,
+ pass + (is_q * 2), MO_32);
+ }
+
+ tcg_res[pass] = tcg_temp_new_i64();
+
+ if (opcode == 0xa || opcode == 0xb) {
+ /* Non-accumulating ops */
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ if (memop & MO_SIGN) {
+ gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx);
+ } else {
+ gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx);
+ }
+ if (satop) {
+ gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
+ tcg_passres, tcg_passres);
+ }
+ tcg_temp_free_i32(tcg_op);
+
+ if (opcode == 0xa || opcode == 0xb) {
+ continue;
+ }
+
+ /* Accumulating op: handle accumulate step */
+ read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+
+ switch (opcode) {
+ case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ break;
+ case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ break;
+ case 0x7: /* SQDMLSL, SQDMLSL2 */
+ gen_helper_neon_negl_u32(tcg_passres, tcg_passres);
+ /* fall through */
+ case 0x3: /* SQDMLAL, SQDMLAL2 */
+ gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env,
+ tcg_res[pass],
+ tcg_passres);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i64(tcg_passres);
+ }
+ tcg_temp_free_i32(tcg_idx);
+
+ if (is_scalar) {
+ tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]);
+ }
+ }
+
+ if (is_scalar) {
+ tcg_res[1] = tcg_constant_i64(0);
+ }
+
+ for (pass = 0; pass < 2; pass++) {
+ write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_res[pass]);
+ }
+ }
+
+ if (fpst) {
+ tcg_temp_free_ptr(fpst);
+ }
+}
+
+/* Crypto AES
+ * 31 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +-----------------+------+-----------+--------+-----+------+------+
+ * | 0 1 0 0 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd |
+ * +-----------------+------+-----------+--------+-----+------+------+
+ */
+static void disas_crypto_aes(DisasContext *s, uint32_t insn)
+{
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ int decrypt;
+ gen_helper_gvec_2 *genfn2 = NULL;
+ gen_helper_gvec_3 *genfn3 = NULL;
+
+ if (!dc_isar_feature(aa64_aes, s) || size != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x4: /* AESE */
+ decrypt = 0;
+ genfn3 = gen_helper_crypto_aese;
+ break;
+ case 0x6: /* AESMC */
+ decrypt = 0;
+ genfn2 = gen_helper_crypto_aesmc;
+ break;
+ case 0x5: /* AESD */
+ decrypt = 1;
+ genfn3 = gen_helper_crypto_aese;
+ break;
+ case 0x7: /* AESIMC */
+ decrypt = 1;
+ genfn2 = gen_helper_crypto_aesmc;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
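+ /* AESD and AESIMC reuse the AESE and AESMC helpers, with 'decrypt'
+ * selecting the inverse transform inside the helper.
+ */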
+ if (genfn2) {
+ gen_gvec_op2_ool(s, true, rd, rn, decrypt, genfn2);
+ } else {
+ gen_gvec_op3_ool(s, true, rd, rd, rn, decrypt, genfn3);
+ }
+}
+
+/* Crypto three-reg SHA
+ * 31 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0
+ * +-----------------+------+---+------+---+--------+-----+------+------+
+ * | 0 1 0 1 1 1 1 0 | size | 0 | Rm | 0 | opcode | 0 0 | Rn | Rd |
+ * +-----------------+------+---+------+---+--------+-----+------+------+
+ */
+static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn)
+{
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 3);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ gen_helper_gvec_3 *genfn;
+ bool feature;
+
+ if (size != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0: /* SHA1C */
+ genfn = gen_helper_crypto_sha1c;
+ feature = dc_isar_feature(aa64_sha1, s);
+ break;
+ case 1: /* SHA1P */
+ genfn = gen_helper_crypto_sha1p;
+ feature = dc_isar_feature(aa64_sha1, s);
+ break;
+ case 2: /* SHA1M */
+ genfn = gen_helper_crypto_sha1m;
+ feature = dc_isar_feature(aa64_sha1, s);
+ break;
+ case 3: /* SHA1SU0 */
+ genfn = gen_helper_crypto_sha1su0;
+ feature = dc_isar_feature(aa64_sha1, s);
+ break;
+ case 4: /* SHA256H */
+ genfn = gen_helper_crypto_sha256h;
+ feature = dc_isar_feature(aa64_sha256, s);
+ break;
+ case 5: /* SHA256H2 */
+ genfn = gen_helper_crypto_sha256h2;
+ feature = dc_isar_feature(aa64_sha256, s);
+ break;
+ case 6: /* SHA256SU1 */
+ genfn = gen_helper_crypto_sha256su1;
+ feature = dc_isar_feature(aa64_sha256, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+ gen_gvec_op3_ool(s, true, rd, rn, rm, 0, genfn);
+}
+
+/* Crypto two-reg SHA
+ * 31 24 23 22 21 17 16 12 11 10 9 5 4 0
+ * +-----------------+------+-----------+--------+-----+------+------+
+ * | 0 1 0 1 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd |
+ * +-----------------+------+-----------+--------+-----+------+------+
+ */
+static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn)
+{
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ gen_helper_gvec_2 *genfn;
+ bool feature;
+
+ if (size != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0: /* SHA1H */
+ feature = dc_isar_feature(aa64_sha1, s);
+ genfn = gen_helper_crypto_sha1h;
+ break;
+ case 1: /* SHA1SU1 */
+ feature = dc_isar_feature(aa64_sha1, s);
+ genfn = gen_helper_crypto_sha1su1;
+ break;
+ case 2: /* SHA256SU0 */
+ feature = dc_isar_feature(aa64_sha256, s);
+ genfn = gen_helper_crypto_sha256su0;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+ gen_gvec_op2_ool(s, true, rd, rn, 0, genfn);
+}
+
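+/* RAX1: d = n XOR ROL64(m, 1), applied to each 64-bit element. */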
+static void gen_rax1_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+ tcg_gen_rotli_i64(d, m, 1);
+ tcg_gen_xor_i64(d, d, n);
+}
+
+static void gen_rax1_vec(unsigned vece, TCGv_vec d, TCGv_vec n, TCGv_vec m)
+{
+ tcg_gen_rotli_vec(vece, d, m, 1);
+ tcg_gen_xor_vec(vece, d, d, n);
+}
+
+void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
+ static const GVecGen3 op = {
+ .fni8 = gen_rax1_i64,
+ .fniv = gen_rax1_vec,
+ .opt_opc = vecop_list,
+ .fno = gen_helper_crypto_rax1,
+ .vece = MO_64,
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &op);
+}
+
+/* Crypto three-reg SHA512
+ * 31 21 20 16 15 14 13 12 11 10 9 5 4 0
+ * +-----------------------+------+---+---+-----+--------+------+------+
+ * | 1 1 0 0 1 1 1 0 0 1 1 | Rm | 1 | O | 0 0 | opcode | Rn | Rd |
+ * +-----------------------+------+---+---+-----+--------+------+------+
+ */
+static void disas_crypto_three_reg_sha512(DisasContext *s, uint32_t insn)
+{
+ int opcode = extract32(insn, 10, 2);
+ int o = extract32(insn, 14, 1);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ bool feature;
+ gen_helper_gvec_3 *oolfn = NULL;
+ GVecGen3Fn *gvecfn = NULL;
+
+ if (o == 0) {
+ switch (opcode) {
+ case 0: /* SHA512H */
+ feature = dc_isar_feature(aa64_sha512, s);
+ oolfn = gen_helper_crypto_sha512h;
+ break;
+ case 1: /* SHA512H2 */
+ feature = dc_isar_feature(aa64_sha512, s);
+ oolfn = gen_helper_crypto_sha512h2;
+ break;
+ case 2: /* SHA512SU1 */
+ feature = dc_isar_feature(aa64_sha512, s);
+ oolfn = gen_helper_crypto_sha512su1;
+ break;
+ case 3: /* RAX1 */
+ feature = dc_isar_feature(aa64_sha3, s);
+ gvecfn = gen_gvec_rax1;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ } else {
+ switch (opcode) {
+ case 0: /* SM3PARTW1 */
+ feature = dc_isar_feature(aa64_sm3, s);
+ oolfn = gen_helper_crypto_sm3partw1;
+ break;
+ case 1: /* SM3PARTW2 */
+ feature = dc_isar_feature(aa64_sm3, s);
+ oolfn = gen_helper_crypto_sm3partw2;
+ break;
+ case 2: /* SM4EKEY */
+ feature = dc_isar_feature(aa64_sm4, s);
+ oolfn = gen_helper_crypto_sm4ekey;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (oolfn) {
+ gen_gvec_op3_ool(s, true, rd, rn, rm, 0, oolfn);
+ } else {
+ gen_gvec_fn3(s, true, rd, rn, rm, gvecfn, MO_64);
+ }
+}
+
+/* Crypto two-reg SHA512
+ * 31 12 11 10 9 5 4 0
+ * +-----------------------------------------+--------+------+------+
+ * | 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 | opcode | Rn | Rd |
+ * +-----------------------------------------+--------+------+------+
+ */
+static void disas_crypto_two_reg_sha512(DisasContext *s, uint32_t insn)
+{
+ int opcode = extract32(insn, 10, 2);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ bool feature;
+
+ switch (opcode) {
+ case 0: /* SHA512SU0 */
+ feature = dc_isar_feature(aa64_sha512, s);
+ break;
+ case 1: /* SM4E */
+ feature = dc_isar_feature(aa64_sm4, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (opcode) {
+ case 0: /* SHA512SU0 */
+ gen_gvec_op2_ool(s, true, rd, rn, 0, gen_helper_crypto_sha512su0);
+ break;
+ case 1: /* SM4E */
+ gen_gvec_op3_ool(s, true, rd, rd, rn, 0, gen_helper_crypto_sm4e);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Crypto four-register
+ * 31 23 22 21 20 16 15 14 10 9 5 4 0
+ * +-------------------+-----+------+---+------+------+------+
+ * | 1 1 0 0 1 1 1 0 0 | Op0 | Rm | 0 | Ra | Rn | Rd |
+ * +-------------------+-----+------+---+------+------+------+
+ */
+static void disas_crypto_four_reg(DisasContext *s, uint32_t insn)
+{
+ int op0 = extract32(insn, 21, 2);
+ int rm = extract32(insn, 16, 5);
+ int ra = extract32(insn, 10, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ bool feature;
+
+ switch (op0) {
+ case 0: /* EOR3 */
+ case 1: /* BCAX */
+ feature = dc_isar_feature(aa64_sha3, s);
+ break;
+ case 2: /* SM3SS1 */
+ feature = dc_isar_feature(aa64_sm3, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!feature) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ if (op0 < 2) {
+ TCGv_i64 tcg_op1, tcg_op2, tcg_op3, tcg_res[2];
+ int pass;
+
+ tcg_op1 = tcg_temp_new_i64();
+ tcg_op2 = tcg_temp_new_i64();
+ tcg_op3 = tcg_temp_new_i64();
+ tcg_res[0] = tcg_temp_new_i64();
+ tcg_res[1] = tcg_temp_new_i64();
+
+ for (pass = 0; pass < 2; pass++) {
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+ read_vec_element(s, tcg_op3, ra, pass, MO_64);
+
+ if (op0 == 0) {
+ /* EOR3 */
+ tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op3);
+ } else {
+ /* BCAX */
+ tcg_gen_andc_i64(tcg_res[pass], tcg_op2, tcg_op3);
+ }
+ tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+ }
+ write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_op3);
+ tcg_temp_free_i64(tcg_res[0]);
+ tcg_temp_free_i64(tcg_res[1]);
+ } else {
+ TCGv_i32 tcg_op1, tcg_op2, tcg_op3, tcg_res, tcg_zero;
+
+ tcg_op1 = tcg_temp_new_i32();
+ tcg_op2 = tcg_temp_new_i32();
+ tcg_op3 = tcg_temp_new_i32();
+ tcg_res = tcg_temp_new_i32();
+ tcg_zero = tcg_constant_i32(0);
+
+ read_vec_element_i32(s, tcg_op1, rn, 3, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, 3, MO_32);
+ read_vec_element_i32(s, tcg_op3, ra, 3, MO_32);
+
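+ /* SM3SS1: Vd<127:96> = ROL32(ROL32(Vn<127:96>, 12) + Vm<127:96>
+ * + Va<127:96>, 7); the lower three 32-bit elements of Vd are
+ * zeroed. (ROR by 20 and 25 == ROL by 12 and 7.)
+ */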
+ tcg_gen_rotri_i32(tcg_res, tcg_op1, 20);
+ tcg_gen_add_i32(tcg_res, tcg_res, tcg_op2);
+ tcg_gen_add_i32(tcg_res, tcg_res, tcg_op3);
+ tcg_gen_rotri_i32(tcg_res, tcg_res, 25);
+
+ write_vec_element_i32(s, tcg_zero, rd, 0, MO_32);
+ write_vec_element_i32(s, tcg_zero, rd, 1, MO_32);
+ write_vec_element_i32(s, tcg_zero, rd, 2, MO_32);
+ write_vec_element_i32(s, tcg_res, rd, 3, MO_32);
+
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ tcg_temp_free_i32(tcg_op3);
+ tcg_temp_free_i32(tcg_res);
+ }
+}
+
+/* Crypto XAR
+ * 31 21 20 16 15 10 9 5 4 0
+ * +-----------------------+------+--------+------+------+
+ * | 1 1 0 0 1 1 1 0 1 0 0 | Rm | imm6 | Rn | Rd |
+ * +-----------------------+------+--------+------+------+
+ */
+static void disas_crypto_xar(DisasContext *s, uint32_t insn)
+{
+ int rm = extract32(insn, 16, 5);
+ int imm6 = extract32(insn, 10, 6);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ if (!dc_isar_feature(aa64_sha3, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ gen_gvec_xar(MO_64, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), imm6, 16,
+ vec_full_reg_size(s));
+}
+
+/* Crypto three-reg imm2
+ * 31 21 20 16 15 14 13 12 11 10 9 5 4 0
+ * +-----------------------+------+-----+------+--------+------+------+
+ * | 1 1 0 0 1 1 1 0 0 1 0 | Rm | 1 0 | imm2 | opcode | Rn | Rd |
+ * +-----------------------+------+-----+------+--------+------+------+
+ */
+static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn)
+{
+ static gen_helper_gvec_3 * const fns[4] = {
+ gen_helper_crypto_sm3tt1a, gen_helper_crypto_sm3tt1b,
+ gen_helper_crypto_sm3tt2a, gen_helper_crypto_sm3tt2b,
+ };
+ int opcode = extract32(insn, 10, 2);
+ int imm2 = extract32(insn, 12, 2);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ if (!dc_isar_feature(aa64_sm3, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ gen_gvec_op3_ool(s, true, rd, rn, rm, imm2, fns[opcode]);
+}
+
+/* C3.6 Data processing - SIMD, inc Crypto
+ *
+ * As the decode gets a little complex we are using a table based
+ * approach for this part of the decode.
+ */
+static const AArch64DecodeTable data_proc_simd[] = {
+ /* pattern , mask , fn */
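+ /* An entry matches when (insn & mask) == pattern; the table is
+ * scanned in order, so the first matching entry wins.
+ */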
+ { 0x0e200400, 0x9f200400, disas_simd_three_reg_same },
+ { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra },
+ { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff },
+ { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
+ { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes },
+ { 0x0e000400, 0x9fe08400, disas_simd_copy },
+ { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */
+ /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
+ { 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
+ { 0x0f000400, 0x9f800400, disas_simd_shift_imm },
+ { 0x0e000000, 0xbf208c00, disas_simd_tb },
+ { 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
+ { 0x2e000000, 0xbf208400, disas_simd_ext },
+ { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same },
+ { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra },
+ { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff },
+ { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
+ { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise },
+ { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy },
+ { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */
+ { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
+ { 0x4e280800, 0xff3e0c00, disas_crypto_aes },
+ { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha },
+ { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha },
+ { 0xce608000, 0xffe0b000, disas_crypto_three_reg_sha512 },
+ { 0xcec08000, 0xfffff000, disas_crypto_two_reg_sha512 },
+ { 0xce000000, 0xff808000, disas_crypto_four_reg },
+ { 0xce800000, 0xffe00000, disas_crypto_xar },
+ { 0xce408000, 0xffe0c000, disas_crypto_three_reg_imm2 },
+ { 0x0e400400, 0x9f60c400, disas_simd_three_reg_same_fp16 },
+ { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 },
+ { 0x5e400400, 0xdf60c400, disas_simd_scalar_three_reg_same_fp16 },
+ { 0x00000000, 0x00000000, NULL }
+};
+
+static void disas_data_proc_simd(DisasContext *s, uint32_t insn)
+{
+ /* Note that this is called with all non-FP cases from
+ * table C3-6 so it must UNDEF for entries not specifically
+ * allocated to instructions in that table.
+ */
+ AArch64DecodeFn *fn = lookup_disas_fn(&data_proc_simd[0], insn);
+ if (fn) {
+ fn(s, insn);
+ } else {
+ unallocated_encoding(s);
+ }
+}
+
+/* C3.6 Data processing - SIMD and floating point */
+static void disas_data_proc_simd_fp(DisasContext *s, uint32_t insn)
+{
+ if (extract32(insn, 28, 1) == 1 && extract32(insn, 30, 1) == 0) {
+ disas_data_proc_fp(s, insn);
+ } else {
+ /* SIMD, including crypto */
+ disas_data_proc_simd(s, insn);
+ }
+}
+
+/*
+ * Include the generated SME FA64 decoder.
+ */
+
+#include "decode-sme-fa64.c.inc"
+
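+/*
+ * trans_OK and trans_FAIL are the targets of the generated SME FA64
+ * decoder: an insn that matches a FAIL pattern is flagged as not valid
+ * in Streaming SVE mode, so the access-check code can raise the SME
+ * streaming-mode trap for it.
+ */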
+static bool trans_OK(DisasContext *s, arg_OK *a)
+{
+ return true;
+}
+
+static bool trans_FAIL(DisasContext *s, arg_OK *a)
+{
+ s->is_nonstreaming = true;
+ return true;
+}
+
+/**
+ * is_guarded_page:
+ * @env: The cpu environment
+ * @s: The DisasContext
+ *
+ * Return true if the page is guarded.
+ */
+static bool is_guarded_page(CPUARMState *env, DisasContext *s)
+{
+ uint64_t addr = s->base.pc_first;
+#ifdef CONFIG_USER_ONLY
+ return page_get_flags(addr) & PAGE_BTI;
+#else
+ CPUTLBEntryFull *full;
+ void *host;
+ int mmu_idx = arm_to_core_mmu_idx(s->mmu_idx);
+ int flags;
+
+ /*
+ * We test this immediately after reading an insn, which means
+ * that the TLB entry must be present and valid, and thus this
+ * access will never raise an exception.
+ */
+ flags = probe_access_full(env, addr, MMU_INST_FETCH, mmu_idx,
+ false, &host, &full, 0);
+ assert(!(flags & TLB_INVALID_MASK));
+
+ return full->guarded;
+#endif
+}
+
+/**
+ * btype_destination_ok:
+ * @insn: The instruction at the branch destination
+ * @bt: SCTLR_ELx.BT
+ * @btype: PSTATE.BTYPE, which is known to be non-zero
+ *
+ * On a guarded page, there are a limited number of insns
+ * that may be present at the branch target:
+ * - branch target identifiers,
+ * - paciasp, pacibsp,
+ * - BRK insn
+ * - HLT insn
+ * Anything else causes a Branch Target Exception.
+ *
+ * Return true if the branch is compatible, false to raise BTITRAP.
+ */
+static bool btype_destination_ok(uint32_t insn, bool bt, int btype)
+{
+ if ((insn & 0xfffff01fu) == 0xd503201fu) {
+ /* HINT space */
+ switch (extract32(insn, 5, 7)) {
+ case 0b011001: /* PACIASP */
+ case 0b011011: /* PACIBSP */
+ /*
+ * If SCTLR_ELx.BT, then PACI*SP are not compatible
+ * with btype == 3. Otherwise all btype are ok.
+ */
+ return !bt || btype != 3;
+ case 0b100000: /* BTI */
+ /* Not compatible with any btype. */
+ return false;
+ case 0b100010: /* BTI c */
+ /* Not compatible with btype == 3 */
+ return btype != 3;
+ case 0b100100: /* BTI j */
+ /* Not compatible with btype == 2 */
+ return btype != 2;
+ case 0b100110: /* BTI jc */
+ /* Compatible with any btype. */
+ return true;
+ }
+ } else {
+ switch (insn & 0xffe0001fu) {
+ case 0xd4200000u: /* BRK */
+ case 0xd4400000u: /* HLT */
+ /* Give priority to the breakpoint exception. */
+ return true;
+ }
+ }
+ return false;
+}
+
+static void aarch64_tr_init_disas_context(DisasContextBase *dcbase,
+ CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ CPUARMState *env = cpu->env_ptr;
+ ARMCPU *arm_cpu = env_archcpu(env);
+ CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb);
+ int bound, core_mmu_idx;
+
+ dc->isar = &arm_cpu->isar;
+ dc->condjmp = 0;
+ dc->pc_save = dc->base.pc_first;
+ dc->aarch64 = true;
+ dc->thumb = false;
+ dc->sctlr_b = 0;
+ dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? MO_BE : MO_LE;
+ dc->condexec_mask = 0;
+ dc->condexec_cond = 0;
+ core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX);
+ dc->mmu_idx = core_to_aa64_mmu_idx(core_mmu_idx);
+ dc->tbii = EX_TBFLAG_A64(tb_flags, TBII);
+ dc->tbid = EX_TBFLAG_A64(tb_flags, TBID);
+ dc->tcma = EX_TBFLAG_A64(tb_flags, TCMA);
+ dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx);
+#if !defined(CONFIG_USER_ONLY)
+ dc->user = (dc->current_el == 0);
+#endif
+ dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL);
+ dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM);
+ dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL);
+ dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE);
+ dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC);
+ dc->fgt_eret = EX_TBFLAG_A64(tb_flags, FGT_ERET);
+ dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL);
+ dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL);
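+ /* The VL and SVL fields hold (vector length in 128-bit quadwords) - 1. */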
+ dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16;
+ dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16;
+ dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE);
+ dc->bt = EX_TBFLAG_A64(tb_flags, BT);
+ dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE);
+ dc->unpriv = EX_TBFLAG_A64(tb_flags, UNPRIV);
+ dc->ata = EX_TBFLAG_A64(tb_flags, ATA);
+ dc->mte_active[0] = EX_TBFLAG_A64(tb_flags, MTE_ACTIVE);
+ dc->mte_active[1] = EX_TBFLAG_A64(tb_flags, MTE0_ACTIVE);
+ dc->pstate_sm = EX_TBFLAG_A64(tb_flags, PSTATE_SM);
+ dc->pstate_za = EX_TBFLAG_A64(tb_flags, PSTATE_ZA);
+ dc->sme_trap_nonstreaming = EX_TBFLAG_A64(tb_flags, SME_TRAP_NONSTREAMING);
+ dc->vec_len = 0;
+ dc->vec_stride = 0;
+ dc->cp_regs = arm_cpu->cp_regs;
+ dc->features = env->features;
+ dc->dcz_blocksize = arm_cpu->dcz_blocksize;
+
+#ifdef CONFIG_USER_ONLY
+ /* In sve_probe_page, we assume TBI is enabled. */
+ tcg_debug_assert(dc->tbid & 1);
+#endif
+
+ /* Single step state. The code-generation logic here is:
+ * SS_ACTIVE == 0:
+ * generate code with no special handling for single-stepping (except
+ * that anything that can make us go to SS_ACTIVE == 1 must end the TB;
+ * this happens anyway because those changes are all system register or
+ * PSTATE writes).
+ * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending)
+ * emit code for one insn
+ * emit code to clear PSTATE.SS
+ * emit code to generate software step exception for completed step
+ * end TB (as usual for having generated an exception)
+ * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending)
+ * emit code to generate a software step exception
+ * end the TB
+ */
+ dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE);
+ dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS);
+ dc->is_ldex = false;
+
+ /* Bound the number of insns to execute to those left on the page. */
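+ /* -(pc | TARGET_PAGE_MASK) is the number of bytes left on this page,
+ * and each A64 insn is 4 bytes.
+ */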
+ bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4;
+
+ /* If architectural single step active, limit to 1. */
+ if (dc->ss_active) {
+ bound = 1;
+ }
+ dc->base.max_insns = MIN(dc->base.max_insns, bound);
+
+ init_tmp_a64_array(dc);
+}
+
+static void aarch64_tr_tb_start(DisasContextBase *db, CPUState *cpu)
+{
+}
+
+static void aarch64_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ target_ulong pc_arg = dc->base.pc_next;
+
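+ /*
+ * With TARGET_TB_PCREL the TB may be reused at other virtual addresses,
+ * so only the within-page offset of the PC is recorded here.
+ */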
+ if (TARGET_TB_PCREL) {
+ pc_arg &= ~TARGET_PAGE_MASK;
+ }
+ tcg_gen_insn_start(pc_arg, 0, 0);
+ dc->insn_start = tcg_last_op();
+}
+
+static void aarch64_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *s = container_of(dcbase, DisasContext, base);
+ CPUARMState *env = cpu->env_ptr;
+ uint64_t pc = s->base.pc_next;
+ uint32_t insn;
+
+ /* Singlestep exceptions have the highest priority. */
+ if (s->ss_active && !s->pstate_ss) {
+ /* Singlestep state is Active-pending.
+ * If we're in this state at the start of a TB then either
+ * a) we just took an exception to an EL which is being debugged
+ * and this is the first insn in the exception handler
+ * b) debug exceptions were masked and we just unmasked them
+ * without changing EL (eg by clearing PSTATE.D)
+ * In either case we're going to take a swstep exception in the
+ * "did not step an insn" case, and so the syndrome ISV and EX
+ * bits should be zero.
+ */
+ assert(s->base.num_insns == 1);
+ gen_swstep_exception(s, 0, 0);
+ s->base.is_jmp = DISAS_NORETURN;
+ s->base.pc_next = pc + 4;
+ return;
+ }
+
+ if (pc & 3) {
+ /*
+ * PC alignment fault. This has priority over the instruction abort
+ * that we would receive from a translation fault via arm_ldl_code.
+ * This should only be possible after an indirect branch, at the
+ * start of the TB.
+ */
+ assert(s->base.num_insns == 1);
+ gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc));
+ s->base.is_jmp = DISAS_NORETURN;
+ s->base.pc_next = QEMU_ALIGN_UP(pc, 4);
+ return;
+ }
+
+ s->pc_curr = pc;
+ insn = arm_ldl_code(env, &s->base, pc, s->sctlr_b);
+ s->insn = insn;
+ s->base.pc_next = pc + 4;
+
+ s->fp_access_checked = false;
+ s->sve_access_checked = false;
+
+ if (s->pstate_il) {
+ /*
+ * Illegal execution state. This has priority over BTI
+ * exceptions, but comes after instruction abort exceptions.
+ */
+ gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate());
+ return;
+ }
+
+ if (dc_isar_feature(aa64_bti, s)) {
+ if (s->base.num_insns == 1) {
+ /*
+ * At the first insn of the TB, compute s->guarded_page.
+ * We delayed computing this until successfully reading
+ * the first insn of the TB, above. This (mostly) ensures
+ * that the softmmu tlb entry has been populated, and the
+ * page table GP bit is available.
+ *
+ * Note that we need to compute this even if btype == 0,
+ * because this value is used for BR instructions later
+ * where ENV is not available.
+ */
+ s->guarded_page = is_guarded_page(env, s);
+
+ /* First insn can have btype set to non-zero. */
+ tcg_debug_assert(s->btype >= 0);
+
+ /*
+ * Note that the Branch Target Exception has fairly high
+ * priority -- below debugging exceptions but above most
+ * everything else. This allows us to handle this now
+ * instead of waiting until the insn is otherwise decoded.
+ */
+ if (s->btype != 0
+ && s->guarded_page
+ && !btype_destination_ok(insn, s->bt, s->btype)) {
+ gen_exception_insn(s, 0, EXCP_UDEF, syn_btitrap(s->btype));
+ return;
+ }
+ } else {
+ /* Not the first insn: btype must be 0. */
+ tcg_debug_assert(s->btype == 0);
+ }
+ }
+
+ s->is_nonstreaming = false;
+ if (s->sme_trap_nonstreaming) {
+ disas_sme_fa64(s, insn);
+ }
+
+ switch (extract32(insn, 25, 4)) {
+ case 0x0:
+ if (!extract32(insn, 31, 1) || !disas_sme(s, insn)) {
+ unallocated_encoding(s);
+ }
+ break;
+ case 0x1: case 0x3: /* UNALLOCATED */
+ unallocated_encoding(s);
+ break;
+ case 0x2:
+ if (!disas_sve(s, insn)) {
+ unallocated_encoding(s);
+ }
+ break;
+ case 0x8: case 0x9: /* Data processing - immediate */
+ disas_data_proc_imm(s, insn);
+ break;
+ case 0xa: case 0xb: /* Branch, exception generation and system insns */
+ disas_b_exc_sys(s, insn);
+ break;
+ case 0x4:
+ case 0x6:
+ case 0xc:
+ case 0xe: /* Loads and stores */
+ disas_ldst(s, insn);
+ break;
+ case 0x5:
+ case 0xd: /* Data processing - register */
+ disas_data_proc_reg(s, insn);
+ break;
+ case 0x7:
+ case 0xf: /* Data processing - SIMD and floating point */
+ disas_data_proc_simd_fp(s, insn);
+ break;
+ default:
+ assert(FALSE); /* all 16 cases should be handled above */
+ break;
+ }
+
+ /* if we allocated any temporaries, free them here */
+ free_tmp_a64(s);
+
+ /*
+ * After execution of most insns, btype is reset to 0.
+ * Note that we set btype == -1 when the insn sets btype.
+ */
+ if (s->btype > 0 && s->base.is_jmp != DISAS_NORETURN) {
+ reset_btype(s);
+ }
+
+ translator_loop_temp_check(&s->base);
+}
+
+static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+
+ if (unlikely(dc->ss_active)) {
+ /* Note that this means single stepping WFI doesn't halt the CPU.
+ * For conditional branch insns this is harmless unreachable code as
+ * gen_goto_tb() has already handled emitting the debug exception
+ * (and thus a tb-jump is not possible when singlestepping).
+ */
+ switch (dc->base.is_jmp) {
+ default:
+ gen_a64_update_pc(dc, 4);
+ /* fall through */
+ case DISAS_EXIT:
+ case DISAS_JUMP:
+ gen_step_complete_exception(dc);
+ break;
+ case DISAS_NORETURN:
+ break;
+ }
+ } else {
+ switch (dc->base.is_jmp) {
+ case DISAS_NEXT:
+ case DISAS_TOO_MANY:
+ gen_goto_tb(dc, 1, 4);
+ break;
+ default:
+ case DISAS_UPDATE_EXIT:
+ gen_a64_update_pc(dc, 4);
+ /* fall through */
+ case DISAS_EXIT:
+ tcg_gen_exit_tb(NULL, 0);
+ break;
+ case DISAS_UPDATE_NOCHAIN:
+ gen_a64_update_pc(dc, 4);
+ /* fall through */
+ case DISAS_JUMP:
+ tcg_gen_lookup_and_goto_ptr();
+ break;
+ case DISAS_NORETURN:
+ case DISAS_SWI:
+ break;
+ case DISAS_WFE:
+ gen_a64_update_pc(dc, 4);
+ gen_helper_wfe(cpu_env);
+ break;
+ case DISAS_YIELD:
+ gen_a64_update_pc(dc, 4);
+ gen_helper_yield(cpu_env);
+ break;
+ case DISAS_WFI:
+ /*
+ * This is a special case because we don't want to just halt
+ * the CPU if trying to debug across a WFI.
+ */
+ gen_a64_update_pc(dc, 4);
+ gen_helper_wfi(cpu_env, tcg_constant_i32(4));
+ /*
+ * The helper doesn't necessarily throw an exception, but we
+ * must go back to the main loop to check for interrupts anyway.
+ */
+ tcg_gen_exit_tb(NULL, 0);
+ break;
+ }
+ }
+}
+
+static void aarch64_tr_disas_log(const DisasContextBase *dcbase,
+ CPUState *cpu, FILE *logfile)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+
+ fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first));
+ target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size);
+}
+
+const TranslatorOps aarch64_translator_ops = {
+ .init_disas_context = aarch64_tr_init_disas_context,
+ .tb_start = aarch64_tr_tb_start,
+ .insn_start = aarch64_tr_insn_start,
+ .translate_insn = aarch64_tr_translate_insn,
+ .tb_stop = aarch64_tr_tb_stop,
+ .disas_log = aarch64_tr_disas_log,
+};
diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h
new file mode 100644
index 0000000..ad3762d
--- /dev/null
+++ b/target/arm/tcg/translate-a64.h
@@ -0,0 +1,201 @@
+/*
+ * AArch64 translation, common definitions.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_TRANSLATE_A64_H
+#define TARGET_ARM_TRANSLATE_A64_H
+
+TCGv_i64 new_tmp_a64(DisasContext *s);
+TCGv_i64 new_tmp_a64_local(DisasContext *s);
+TCGv_i64 new_tmp_a64_zero(DisasContext *s);
+TCGv_i64 cpu_reg(DisasContext *s, int reg);
+TCGv_i64 cpu_reg_sp(DisasContext *s, int reg);
+TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf);
+TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf);
+void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v);
+bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn,
+ unsigned int imms, unsigned int immr);
+bool sve_access_check(DisasContext *s);
+bool sme_enabled_check(DisasContext *s);
+bool sme_enabled_check_with_svcr(DisasContext *s, unsigned);
+
+/* This function corresponds to CheckStreamingSVEEnabled. */
+static inline bool sme_sm_enabled_check(DisasContext *s)
+{
+ return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK);
+}
+
+/* This function corresponds to CheckSMEAndZAEnabled. */
+static inline bool sme_za_enabled_check(DisasContext *s)
+{
+ return sme_enabled_check_with_svcr(s, R_SVCR_ZA_MASK);
+}
+
+/* Note that this function corresponds to CheckStreamingSVEAndZAEnabled. */
+static inline bool sme_smza_enabled_check(DisasContext *s)
+{
+ return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK | R_SVCR_ZA_MASK);
+}
+
+TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr);
+TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write,
+ bool tag_checked, int log2_size);
+TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write,
+ bool tag_checked, int size);
+
+/* We should have done the necessary access check at some point before
+ * trying to access an FP register, so assert that
+ * (a) we did the check and
+ * (b) we didn't then just plough ahead anyway if it failed.
+ * Print the instruction pattern in the abort message so we can figure
+ * out what we need to fix if a user encounters this problem in the wild.
+ */
+static inline void assert_fp_access_checked(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+ if (unlikely(!s->fp_access_checked || s->fp_excp_el)) {
+ fprintf(stderr, "target-arm: FP access check missing for "
+ "instruction 0x%08x\n", s->insn);
+ abort();
+ }
+#endif
+}
+
+/* Return the offset into CPUARMState of an element of specified
+ * size, 'element' places in from the least significant end of
+ * the FP/vector register Qn.
+ */
+static inline int vec_reg_offset(DisasContext *s, int regno,
+ int element, MemOp size)
+{
+ int element_size = 1 << size;
+ int offs = element * element_size;
+#if HOST_BIG_ENDIAN
+ /* This is complicated slightly because vfp.zregs[n].d[0] is
+ * still the lowest and vfp.zregs[n].d[15] the highest of the
+ * 256 byte vector, even on big endian systems.
+ *
+ * Calculate the offset assuming fully little-endian,
+ * then XOR to account for the order of the 8-byte units.
+ *
+ * For 16 byte elements, the two 8 byte halves will not form a
+ * host int128 if the host is bigendian, since they're in the
+ * wrong order. However the only 16 byte operation we have is
+ * a move, so we can ignore this for the moment. More complicated
+ * operations will have to special case loading and storing from
+ * the zregs array.
+ */
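+ /* Example: a 16-bit element 0 becomes offset 0 ^ (8 - 2) = 6, i.e.
+ * bytes 6..7 of the big-endian 8-byte unit.
+ */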
+ if (element_size < 8) {
+ offs ^= 8 - element_size;
+ }
+#endif
+ offs += offsetof(CPUARMState, vfp.zregs[regno]);
+ assert_fp_access_checked(s);
+ return offs;
+}
+
+/* Return the offset into CPUARMState of the "whole" vector register Qn. */
+static inline int vec_full_reg_offset(DisasContext *s, int regno)
+{
+ assert_fp_access_checked(s);
+ return offsetof(CPUARMState, vfp.zregs[regno]);
+}
+
+/* Return a newly allocated pointer to the vector register. */
+static inline TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ret, cpu_env, vec_full_reg_offset(s, regno));
+ return ret;
+}
+
+/* Return the byte size of the "whole" vector register, VL / 8. */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+ return s->vl;
+}
+
+/* Return the byte size of the streaming vector register, SVL / 8. */
+static inline int streaming_vec_reg_size(DisasContext *s)
+{
+ return s->svl;
+}
+
+/*
+ * Return the offset into CPUARMState of the predicate vector register Pn.
+ * Note for this purpose, FFR is P16.
+ */
+static inline int pred_full_reg_offset(DisasContext *s, int regno)
+{
+ return offsetof(CPUARMState, vfp.pregs[regno]);
+}
+
+/* Return the byte size of the whole predicate register, VL / 64. */
+static inline int pred_full_reg_size(DisasContext *s)
+{
+ return s->vl >> 3;
+}
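+
+/*
+ * For example, with a 256-bit VL, vec_full_reg_size() is 32 and
+ * pred_full_reg_size() is 4.
+ */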
+
+/* Return the byte size of the streaming predicate register, SVL / 64. */
+static inline int streaming_pred_reg_size(DisasContext *s)
+{
+ return s->svl >> 3;
+}
+
+/*
+ * Round up the size of a register to a size allowed by
+ * the tcg vector infrastructure. Any operation which uses this
+ * size may assume that the bits above pred_full_reg_size are zero,
+ * and must leave them the same way.
+ *
+ * Note that this is not needed for the vector registers as they
+ * are always properly sized for tcg vectors.
+ */
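+/* For example, size_for_gvec(2) == 8 and size_for_gvec(18) == 32. */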
+static inline int size_for_gvec(int size)
+{
+ if (size <= 8) {
+ return 8;
+ } else {
+ return QEMU_ALIGN_UP(size, 16);
+ }
+}
+
+static inline int pred_gvec_reg_size(DisasContext *s)
+{
+ return size_for_gvec(pred_full_reg_size(s));
+}
+
+/* Return a newly allocated pointer to the predicate register. */
+static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ret, cpu_env, pred_full_reg_offset(s, regno));
+ return ret;
+}
+
+bool disas_sve(DisasContext *, uint32_t);
+bool disas_sme(DisasContext *, uint32_t);
+
+void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, int64_t shift,
+ uint32_t opr_sz, uint32_t max_sz);
+
+void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm);
+void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm);
+
+#endif /* TARGET_ARM_TRANSLATE_A64_H */
diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c
new file mode 100644
index 0000000..5df7d46
--- /dev/null
+++ b/target/arm/tcg/translate-m-nocp.c
@@ -0,0 +1,788 @@
+/*
+ * ARM translation: M-profile NOCP special-case instructions
+ *
+ * Copyright (c) 2020 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+#include "decode-m-nocp.c.inc"
+
+/*
+ * The decode for VLLDM and VLSTM is nonstandard because:
+ * * if there is no FPU then these insns must NOP in
+ * Secure state and UNDEF in Nonsecure state
+ * * if there is an FPU then these insns do not have
+ * the usual behaviour that vfp_access_check() provides of
+ * being controlled by CPACR/NSACR enable bits or the
+ * lazy-stacking logic.
+ */
+static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
+{
+ TCGv_i32 fptr;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M) ||
+ !arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+
+ if (a->op) {
+ /*
+ * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not
+ * to take the IMPDEF option to make memory accesses to the stack
+ * slots that correspond to the D16-D31 registers (discarding
+ * read data and writing UNKNOWN values), so for us the T2
+ * encoding behaves identically to the T1 encoding.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ } else {
+ /*
+ * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs.
+ * This is currently architecturally impossible, but we add the
+ * check to stay in line with the pseudocode. Note that we must
+ * emit code for the UNDEF so it takes precedence over the NOCP.
+ */
+ if (dc_isar_feature(aa32_simd_r32, s)) {
+ unallocated_encoding(s);
+ return true;
+ }
+ }
+
+ /*
+ * If not secure, UNDEF. We must emit code for this
+ * rather than returning false so that this takes
+ * precedence over the m-nocp.decode NOCP fallback.
+ */
+ if (!s->v8m_secure) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ s->eci_handled = true;
+
+ /* If no fpu, NOP. */
+ if (!dc_isar_feature(aa32_vfp, s)) {
+ clear_eci_state(s);
+ return true;
+ }
+
+ fptr = load_reg(s, a->rn);
+ if (a->l) {
+ gen_helper_v7m_vlldm(cpu_env, fptr);
+ } else {
+ gen_helper_v7m_vlstm(cpu_env, fptr);
+ }
+ tcg_temp_free_i32(fptr);
+
+ clear_eci_state(s);
+
+ /*
+ * End the TB, because we have updated FP control bits,
+ * and possibly VPR or LTPSIZE.
+ */
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ return true;
+}
+
+static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
+{
+ int btmreg, topreg;
+ TCGv_i64 zero;
+ TCGv_i32 aspen, sfpa;
+
+ if (!dc_isar_feature(aa32_m_sec_state, s)) {
+ /* Before v8.1M, fall through in decode to NOCP check */
+ return false;
+ }
+
+ /* Explicitly UNDEF because this takes precedence over NOCP */
+ if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ s->eci_handled = true;
+
+ if (!dc_isar_feature(aa32_vfp_simd, s)) {
+ /* NOP if we have neither FP nor MVE */
+ clear_eci_state(s);
+ return true;
+ }
+
+ /*
+ * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no
+ * active floating point context so we must NOP (without doing
+ * any lazy state preservation or the NOCP check).
+ */
+ aspen = load_cpu_field(v7m.fpccr[M_REG_S]);
+ sfpa = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK);
+ tcg_gen_or_i32(sfpa, sfpa, aspen);
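+ /* sfpa is now zero exactly when FPCCR_S.ASPEN == 1 and
+ * CONTROL_S.SFPA == 0, i.e. the no-active-FP-context case above.
+ */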
+ arm_gen_condlabel(s);
+ tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel.label);
+
+ if (s->fp_excp_el != 0) {
+ gen_exception_insn_el(s, 0, EXCP_NOCP,
+ syn_uncategorized(), s->fp_excp_el);
+ return true;
+ }
+
+ topreg = a->vd + a->imm - 1;
+ btmreg = a->vd;
+
+ /* Convert to Sreg numbers if the insn specified the range in Dregs */
+ if (a->size == 3) {
+ topreg = topreg * 2 + 1;
+ btmreg *= 2;
+ }
+
+ if (topreg > 63 || (topreg > 31 && !(topreg & 1))) {
+ /* UNPREDICTABLE: we choose to undef */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ /* Silently ignore requests to clear D16-D31 if they don't exist */
+ if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) {
+ topreg = 31;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Zero the Sregs from btmreg to topreg inclusive. */
+ zero = tcg_constant_i64(0);
+ if (btmreg & 1) {
+ write_neon_element64(zero, btmreg >> 1, 1, MO_32);
+ btmreg++;
+ }
+ for (; btmreg + 1 <= topreg; btmreg += 2) {
+ write_neon_element64(zero, btmreg >> 1, 0, MO_64);
+ }
+ if (btmreg == topreg) {
+ write_neon_element64(zero, btmreg >> 1, 0, MO_32);
+ btmreg++;
+ }
+ assert(btmreg == topreg + 1);
+ if (dc_isar_feature(aa32_mve, s)) {
+ store_cpu_field(tcg_constant_i32(0), v7m.vpr);
+ }
+
+ clear_eci_state(s);
+ return true;
+}
+
+/*
+ * M-profile provides two different sets of instructions that can
+ * access floating point system registers: VMSR/VMRS (which move
+ * to/from a general purpose register) and VLDR/VSTR sysreg (which
+ * move directly to/from memory). In some cases there are also side
+ * effects which must happen after any write to memory (which could
+ * cause an exception). So we implement the common logic for the
+ * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(),
+ * which take pointers to callback functions which will perform the
+ * actual "read/write general purpose register" and "read/write
+ * memory" operations.
+ */
+
+/*
+ * Emit code to store the sysreg to its final destination; frees the
+ * TCG temp 'value' it is passed. do_access is true to do the store,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access);
+/*
+ * Emit code to load the value to be copied to the sysreg; returns
+ * a new TCG temporary. do_access is true to do the store,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque,
+ bool do_access);
+
+/* Common decode/access checks for fp sysreg read/write */
+typedef enum FPSysRegCheckResult {
+ FPSysRegCheckFailed, /* caller should return false */
+ FPSysRegCheckDone, /* caller should return true */
+ FPSysRegCheckContinue, /* caller should continue generating code */
+} FPSysRegCheckResult;
+
+static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno)
+{
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return FPSysRegCheckFailed;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ case QEMU_VFP_FPSCR_NZCV:
+ break;
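+ /* XAR: each 64-bit element of Vd = ROR(Vn EOR Vm, imm6). */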
+ case ARM_VFP_FPSCR_NZCVQC:
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ case ARM_VFP_FPCXT_S:
+ case ARM_VFP_FPCXT_NS:
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return FPSysRegCheckFailed;
+ }
+ if (!s->v8m_secure) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ case ARM_VFP_VPR:
+ case ARM_VFP_P0:
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ default:
+ return FPSysRegCheckFailed;
+ }
+
+ /*
+ * FPCXT_NS is a special case: it has specific handling for
+ * "current FP state is inactive", and must do the PreserveFPState()
+ * but not the usual full set of actions done by ExecuteFPCheck().
+ * So we don't call vfp_access_check() and the callers must handle this.
+ */
+ if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) {
+ return FPSysRegCheckDone;
+ }
+ return FPSysRegCheckContinue;
+}
+
+static void gen_branch_fpInactive(DisasContext *s, TCGCond cond,
+ TCGLabel *label)
+{
+ /*
+ * FPCXT_NS is a special case: it has specific handling for
+ * "current FP state is inactive", and must do the PreserveFPState()
+ * but not the usual full set of actions done by ExecuteFPCheck().
+ * We don't have a TB flag that matches the fpInactive check, so we
+ * do it at runtime as we don't expect FPCXT_NS accesses to be frequent.
+ *
+ * Emit code that checks fpInactive and does a conditional
+ * branch to label based on it:
+ * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive)
+ * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active)
+ */
+ assert(cond == TCG_COND_EQ || cond == TCG_COND_NE);
+
+ /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */
+ TCGv_i32 aspen, fpca;
+ aspen = load_cpu_field(v7m.fpccr[M_REG_NS]);
+ fpca = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK);
+ tcg_gen_or_i32(fpca, fpca, aspen);
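+ /* fpca is now zero exactly when fpInactive is true, hence the
+ * inverted condition in the branch below.
+ */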
+ tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label);
+ tcg_temp_free_i32(aspen);
+ tcg_temp_free_i32(fpca);
+}
+
+static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
+ fp_sysreg_loadfn *loadfn,
+ void *opaque)
+{
+ /* Do a write to an M-profile floating point system register */
+ TCGv_i32 tmp;
+ TCGLabel *lab_end = NULL;
+
+ switch (fp_sysreg_checks(s, regno)) {
+ case FPSysRegCheckFailed:
+ return false;
+ case FPSysRegCheckDone:
+ return true;
+ case FPSysRegCheckContinue:
+ break;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ tmp = loadfn(s, opaque, true);
+ gen_helper_vfp_set_fpscr(cpu_env, tmp);
+ tcg_temp_free_i32(tmp);
+ gen_lookup_tb(s);
+ break;
+ case ARM_VFP_FPSCR_NZCVQC:
+ {
+ TCGv_i32 fpscr;
+ tmp = loadfn(s, opaque, true);
+ if (dc_isar_feature(aa32_mve, s)) {
+ /* QC is only present for MVE; otherwise RES0 */
+ TCGv_i32 qc = tcg_temp_new_i32();
+ tcg_gen_andi_i32(qc, tmp, FPCR_QC);
+ /*
+ * The 4 vfp.qc[] fields need only be "zero" vs "non-zero";
+ * here writing the same value into all elements is simplest.
+ */
+ tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc),
+ 16, 16, qc);
+ }
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+ fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK);
+ tcg_gen_or_i32(fpscr, fpscr, tmp);
+ store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_temp_free_i32(tmp);
+ break;
+ }
+ case ARM_VFP_FPCXT_NS:
+ {
+ TCGLabel *lab_active = gen_new_label();
+
+ lab_end = gen_new_label();
+ gen_branch_fpInactive(s, TCG_COND_EQ, lab_active);
+ /*
+ * fpInactive case: write is a NOP, so only do side effects
+ * like register writeback before we branch to end
+ */
+ loadfn(s, opaque, false);
+ tcg_gen_br(lab_end);
+
+ gen_set_label(lab_active);
+ /*
+ * !fpInactive: if FPU disabled, take NOCP exception;
+ * otherwise PreserveFPState(), and then FPCXT_NS writes
+ * behave the same as FPCXT_S writes.
+ */
+ if (!vfp_access_check_m(s, true)) {
+ /*
+ * This was only a conditional exception, so override
+ * gen_exception_insn_el()'s default to DISAS_NORETURN
+ */
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+ }
+ }
+ /* fall through */
+ case ARM_VFP_FPCXT_S:
+ {
+ TCGv_i32 sfpa, control;
+ /*
+ * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes
+ * bits [27:0] from value and zeroes bits [31:28].
+ */
+ tmp = loadfn(s, opaque, true);
+ sfpa = tcg_temp_new_i32();
+ tcg_gen_shri_i32(sfpa, tmp, 31);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_deposit_i32(control, control, sfpa,
+ R_V7M_CONTROL_SFPA_SHIFT, 1);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
+ gen_helper_vfp_set_fpscr(cpu_env, tmp);
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(sfpa);
+ break;
+ }
+ case ARM_VFP_VPR:
+ /* Behaves as NOP if not privileged */
+ if (IS_USER(s)) {
+ loadfn(s, opaque, false);
+ break;
+ }
+ tmp = loadfn(s, opaque, true);
+ store_cpu_field(tmp, v7m.vpr);
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ break;
+ case ARM_VFP_P0:
+ {
+ TCGv_i32 vpr;
+ tmp = loadfn(s, opaque, true);
+ vpr = load_cpu_field(v7m.vpr);
+ tcg_gen_deposit_i32(vpr, vpr, tmp,
+ R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
+ store_cpu_field(vpr, v7m.vpr);
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ tcg_temp_free_i32(tmp);
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+ if (lab_end) {
+ gen_set_label(lab_end);
+ }
+ return true;
+}
+
+static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
+ fp_sysreg_storefn *storefn,
+ void *opaque)
+{
+ /* Do a read from an M-profile floating point system register */
+ TCGv_i32 tmp;
+ TCGLabel *lab_end = NULL;
+ bool lookup_tb = false;
+
+ switch (fp_sysreg_checks(s, regno)) {
+ case FPSysRegCheckFailed:
+ return false;
+ case FPSysRegCheckDone:
+ return true;
+ case FPSysRegCheckContinue:
+ break;
+ }
+
+ if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) {
+ /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */
+ regno = QEMU_VFP_FPSCR_NZCV;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ tmp = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_FPSCR_NZCVQC:
+ tmp = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK);
+ storefn(s, opaque, tmp, true);
+ break;
+ case QEMU_VFP_FPSCR_NZCV:
+ /*
+ * Read just NZCV; this is a special case to avoid the
+ * helper call for the "VMRS to CPSR.NZCV" insn.
+ */
+ tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_FPCXT_S:
+ {
+ TCGv_i32 control, sfpa, fpscr;
+ /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */
+ tmp = tcg_temp_new_i32();
+ sfpa = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
+ tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
+ tcg_gen_or_i32(tmp, tmp, sfpa);
+ tcg_temp_free_i32(sfpa);
+ /*
+ * Store result before updating FPSCR etc, in case
+ * it is a memory write which causes an exception.
+ */
+ storefn(s, opaque, tmp, true);
+ /*
+ * Now we must reset FPSCR from FPDSCR_NS, and clear
+ * CONTROL.SFPA; so we'll end the TB here.
+ */
+ tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(fpscr);
+ lookup_tb = true;
+ break;
+ }
+ case ARM_VFP_FPCXT_NS:
+ {
+ TCGv_i32 control, sfpa, fpscr, fpdscr;
+ TCGLabel *lab_active = gen_new_label();
+
+ lookup_tb = true;
+
+ gen_branch_fpInactive(s, TCG_COND_EQ, lab_active);
+ /* fpInactive case: reads as FPDSCR_NS */
+ TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ storefn(s, opaque, tmp, true);
+ lab_end = gen_new_label();
+ tcg_gen_br(lab_end);
+
+ gen_set_label(lab_active);
+ /*
+ * !fpInactive: if FPU disabled, take NOCP exception;
+ * otherwise PreserveFPState(), and then FPCXT_NS
+ * reads the same as FPCXT_S.
+ */
+ if (!vfp_access_check_m(s, true)) {
+ /*
+ * This was only a conditional exception, so override
+ * gen_exception_insn_el()'s default to DISAS_NORETURN
+ */
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+ }
+ tmp = tcg_temp_new_i32();
+ sfpa = tcg_temp_new_i32();
+ fpscr = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(fpscr, cpu_env);
+ tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
+ tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
+ tcg_gen_or_i32(tmp, tmp, sfpa);
+ tcg_temp_free_i32(control);
+ /* Store result before updating FPSCR, in case it faults */
+ storefn(s, opaque, tmp, true);
+ /* If SFPA is zero then set FPSCR from FPDSCR_NS */
+ fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, tcg_constant_i32(0),
+ fpdscr, fpscr);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(sfpa);
+ tcg_temp_free_i32(fpdscr);
+ tcg_temp_free_i32(fpscr);
+ break;
+ }
+ case ARM_VFP_VPR:
+ /* Behaves as NOP if not privileged */
+ if (IS_USER(s)) {
+ storefn(s, opaque, NULL, false);
+ break;
+ }
+ tmp = load_cpu_field(v7m.vpr);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_P0:
+ tmp = load_cpu_field(v7m.vpr);
+ tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
+ storefn(s, opaque, tmp, true);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (lab_end) {
+ gen_set_label(lab_end);
+ }
+ if (lookup_tb) {
+ gen_lookup_tb(s);
+ }
+ return true;
+}
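
[Editorial sketch, not part of the patch: the FPCXT_S value composed by the read
path above, as a standalone C helper. The function name is invented for
illustration; the mask assumes FPCR_NZCV_MASK covers bits [31:28], matching how
this file uses it.]

#include <stdint.h>
#include <stdbool.h>

static inline uint32_t fpcxt_s_compose(uint32_t fpscr, bool sfpa)
{
    /* bits [27:0] come from FPSCR; bit [31] is CONTROL.SFPA */
    return (fpscr & ~(0xfu << 28)) | ((uint32_t)sfpa << 31);
}
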
+
+static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access)
+{
+ arg_VMSR_VMRS *a = opaque;
+
+ if (!do_access) {
+ return;
+ }
+
+ if (a->rt == 15) {
+ /* Set the 4 flag bits in the CPSR */
+ gen_set_nzcv(value);
+ tcg_temp_free_i32(value);
+ } else {
+ store_reg(s, a->rt, value);
+ }
+}
+
+static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access)
+{
+ arg_VMSR_VMRS *a = opaque;
+
+ if (!do_access) {
+ return NULL;
+ }
+ return load_reg(s, a->rt);
+}
+
+static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
+{
+ /*
+ * Accesses to R15 are UNPREDICTABLE; we choose to undef.
+ * FPSCR -> r15 is a special case which writes to the PSR flags;
+ * set a->reg to a special value to tell gen_M_fp_sysreg_read()
+ * we only care about the top 4 bits of FPSCR there.
+ */
+ if (a->rt == 15) {
+ if (a->l && a->reg == ARM_VFP_FPSCR) {
+ a->reg = QEMU_VFP_FPSCR_NZCV;
+ } else {
+ return false;
+ }
+ }
+
+ if (a->l) {
+ /* VMRS, move FP system register to gp register */
+ return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a);
+ } else {
+ /* VMSR, move gp register to FP system register */
+ return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a);
+ }
+}
+
+static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access)
+{
+ arg_vldr_sysreg *a = opaque;
+ uint32_t offset = a->imm;
+ TCGv_i32 addr;
+
+ if (!a->a) {
+ offset = -offset;
+ }
+
+ if (!do_access && !a->w) {
+ return;
+ }
+
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ if (do_access) {
+ gen_aa32_st_i32(s, value, addr, get_mem_index(s),
+ MO_UL | MO_ALIGN | s->be_data);
+ tcg_temp_free_i32(value);
+ }
+
+ if (a->w) {
+ /* writeback */
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+}
+
+static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque,
+ bool do_access)
+{
+ arg_vldr_sysreg *a = opaque;
+ uint32_t offset = a->imm;
+ TCGv_i32 addr;
+ TCGv_i32 value = NULL;
+
+ if (!a->a) {
+ offset = -offset;
+ }
+
+ if (!do_access && !a->w) {
+ return NULL;
+ }
+
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ if (do_access) {
+ value = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, value, addr, get_mem_index(s),
+ MO_UL | MO_ALIGN | s->be_data);
+ }
+
+ if (a->w) {
+ /* writeback */
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+ return value;
+}
+
+static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ if (a->rn == 15) {
+ return false;
+ }
+ return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a);
+}
+
+static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ if (a->rn == 15) {
+ return false;
+ }
+ return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a);
+}
+
+static bool trans_NOCP(DisasContext *s, arg_nocp *a)
+{
+ /*
+ * Handle M-profile early check for disabled coprocessor:
+ * all we need to do here is emit the NOCP exception if
+ * the coprocessor is disabled. Otherwise we return false
+ * and the real VFP/etc decode will handle the insn.
+ */
+ assert(arm_dc_feature(s, ARM_FEATURE_M));
+
+ if (a->cp == 11) {
+ a->cp = 10;
+ }
+ if (arm_dc_feature(s, ARM_FEATURE_V8_1M) &&
+ (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) {
+ /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */
+ a->cp = 10;
+ }
+
+ if (a->cp != 10) {
+ gen_exception_insn(s, 0, EXCP_NOCP, syn_uncategorized());
+ return true;
+ }
+
+ if (s->fp_excp_el != 0) {
+ gen_exception_insn_el(s, 0, EXCP_NOCP,
+ syn_uncategorized(), s->fp_excp_el);
+ return true;
+ }
+
+ return false;
+}
+
+static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a)
+{
+ /* This range needs a coprocessor check for v8.1M and later only */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ return trans_NOCP(s, a);
+}
diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
new file mode 100644
index 0000000..db7ea3f
--- /dev/null
+++ b/target/arm/tcg/translate-mve.c
@@ -0,0 +1,2310 @@
+/*
+ * ARM translation: M-profile MVE instructions
+ *
+ * Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+static inline int vidup_imm(DisasContext *s, int x)
+{
+ return 1 << x;
+}
+
+/* Include the generated decoder */
+#include "decode-mve.c.inc"
+
+typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenLdStSGFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenLdStIlFn(TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenTwoOpShiftFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenLongDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64);
+typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64);
+typedef void MVEGenVIDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void MVEGenVIWDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
+typedef void MVEGenCmpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void MVEGenScalarCmpFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenVABAVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenDualAccOpFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenVCVTRmodeFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+
+/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */
+static inline long mve_qreg_offset(unsigned reg)
+{
+ return offsetof(CPUARMState, vfp.zregs[reg].d[0]);
+}
+
+static TCGv_ptr mve_qreg_ptr(unsigned reg)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ret, cpu_env, mve_qreg_offset(reg));
+ return ret;
+}
+
+static bool mve_no_predication(DisasContext *s)
+{
+ /*
+ * Return true if we are executing the entire MVE instruction
+ * with no predication or partial-execution, and so we can safely
+ * use an inline TCG vector implementation.
+ */
+ return s->eci == 0 && s->mve_no_pred;
+}
+
+static bool mve_check_qreg_bank(DisasContext *s, int qmask)
+{
+ /*
+ * Check whether Qregs are in range. For v8.1M only Q0..Q7
+ * are supported, see VFPSmallRegisterBank().
+ */
+ return qmask < 8;
+}
+
+bool mve_eci_check(DisasContext *s)
+{
+ /*
+ * This is a beatwise insn: check that ECI is valid (not a
+ * reserved value) and note that we are handling it.
+ * Return true if OK, false if we generated an exception.
+ */
+ s->eci_handled = true;
+ switch (s->eci) {
+ case ECI_NONE:
+ case ECI_A0:
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return true;
+ default:
+ /* Reserved value: INVSTATE UsageFault */
+ gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized());
+ return false;
+ }
+}
+
+void mve_update_eci(DisasContext *s)
+{
+ /*
+ * The helper function will always update the CPUState field,
+ * so we only need to update the DisasContext field.
+ */
+ if (s->eci) {
+ s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE;
+ }
+}
+
+void mve_update_and_store_eci(DisasContext *s)
+{
+ /*
+ * For insns which don't call a helper function that will call
+ * mve_advance_vpt(), this version updates s->eci and also stores
+ * it out to the CPUState field.
+ */
+ if (s->eci) {
+ mve_update_eci(s);
+ store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits);
+ }
+}
+
+static bool mve_skip_first_beat(DisasContext *s)
+{
+ /* Return true if PSR.ECI says we must skip the first beat of this insn */
+ switch (s->eci) {
+ case ECI_NONE:
+ return false;
+ case ECI_A0:
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return true;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn,
+ unsigned msize)
+{
+ TCGv_i32 addr;
+ uint32_t offset;
+ TCGv_ptr qreg;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd) ||
+ !fn) {
+ return false;
+ }
+
+ /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+ if (a->rn == 15 || (a->rn == 13 && a->w)) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ offset = a->imm << msize;
+ if (!a->a) {
+ offset = -offset;
+ }
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ qreg = mve_qreg_ptr(a->qd);
+ fn(cpu_env, qreg, addr);
+ tcg_temp_free_ptr(qreg);
+
+ /*
+ * Writeback always happens after the last beat of the insn,
+ * regardless of predication
+ */
+ if (a->w) {
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a)
+{
+ static MVEGenLdStFn * const ldstfns[4][2] = {
+ { gen_helper_mve_vstrb, gen_helper_mve_vldrb },
+ { gen_helper_mve_vstrh, gen_helper_mve_vldrh },
+ { gen_helper_mve_vstrw, gen_helper_mve_vldrw },
+ { NULL, NULL }
+ };
+ return do_ldst(s, a, ldstfns[a->size][a->l], a->size);
+}
+
+#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST, MSIZE) \
+ static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \
+ { \
+ static MVEGenLdStFn * const ldstfns[2][2] = { \
+ { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \
+ { NULL, gen_helper_mve_##ULD }, \
+ }; \
+ return do_ldst(s, a, ldstfns[a->u][a->l], MSIZE); \
+ }
+
+DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h, MO_8)
+DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w, MO_8)
+DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w, MO_16)
+
+static bool do_ldst_sg(DisasContext *s, arg_vldst_sg *a, MVEGenLdStSGFn fn)
+{
+ TCGv_i32 addr;
+ TCGv_ptr qd, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn || a->rn == 15) {
+ /* Rn case is UNPREDICTABLE */
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ addr = load_reg(s, a->rn);
+
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm, addr);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ tcg_temp_free_i32(addr);
+ mve_update_eci(s);
+ return true;
+}
+
+/*
+ * The naming scheme here is "vldrb_sg_sh == in-memory byte loads
+ * sign-extended to halfword elements in register". _os_ indicates that
+ * the offsets in Qm should be scaled by the element size.
+ */
+/* This macro is just to make the arrays more compact in these functions */
+#define F(N) gen_helper_mve_##N
+
+/* VLDRB/VSTRB (ie msize 1) with OS=1 is UNPREDICTABLE; we UNDEF */
+static bool trans_VLDR_S_sg(DisasContext *s, arg_vldst_sg *a)
+{
+ static MVEGenLdStSGFn * const fns[2][4][4] = { {
+ { NULL, F(vldrb_sg_sh), F(vldrb_sg_sw), NULL },
+ { NULL, NULL, F(vldrh_sg_sw), NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+ }, {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, F(vldrh_sg_os_sw), NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+ }
+ };
+ if (a->qd == a->qm) {
+ return false; /* UNPREDICTABLE */
+ }
+ return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]);
+}
+
+static bool trans_VLDR_U_sg(DisasContext *s, arg_vldst_sg *a)
+{
+ static MVEGenLdStSGFn * const fns[2][4][4] = { {
+ { F(vldrb_sg_ub), F(vldrb_sg_uh), F(vldrb_sg_uw), NULL },
+ { NULL, F(vldrh_sg_uh), F(vldrh_sg_uw), NULL },
+ { NULL, NULL, F(vldrw_sg_uw), NULL },
+ { NULL, NULL, NULL, F(vldrd_sg_ud) }
+ }, {
+ { NULL, NULL, NULL, NULL },
+ { NULL, F(vldrh_sg_os_uh), F(vldrh_sg_os_uw), NULL },
+ { NULL, NULL, F(vldrw_sg_os_uw), NULL },
+ { NULL, NULL, NULL, F(vldrd_sg_os_ud) }
+ }
+ };
+ if (a->qd == a->qm) {
+ return false; /* UNPREDICTABLE */
+ }
+ return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]);
+}
+
+static bool trans_VSTR_sg(DisasContext *s, arg_vldst_sg *a)
+{
+ static MVEGenLdStSGFn * const fns[2][4][4] = { {
+ { F(vstrb_sg_ub), F(vstrb_sg_uh), F(vstrb_sg_uw), NULL },
+ { NULL, F(vstrh_sg_uh), F(vstrh_sg_uw), NULL },
+ { NULL, NULL, F(vstrw_sg_uw), NULL },
+ { NULL, NULL, NULL, F(vstrd_sg_ud) }
+ }, {
+ { NULL, NULL, NULL, NULL },
+ { NULL, F(vstrh_sg_os_uh), F(vstrh_sg_os_uw), NULL },
+ { NULL, NULL, F(vstrw_sg_os_uw), NULL },
+ { NULL, NULL, NULL, F(vstrd_sg_os_ud) }
+ }
+ };
+ return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]);
+}
+
+#undef F
+
+static bool do_ldst_sg_imm(DisasContext *s, arg_vldst_sg_imm *a,
+ MVEGenLdStSGFn *fn, unsigned msize)
+{
+ uint32_t offset;
+ TCGv_ptr qd, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ offset = a->imm << msize;
+ if (!a->a) {
+ offset = -offset;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm, tcg_constant_i32(offset));
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VLDRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
+{
+ static MVEGenLdStSGFn * const fns[] = {
+ gen_helper_mve_vldrw_sg_uw,
+ gen_helper_mve_vldrw_sg_wb_uw,
+ };
+ if (a->qd == a->qm) {
+ return false; /* UNPREDICTABLE */
+ }
+ return do_ldst_sg_imm(s, a, fns[a->w], MO_32);
+}
+
+static bool trans_VLDRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
+{
+ static MVEGenLdStSGFn * const fns[] = {
+ gen_helper_mve_vldrd_sg_ud,
+ gen_helper_mve_vldrd_sg_wb_ud,
+ };
+ if (a->qd == a->qm) {
+ return false; /* UNPREDICTABLE */
+ }
+ return do_ldst_sg_imm(s, a, fns[a->w], MO_64);
+}
+
+static bool trans_VSTRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
+{
+ static MVEGenLdStSGFn * const fns[] = {
+ gen_helper_mve_vstrw_sg_uw,
+ gen_helper_mve_vstrw_sg_wb_uw,
+ };
+ return do_ldst_sg_imm(s, a, fns[a->w], MO_32);
+}
+
+static bool trans_VSTRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
+{
+ static MVEGenLdStSGFn * const fns[] = {
+ gen_helper_mve_vstrd_sg_ud,
+ gen_helper_mve_vstrd_sg_wb_ud,
+ };
+ return do_ldst_sg_imm(s, a, fns[a->w], MO_64);
+}
+
+static bool do_vldst_il(DisasContext *s, arg_vldst_il *a, MVEGenLdStIlFn *fn,
+ int addrinc)
+{
+ TCGv_i32 rn;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd) ||
+ !fn || (a->rn == 13 && a->w) || a->rn == 15) {
+ /* Variously UNPREDICTABLE or UNDEF or related-encoding */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ rn = load_reg(s, a->rn);
+ /*
+ * We pass the index of Qd, not a pointer, because the helper must
+ * access multiple Q registers starting at Qd and working up.
+ */
+ fn(cpu_env, tcg_constant_i32(a->qd), rn);
+
+ if (a->w) {
+ tcg_gen_addi_i32(rn, rn, addrinc);
+ store_reg(s, a->rn, rn);
+ } else {
+ tcg_temp_free_i32(rn);
+ }
+ mve_update_and_store_eci(s);
+ return true;
+}
+
+/* This macro is just to make the arrays more compact in these functions */
+#define F(N) gen_helper_mve_##N
+
+static bool trans_VLD2(DisasContext *s, arg_vldst_il *a)
+{
+ static MVEGenLdStIlFn * const fns[4][4] = {
+ { F(vld20b), F(vld20h), F(vld20w), NULL, },
+ { F(vld21b), F(vld21h), F(vld21w), NULL, },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ };
+ if (a->qd > 6) {
+ return false;
+ }
+ return do_vldst_il(s, a, fns[a->pat][a->size], 32);
+}
+
+static bool trans_VLD4(DisasContext *s, arg_vldst_il *a)
+{
+ static MVEGenLdStIlFn * const fns[4][4] = {
+ { F(vld40b), F(vld40h), F(vld40w), NULL, },
+ { F(vld41b), F(vld41h), F(vld41w), NULL, },
+ { F(vld42b), F(vld42h), F(vld42w), NULL, },
+ { F(vld43b), F(vld43h), F(vld43w), NULL, },
+ };
+ if (a->qd > 4) {
+ return false;
+ }
+ return do_vldst_il(s, a, fns[a->pat][a->size], 64);
+}
+
+static bool trans_VST2(DisasContext *s, arg_vldst_il *a)
+{
+ static MVEGenLdStIlFn * const fns[4][4] = {
+ { F(vst20b), F(vst20h), F(vst20w), NULL, },
+ { F(vst21b), F(vst21h), F(vst21w), NULL, },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ };
+ if (a->qd > 6) {
+ return false;
+ }
+ return do_vldst_il(s, a, fns[a->pat][a->size], 32);
+}
+
+static bool trans_VST4(DisasContext *s, arg_vldst_il *a)
+{
+ static MVEGenLdStIlFn * const fns[4][4] = {
+ { F(vst40b), F(vst40h), F(vst40w), NULL, },
+ { F(vst41b), F(vst41h), F(vst41w), NULL, },
+ { F(vst42b), F(vst42h), F(vst42w), NULL, },
+ { F(vst43b), F(vst43h), F(vst43w), NULL, },
+ };
+ if (a->qd > 4) {
+ return false;
+ }
+ return do_vldst_il(s, a, fns[a->pat][a->size], 64);
+}
+
+#undef F
+
+static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
+{
+ TCGv_ptr qd;
+ TCGv_i32 rt;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd)) {
+ return false;
+ }
+ if (a->rt == 13 || a->rt == 15) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ rt = load_reg(s, a->rt);
+ if (mve_no_predication(s)) {
+ tcg_gen_gvec_dup_i32(a->size, mve_qreg_offset(a->qd), 16, 16, rt);
+ } else {
+ qd = mve_qreg_ptr(a->qd);
+ tcg_gen_dup_i32(a->size, rt, rt);
+ gen_helper_mve_vdup(cpu_env, qd, rt);
+ tcg_temp_free_ptr(qd);
+ }
+ tcg_temp_free_i32(rt);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_1op_vec(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn,
+ GVecGen2Fn vecfn)
+{
+ TCGv_ptr qd, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ if (vecfn && mve_no_predication(s)) {
+ vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), 16, 16);
+ } else {
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn)
+{
+ return do_1op_vec(s, a, fn, NULL);
+}
+
+#define DO_1OP_VEC(INSN, FN, VECFN) \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ static MVEGenOneOpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_1op_vec(s, a, fns[a->size], VECFN); \
+ }
+
+#define DO_1OP(INSN, FN) DO_1OP_VEC(INSN, FN, NULL)
+
+DO_1OP(VCLZ, vclz)
+DO_1OP(VCLS, vcls)
+DO_1OP_VEC(VABS, vabs, tcg_gen_gvec_abs)
+DO_1OP_VEC(VNEG, vneg, tcg_gen_gvec_neg)
+DO_1OP(VQABS, vqabs)
+DO_1OP(VQNEG, vqneg)
+DO_1OP(VMAXA, vmaxa)
+DO_1OP(VMINA, vmina)
+
+/*
+ * For simple float/int conversions we use the fixed-point
+ * conversion helpers with a zero shift count
+ */
+#define DO_VCVT(INSN, HFN, SFN) \
+ static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \
+ { \
+ gen_helper_mve_##HFN(env, qd, qm, tcg_constant_i32(0)); \
+ } \
+ static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \
+ { \
+ gen_helper_mve_##SFN(env, qd, qm, tcg_constant_i32(0)); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ static MVEGenOneOpFn * const fns[] = { \
+ NULL, \
+ gen_##INSN##h, \
+ gen_##INSN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_1op(s, a, fns[a->size]); \
+ }
+
+DO_VCVT(VCVT_SF, vcvt_sh, vcvt_sf)
+DO_VCVT(VCVT_UF, vcvt_uh, vcvt_uf)
+DO_VCVT(VCVT_FS, vcvt_hs, vcvt_fs)
+DO_VCVT(VCVT_FU, vcvt_hu, vcvt_fu)
+
+static bool do_vcvt_rmode(DisasContext *s, arg_1op *a,
+ enum arm_fprounding rmode, bool u)
+{
+ /*
+ * Handle VCVT fp to int with specified rounding mode.
+ * This is a 1op fn but we must pass the rounding mode as
+ * an immediate to the helper.
+ */
+ TCGv_ptr qd, qm;
+ static MVEGenVCVTRmodeFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vcvt_rm_sh, gen_helper_mve_vcvt_rm_uh },
+ { gen_helper_mve_vcvt_rm_ss, gen_helper_mve_vcvt_rm_us },
+ { NULL, NULL },
+ };
+ MVEGenVCVTRmodeFn *fn = fns[a->size][u];
+
+ if (!dc_isar_feature(aa32_mve_fp, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm, tcg_constant_i32(arm_rmode_to_sf(rmode)));
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_VCVT_RMODE(INSN, RMODE, U) \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ return do_vcvt_rmode(s, a, RMODE, U); \
+ } \
+
+DO_VCVT_RMODE(VCVTAS, FPROUNDING_TIEAWAY, false)
+DO_VCVT_RMODE(VCVTAU, FPROUNDING_TIEAWAY, true)
+DO_VCVT_RMODE(VCVTNS, FPROUNDING_TIEEVEN, false)
+DO_VCVT_RMODE(VCVTNU, FPROUNDING_TIEEVEN, true)
+DO_VCVT_RMODE(VCVTPS, FPROUNDING_POSINF, false)
+DO_VCVT_RMODE(VCVTPU, FPROUNDING_POSINF, true)
+DO_VCVT_RMODE(VCVTMS, FPROUNDING_NEGINF, false)
+DO_VCVT_RMODE(VCVTMU, FPROUNDING_NEGINF, true)
+
+#define DO_VCVT_SH(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_1op(s, a, gen_helper_mve_##FN); \
+ } \
+
+DO_VCVT_SH(VCVTB_SH, vcvtb_sh)
+DO_VCVT_SH(VCVTT_SH, vcvtt_sh)
+DO_VCVT_SH(VCVTB_HS, vcvtb_hs)
+DO_VCVT_SH(VCVTT_HS, vcvtt_hs)
+
+#define DO_VRINT(INSN, RMODE) \
+ static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \
+ { \
+ gen_helper_mve_vrint_rm_h(env, qd, qm, \
+ tcg_constant_i32(arm_rmode_to_sf(RMODE))); \
+ } \
+ static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \
+ { \
+ gen_helper_mve_vrint_rm_s(env, qd, qm, \
+ tcg_constant_i32(arm_rmode_to_sf(RMODE))); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ static MVEGenOneOpFn * const fns[] = { \
+ NULL, \
+ gen_##INSN##h, \
+ gen_##INSN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_1op(s, a, fns[a->size]); \
+ }
+
+DO_VRINT(VRINTN, FPROUNDING_TIEEVEN)
+DO_VRINT(VRINTA, FPROUNDING_TIEAWAY)
+DO_VRINT(VRINTZ, FPROUNDING_ZERO)
+DO_VRINT(VRINTM, FPROUNDING_NEGINF)
+DO_VRINT(VRINTP, FPROUNDING_POSINF)
+
+static bool trans_VRINTX(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vrintx_h,
+ gen_helper_mve_vrintx_s,
+ NULL,
+ };
+ if (!dc_isar_feature(aa32_mve_fp, s)) {
+ return false;
+ }
+ return do_1op(s, a, fns[a->size]);
+}
+
+/* Narrowing moves: only size 0 and 1 are valid */
+#define DO_VMOVN(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ static MVEGenOneOpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ NULL, \
+ NULL, \
+ }; \
+ return do_1op(s, a, fns[a->size]); \
+ }
+
+DO_VMOVN(VMOVNB, vmovnb)
+DO_VMOVN(VMOVNT, vmovnt)
+DO_VMOVN(VQMOVUNB, vqmovunb)
+DO_VMOVN(VQMOVUNT, vqmovunt)
+DO_VMOVN(VQMOVN_BS, vqmovnbs)
+DO_VMOVN(VQMOVN_TS, vqmovnts)
+DO_VMOVN(VQMOVN_BU, vqmovnbu)
+DO_VMOVN(VQMOVN_TU, vqmovntu)
+
+static bool trans_VREV16(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev16b,
+ NULL,
+ NULL,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VREV32(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev32b,
+ gen_helper_mve_vrev32h,
+ NULL,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VREV64(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev64b,
+ gen_helper_mve_vrev64h,
+ gen_helper_mve_vrev64w,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VMVN(DisasContext *s, arg_1op *a)
+{
+ return do_1op_vec(s, a, gen_helper_mve_vmvn, tcg_gen_gvec_not);
+}
+
+static bool trans_VABS_fp(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vfabsh,
+ gen_helper_mve_vfabss,
+ NULL,
+ };
+ if (!dc_isar_feature(aa32_mve_fp, s)) {
+ return false;
+ }
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VNEG_fp(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vfnegh,
+ gen_helper_mve_vfnegs,
+ NULL,
+ };
+ if (!dc_isar_feature(aa32_mve_fp, s)) {
+ return false;
+ }
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool do_2op_vec(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn,
+ GVecGen3Fn *vecfn)
+{
+ TCGv_ptr qd, qn, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ if (vecfn && mve_no_predication(s)) {
+ vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qn),
+ mve_qreg_offset(a->qm), 16, 16);
+ } else {
+ qd = mve_qreg_ptr(a->qd);
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qn, qm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn *fn)
+{
+ return do_2op_vec(s, a, fn, NULL);
+}
+
+#define DO_LOGIC(INSN, HELPER, VECFN) \
+ static bool trans_##INSN(DisasContext *s, arg_2op *a) \
+ { \
+ return do_2op_vec(s, a, HELPER, VECFN); \
+ }
+
+DO_LOGIC(VAND, gen_helper_mve_vand, tcg_gen_gvec_and)
+DO_LOGIC(VBIC, gen_helper_mve_vbic, tcg_gen_gvec_andc)
+DO_LOGIC(VORR, gen_helper_mve_vorr, tcg_gen_gvec_or)
+DO_LOGIC(VORN, gen_helper_mve_vorn, tcg_gen_gvec_orc)
+DO_LOGIC(VEOR, gen_helper_mve_veor, tcg_gen_gvec_xor)
+
+static bool trans_VPSEL(DisasContext *s, arg_2op *a)
+{
+ /* This insn updates predication bits */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ return do_2op(s, a, gen_helper_mve_vpsel);
+}
+
+#define DO_2OP_VEC(INSN, FN, VECFN) \
+ static bool trans_##INSN(DisasContext *s, arg_2op *a) \
+ { \
+ static MVEGenTwoOpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2op_vec(s, a, fns[a->size], VECFN); \
+ }
+
+#define DO_2OP(INSN, FN) DO_2OP_VEC(INSN, FN, NULL)
+
+DO_2OP_VEC(VADD, vadd, tcg_gen_gvec_add)
+DO_2OP_VEC(VSUB, vsub, tcg_gen_gvec_sub)
+DO_2OP_VEC(VMUL, vmul, tcg_gen_gvec_mul)
+DO_2OP(VMULH_S, vmulhs)
+DO_2OP(VMULH_U, vmulhu)
+DO_2OP(VRMULH_S, vrmulhs)
+DO_2OP(VRMULH_U, vrmulhu)
+DO_2OP_VEC(VMAX_S, vmaxs, tcg_gen_gvec_smax)
+DO_2OP_VEC(VMAX_U, vmaxu, tcg_gen_gvec_umax)
+DO_2OP_VEC(VMIN_S, vmins, tcg_gen_gvec_smin)
+DO_2OP_VEC(VMIN_U, vminu, tcg_gen_gvec_umin)
+DO_2OP(VABD_S, vabds)
+DO_2OP(VABD_U, vabdu)
+DO_2OP(VHADD_S, vhadds)
+DO_2OP(VHADD_U, vhaddu)
+DO_2OP(VHSUB_S, vhsubs)
+DO_2OP(VHSUB_U, vhsubu)
+DO_2OP(VMULL_BS, vmullbs)
+DO_2OP(VMULL_BU, vmullbu)
+DO_2OP(VMULL_TS, vmullts)
+DO_2OP(VMULL_TU, vmulltu)
+DO_2OP(VQDMULH, vqdmulh)
+DO_2OP(VQRDMULH, vqrdmulh)
+DO_2OP(VQADD_S, vqadds)
+DO_2OP(VQADD_U, vqaddu)
+DO_2OP(VQSUB_S, vqsubs)
+DO_2OP(VQSUB_U, vqsubu)
+DO_2OP(VSHL_S, vshls)
+DO_2OP(VSHL_U, vshlu)
+DO_2OP(VRSHL_S, vrshls)
+DO_2OP(VRSHL_U, vrshlu)
+DO_2OP(VQSHL_S, vqshls)
+DO_2OP(VQSHL_U, vqshlu)
+DO_2OP(VQRSHL_S, vqrshls)
+DO_2OP(VQRSHL_U, vqrshlu)
+DO_2OP(VQDMLADH, vqdmladh)
+DO_2OP(VQDMLADHX, vqdmladhx)
+DO_2OP(VQRDMLADH, vqrdmladh)
+DO_2OP(VQRDMLADHX, vqrdmladhx)
+DO_2OP(VQDMLSDH, vqdmlsdh)
+DO_2OP(VQDMLSDHX, vqdmlsdhx)
+DO_2OP(VQRDMLSDH, vqrdmlsdh)
+DO_2OP(VQRDMLSDHX, vqrdmlsdhx)
+DO_2OP(VRHADD_S, vrhadds)
+DO_2OP(VRHADD_U, vrhaddu)
+/*
+ * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose
+ * so we can reuse the DO_2OP macro. (Our implementation calculates the
+ * "expected" results in this case.) Similarly for VHCADD.
+ */
+DO_2OP(VCADD90, vcadd90)
+DO_2OP(VCADD270, vcadd270)
+DO_2OP(VHCADD90, vhcadd90)
+DO_2OP(VHCADD270, vhcadd270)
+
+static bool trans_VQDMULLB(DisasContext *s, arg_2op *a)
+{
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullbh,
+ gen_helper_mve_vqdmullbw,
+ NULL,
+ };
+ if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) {
+ /* UNPREDICTABLE; we choose to undef */
+ return false;
+ }
+ return do_2op(s, a, fns[a->size]);
+}
+
+static bool trans_VQDMULLT(DisasContext *s, arg_2op *a)
+{
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullth,
+ gen_helper_mve_vqdmulltw,
+ NULL,
+ };
+ if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) {
+ /* UNPREDICTABLE; we choose to undef */
+ return false;
+ }
+ return do_2op(s, a, fns[a->size]);
+}
+
+static bool trans_VMULLP_B(DisasContext *s, arg_2op *a)
+{
+ /*
+ * Note that a->size indicates the output size, ie VMULL.P8
+ * is the 8x8->16 operation and a->size is MO_16; VMULL.P16
+ * is the 16x16->32 operation and a->size is MO_32.
+ */
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vmullpbh,
+ gen_helper_mve_vmullpbw,
+ NULL,
+ };
+ return do_2op(s, a, fns[a->size]);
+}
+
+static bool trans_VMULLP_T(DisasContext *s, arg_2op *a)
+{
+ /* a->size is as for trans_VMULLP_B */
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vmullpth,
+ gen_helper_mve_vmullptw,
+ NULL,
+ };
+ return do_2op(s, a, fns[a->size]);
+}
+
+/*
+ * VADC and VSBC: these perform an add-with-carry or subtract-with-carry
+ * of the 32-bit elements in each lane of the input vectors, where the
+ * carry-out of each add is the carry-in of the next. The initial carry
+ * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C
+ * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C.
+ * These insns are subject to beat-wise execution. Partial execution
+ * of an I=1 (initial carry input fixed) insn which does not
+ * execute the first beat must start with the current FPSCR.NZCV
+ * value, not the fixed constant input.
+ */
+static bool trans_VADC(DisasContext *s, arg_2op *a)
+{
+ return do_2op(s, a, gen_helper_mve_vadc);
+}
+
+static bool trans_VADCI(DisasContext *s, arg_2op *a)
+{
+ if (mve_skip_first_beat(s)) {
+ return trans_VADC(s, a);
+ }
+ return do_2op(s, a, gen_helper_mve_vadci);
+}
+
+static bool trans_VSBC(DisasContext *s, arg_2op *a)
+{
+ return do_2op(s, a, gen_helper_mve_vsbc);
+}
+
+static bool trans_VSBCI(DisasContext *s, arg_2op *a)
+{
+ if (mve_skip_first_beat(s)) {
+ return trans_VSBC(s, a);
+ }
+ return do_2op(s, a, gen_helper_mve_vsbci);
+}
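
[Editorial sketch, not part of the patch: the element-wise carry chain described
in the comment above, as reference C over the four 32-bit elements of one
vector, ignoring predication and beat-wise execution. The function name is
invented for illustration.]

#include <stdint.h>

static uint32_t vadc_ref(uint32_t d[4], const uint32_t n[4],
                         const uint32_t m[4], uint32_t carry_in)
{
    /* carry_in is FPSCR.C for VADC, or the fixed 0 initial value for VADCI */
    for (int e = 0; e < 4; e++) {
        uint64_t r = (uint64_t)n[e] + m[e] + carry_in;
        d[e] = (uint32_t)r;
        carry_in = (uint32_t)(r >> 32);  /* carry-out feeds the next element */
    }
    return carry_in;                     /* written back to FPSCR.C */
}
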
+
+#define DO_2OP_FP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2op *a) \
+ { \
+ static MVEGenTwoOpFn * const fns[] = { \
+ NULL, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_2op(s, a, fns[a->size]); \
+ }
+
+DO_2OP_FP(VADD_fp, vfadd)
+DO_2OP_FP(VSUB_fp, vfsub)
+DO_2OP_FP(VMUL_fp, vfmul)
+DO_2OP_FP(VABD_fp, vfabd)
+DO_2OP_FP(VMAXNM, vmaxnm)
+DO_2OP_FP(VMINNM, vminnm)
+DO_2OP_FP(VCADD90_fp, vfcadd90)
+DO_2OP_FP(VCADD270_fp, vfcadd270)
+DO_2OP_FP(VFMA, vfma)
+DO_2OP_FP(VFMS, vfms)
+DO_2OP_FP(VCMUL0, vcmul0)
+DO_2OP_FP(VCMUL90, vcmul90)
+DO_2OP_FP(VCMUL180, vcmul180)
+DO_2OP_FP(VCMUL270, vcmul270)
+DO_2OP_FP(VCMLA0, vcmla0)
+DO_2OP_FP(VCMLA90, vcmla90)
+DO_2OP_FP(VCMLA180, vcmla180)
+DO_2OP_FP(VCMLA270, vcmla270)
+DO_2OP_FP(VMAXNMA, vmaxnma)
+DO_2OP_FP(VMINNMA, vminnma)
+
+static bool do_2op_scalar(DisasContext *s, arg_2scalar *a,
+ MVEGenTwoOpScalarFn fn)
+{
+ TCGv_ptr qd, qn;
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qn) ||
+ !fn) {
+ return false;
+ }
+ if (a->rm == 13 || a->rm == 15) {
+ /* UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qn = mve_qreg_ptr(a->qn);
+ rm = load_reg(s, a->rm);
+ fn(cpu_env, qd, qn, rm);
+ tcg_temp_free_i32(rm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qn);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_2OP_SCALAR(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \
+ { \
+ static MVEGenTwoOpScalarFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2op_scalar(s, a, fns[a->size]); \
+ }
+
+DO_2OP_SCALAR(VADD_scalar, vadd_scalar)
+DO_2OP_SCALAR(VSUB_scalar, vsub_scalar)
+DO_2OP_SCALAR(VMUL_scalar, vmul_scalar)
+DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar)
+DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar)
+DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar)
+DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar)
+DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar)
+DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar)
+DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar)
+DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar)
+DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar)
+DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
+DO_2OP_SCALAR(VBRSR, vbrsr)
+DO_2OP_SCALAR(VMLA, vmla)
+DO_2OP_SCALAR(VMLAS, vmlas)
+DO_2OP_SCALAR(VQDMLAH, vqdmlah)
+DO_2OP_SCALAR(VQRDMLAH, vqrdmlah)
+DO_2OP_SCALAR(VQDMLASH, vqdmlash)
+DO_2OP_SCALAR(VQRDMLASH, vqrdmlash)
+
+static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
+{
+ static MVEGenTwoOpScalarFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullb_scalarh,
+ gen_helper_mve_vqdmullb_scalarw,
+ NULL,
+ };
+ if (a->qd == a->qn && a->size == MO_32) {
+ /* UNPREDICTABLE; we choose to undef */
+ return false;
+ }
+ return do_2op_scalar(s, a, fns[a->size]);
+}
+
+static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a)
+{
+ static MVEGenTwoOpScalarFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullt_scalarh,
+ gen_helper_mve_vqdmullt_scalarw,
+ NULL,
+ };
+ if (a->qd == a->qn && a->size == MO_32) {
+ /* UNPREDICTABLE; we choose to undef */
+ return false;
+ }
+ return do_2op_scalar(s, a, fns[a->size]);
+}
+
+
+#define DO_2OP_FP_SCALAR(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \
+ { \
+ static MVEGenTwoOpScalarFn * const fns[] = { \
+ NULL, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_2op_scalar(s, a, fns[a->size]); \
+ }
+
+DO_2OP_FP_SCALAR(VADD_fp_scalar, vfadd_scalar)
+DO_2OP_FP_SCALAR(VSUB_fp_scalar, vfsub_scalar)
+DO_2OP_FP_SCALAR(VMUL_fp_scalar, vfmul_scalar)
+DO_2OP_FP_SCALAR(VFMA_scalar, vfma_scalar)
+DO_2OP_FP_SCALAR(VFMAS_scalar, vfmas_scalar)
+
+static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a,
+ MVEGenLongDualAccOpFn *fn)
+{
+ TCGv_ptr qn, qm;
+ TCGv_i64 rda;
+ TCGv_i32 rdalo, rdahi;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qn | a->qm) ||
+ !fn) {
+ return false;
+ }
+ /*
+ * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+ * encoding; rdalo always has bit 0 clear so cannot be 13 or 15.
+ */
+ if (a->rdahi == 13 || a->rdahi == 15) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current rda value, not 0.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ rda = tcg_temp_new_i64();
+ rdalo = load_reg(s, a->rdalo);
+ rdahi = load_reg(s, a->rdahi);
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+ tcg_temp_free_i32(rdalo);
+ tcg_temp_free_i32(rdahi);
+ } else {
+ rda = tcg_const_i64(0);
+ }
+
+ fn(rda, cpu_env, qn, qm, rda);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+
+ rdalo = tcg_temp_new_i32();
+ rdahi = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(rdalo, rda);
+ tcg_gen_extrh_i64_i32(rdahi, rda);
+ store_reg(s, a->rdalo, rdalo);
+ store_reg(s, a->rdahi, rdahi);
+ tcg_temp_free_i64(rda);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh },
+ { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlaldavuh, NULL },
+ { gen_helper_mve_vmlaldavuw, NULL },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh },
+ { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlaldavhuw, NULL,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenLongDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool do_dual_acc(DisasContext *s, arg_vmladav *a, MVEGenDualAccOpFn *fn)
+{
+ TCGv_ptr qn, qm;
+ TCGv_i32 rda;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qn) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current rda value, not 0.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ rda = load_reg(s, a->rda);
+ } else {
+ rda = tcg_const_i32(0);
+ }
+
+ fn(rda, cpu_env, qn, qm, rda);
+ store_reg(s, a->rda, rda);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_DUAL_ACC(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vmladav *a) \
+ { \
+ static MVEGenDualAccOpFn * const fns[4][2] = { \
+ { gen_helper_mve_##FN##b, gen_helper_mve_##FN##xb }, \
+ { gen_helper_mve_##FN##h, gen_helper_mve_##FN##xh }, \
+ { gen_helper_mve_##FN##w, gen_helper_mve_##FN##xw }, \
+ { NULL, NULL }, \
+ }; \
+ return do_dual_acc(s, a, fns[a->size][a->x]); \
+ }
+
+DO_DUAL_ACC(VMLADAV_S, vmladavs)
+DO_DUAL_ACC(VMLSDAV, vmlsdav)
+
+static bool trans_VMLADAV_U(DisasContext *s, arg_vmladav *a)
+{
+ static MVEGenDualAccOpFn * const fns[4][2] = {
+ { gen_helper_mve_vmladavub, NULL },
+ { gen_helper_mve_vmladavuh, NULL },
+ { gen_helper_mve_vmladavuw, NULL },
+ { NULL, NULL },
+ };
+ return do_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static void gen_vpst(DisasContext *s, uint32_t mask)
+{
+ /*
+ * Set the VPR mask fields. We take advantage of MASK01 and MASK23
+ * being adjacent fields in the register.
+ *
+ * Updating the masks is not predicated, but it is subject to beat-wise
+ * execution, and the mask is updated on the odd-numbered beats.
+ * So if PSR.ECI says we should skip beat 1, we mustn't update the
+ * 01 mask field.
+ */
+ TCGv_i32 vpr = load_cpu_field(v7m.vpr);
+ switch (s->eci) {
+ case ECI_NONE:
+ case ECI_A0:
+ /* Update both 01 and 23 fields */
+ tcg_gen_deposit_i32(vpr, vpr,
+ tcg_constant_i32(mask | (mask << 4)),
+ R_V7M_VPR_MASK01_SHIFT,
+ R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH);
+ break;
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ /* Update only the 23 mask field */
+ tcg_gen_deposit_i32(vpr, vpr,
+ tcg_constant_i32(mask),
+ R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ store_cpu_field(vpr, v7m.vpr);
+}
+
+static bool trans_VPST(DisasContext *s, arg_VPST *a)
+{
+ /* mask == 0 is a "related encoding" */
+ if (!dc_isar_feature(aa32_mve, s) || !a->mask) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+ gen_vpst(s, a->mask);
+ mve_update_and_store_eci(s);
+ return true;
+}
+
+static bool trans_VPNOT(DisasContext *s, arg_VPNOT *a)
+{
+ /*
+ * Invert the predicate in VPR.P0. We have to call out to
+ * a helper because this insn itself is beatwise and can
+ * be predicated.
+ */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ gen_helper_mve_vpnot(cpu_env);
+ /* This insn updates predication bits */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
+{
+ /* VADDV: vector add across vector */
+ static MVEGenVADDVFn * const fns[4][2] = {
+ { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub },
+ { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh },
+ { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw },
+ { NULL, NULL }
+ };
+ TCGv_ptr qm;
+ TCGv_i32 rda;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ a->size == 3) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current value of Rda, not zero.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ /* Accumulate input from Rda */
+ rda = load_reg(s, a->rda);
+ } else {
+ /* Accumulate starting at zero */
+ rda = tcg_const_i32(0);
+ }
+
+ qm = mve_qreg_ptr(a->qm);
+ fns[a->size][a->u](rda, cpu_env, qm, rda);
+ store_reg(s, a->rda, rda);
+ tcg_temp_free_ptr(qm);
+
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a)
+{
+ /*
+ * Vector Add Long Across Vector: accumulate the 32-bit
+ * elements of the vector into a 64-bit result stored in
+ * a pair of general-purpose registers.
+ * No need to check Qm's bank: it is only 3 bits in decode.
+ */
+ TCGv_ptr qm;
+ TCGv_i64 rda;
+ TCGv_i32 rdalo, rdahi;
+
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+ /*
+ * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+ * encoding; rdalo always has bit 0 clear so cannot be 13 or 15.
+ */
+ if (a->rdahi == 13 || a->rdahi == 15) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current value of RdaHi:RdaLo, not zero.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ /* Accumulate input from RdaHi:RdaLo */
+ rda = tcg_temp_new_i64();
+ rdalo = load_reg(s, a->rdalo);
+ rdahi = load_reg(s, a->rdahi);
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+ tcg_temp_free_i32(rdalo);
+ tcg_temp_free_i32(rdahi);
+ } else {
+ /* Accumulate starting at zero */
+ rda = tcg_const_i64(0);
+ }
+
+ qm = mve_qreg_ptr(a->qm);
+ if (a->u) {
+ gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda);
+ } else {
+ gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda);
+ }
+ tcg_temp_free_ptr(qm);
+
+ rdalo = tcg_temp_new_i32();
+ rdahi = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(rdalo, rda);
+ tcg_gen_extrh_i64_i32(rdahi, rda);
+ store_reg(s, a->rdalo, rdalo);
+ store_reg(s, a->rdahi, rdahi);
+ tcg_temp_free_i64(rda);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn,
+ GVecGen2iFn *vecfn)
+{
+ TCGv_ptr qd;
+ uint64_t imm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ imm = asimd_imm_const(a->imm, a->cmode, a->op);
+
+ if (vecfn && mve_no_predication(s)) {
+ vecfn(MO_64, mve_qreg_offset(a->qd), mve_qreg_offset(a->qd),
+ imm, 16, 16);
+ } else {
+ qd = mve_qreg_ptr(a->qd);
+ fn(cpu_env, qd, tcg_constant_i64(imm));
+ tcg_temp_free_ptr(qd);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static void gen_gvec_vmovi(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, c);
+}
+
+static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a)
+{
+ /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
+ MVEGenOneOpImmFn *fn;
+ GVecGen2iFn *vecfn;
+
+ if ((a->cmode & 1) && a->cmode < 12) {
+ if (a->op) {
+ /*
+ * For op=1, the immediate will be inverted by asimd_imm_const(),
+ * so the VBIC becomes a logical AND operation.
+ */
+ fn = gen_helper_mve_vandi;
+ vecfn = tcg_gen_gvec_andi;
+ } else {
+ fn = gen_helper_mve_vorri;
+ vecfn = tcg_gen_gvec_ori;
+ }
+ } else {
+ /* There is one unallocated cmode/op combination in this space */
+ if (a->cmode == 15 && a->op == 1) {
+ return false;
+ }
+ /* asimd_imm_const() sorts out VMVNI vs VMOVI for us */
+ fn = gen_helper_mve_vmovi;
+ vecfn = gen_gvec_vmovi;
+ }
+ return do_1imm(s, a, fn, vecfn);
+}
+
+static bool do_2shift_vec(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn,
+ bool negateshift, GVecGen2iFn vecfn)
+{
+ TCGv_ptr qd, qm;
+ int shift = a->shift;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * When we handle a right shift insn using a left-shift helper
+ * which permits a negative shift count to indicate a right-shift,
+ * we must negate the shift count.
+ */
+ if (negateshift) {
+ shift = -shift;
+ }
+
+ if (vecfn && mve_no_predication(s)) {
+ vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm),
+ shift, 16, 16);
+ } else {
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm, tcg_constant_i32(shift));
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_2shift(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn,
+ bool negateshift)
+{
+ return do_2shift_vec(s, a, fn, negateshift, NULL);
+}
+
+#define DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, VECFN) \
+ static bool trans_##INSN(DisasContext *s, arg_2shift *a) \
+ { \
+ static MVEGenTwoOpShiftFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2shift_vec(s, a, fns[a->size], NEGATESHIFT, VECFN); \
+ }
+
+#define DO_2SHIFT(INSN, FN, NEGATESHIFT) \
+ DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, NULL)
+
+static void do_gvec_shri_s(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ /*
+ * We get here with a negated shift count, and we must handle
+ * shifts by the element size, which tcg_gen_gvec_sari() does not do.
+ */
+ shift = -shift;
+ if (shift == (8 << vece)) {
+ shift--;
+ }
+ tcg_gen_gvec_sari(vece, dofs, aofs, shift, oprsz, maxsz);
+}
+
+static void do_gvec_shri_u(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ /*
+ * We get here with a negated shift count, and we must handle
+ * shifts by the element size, which tcg_gen_gvec_shri() does not do.
+ */
+ shift = -shift;
+ if (shift == (8 << vece)) {
+ tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, 0);
+ } else {
+ tcg_gen_gvec_shri(vece, dofs, aofs, shift, oprsz, maxsz);
+ }
+}
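
[Editorial sketch, not part of the patch: the scalar effect of do_gvec_shri_u
above for a single byte element, showing why a shift count equal to the element
size takes the special case (per the comment, tcg_gen_gvec_shri() does not
handle it). The function name is invented for illustration.]

#include <stdint.h>

static inline uint8_t ushr8_ref(uint8_t x, unsigned shift)
{
    /* MVE unsigned immediate right shifts allow counts 1..8 for byte elements */
    return shift >= 8 ? 0 : (uint8_t)(x >> shift);
}
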
+
+DO_2SHIFT_VEC(VSHLI, vshli_u, false, tcg_gen_gvec_shli)
+DO_2SHIFT(VQSHLI_S, vqshli_s, false)
+DO_2SHIFT(VQSHLI_U, vqshli_u, false)
+DO_2SHIFT(VQSHLUI, vqshlui_s, false)
+/* These right shifts use a left-shift helper with negated shift count */
+DO_2SHIFT_VEC(VSHRI_S, vshli_s, true, do_gvec_shri_s)
+DO_2SHIFT_VEC(VSHRI_U, vshli_u, true, do_gvec_shri_u)
+DO_2SHIFT(VRSHRI_S, vrshli_s, true)
+DO_2SHIFT(VRSHRI_U, vrshli_u, true)
+
+DO_2SHIFT_VEC(VSRI, vsri, false, gen_gvec_sri)
+DO_2SHIFT_VEC(VSLI, vsli, false, gen_gvec_sli)
+
+#define DO_2SHIFT_FP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2shift *a) \
+ { \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_2shift(s, a, gen_helper_mve_##FN, false); \
+ }
+
+DO_2SHIFT_FP(VCVT_SH_fixed, vcvt_sh)
+DO_2SHIFT_FP(VCVT_UH_fixed, vcvt_uh)
+DO_2SHIFT_FP(VCVT_HS_fixed, vcvt_hs)
+DO_2SHIFT_FP(VCVT_HU_fixed, vcvt_hu)
+DO_2SHIFT_FP(VCVT_SF_fixed, vcvt_sf)
+DO_2SHIFT_FP(VCVT_UF_fixed, vcvt_uf)
+DO_2SHIFT_FP(VCVT_FS_fixed, vcvt_fs)
+DO_2SHIFT_FP(VCVT_FU_fixed, vcvt_fu)
+
+static bool do_2shift_scalar(DisasContext *s, arg_shl_scalar *a,
+ MVEGenTwoOpShiftFn *fn)
+{
+ TCGv_ptr qda;
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qda) ||
+ a->rm == 13 || a->rm == 15 || !fn) {
+ /* Rm cases are UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qda = mve_qreg_ptr(a->qda);
+ rm = load_reg(s, a->rm);
+ fn(cpu_env, qda, qda, rm);
+ tcg_temp_free_ptr(qda);
+ tcg_temp_free_i32(rm);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_2SHIFT_SCALAR(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_shl_scalar *a) \
+ { \
+ static MVEGenTwoOpShiftFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2shift_scalar(s, a, fns[a->size]); \
+ }
+
+DO_2SHIFT_SCALAR(VSHL_S_scalar, vshli_s)
+DO_2SHIFT_SCALAR(VSHL_U_scalar, vshli_u)
+DO_2SHIFT_SCALAR(VRSHL_S_scalar, vrshli_s)
+DO_2SHIFT_SCALAR(VRSHL_U_scalar, vrshli_u)
+DO_2SHIFT_SCALAR(VQSHL_S_scalar, vqshli_s)
+DO_2SHIFT_SCALAR(VQSHL_U_scalar, vqshli_u)
+DO_2SHIFT_SCALAR(VQRSHL_S_scalar, vqrshli_s)
+DO_2SHIFT_SCALAR(VQRSHL_U_scalar, vqrshli_u)
+
+#define DO_VSHLL(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2shift *a) \
+ { \
+ static MVEGenTwoOpShiftFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ }; \
+ return do_2shift_vec(s, a, fns[a->size], false, do_gvec_##FN); \
+ }
+
+/*
+ * For the VSHLL vector helpers, the vece is the size of the input
+ * (ie MO_8 or MO_16); the helpers want to work in the output size.
+ * The shift count can be 0..<input size>, inclusive. (0 is VMOVL.)
+ */
+static void do_gvec_vshllbs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ unsigned ovece = vece + 1;
+ unsigned ibits = vece == MO_8 ? 8 : 16;
+ tcg_gen_gvec_shli(ovece, dofs, aofs, ibits, oprsz, maxsz);
+ tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz);
+}
+
+static void do_gvec_vshllbu(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ unsigned ovece = vece + 1;
+ tcg_gen_gvec_andi(ovece, dofs, aofs,
+ ovece == MO_16 ? 0xff : 0xffff, oprsz, maxsz);
+ tcg_gen_gvec_shli(ovece, dofs, dofs, shift, oprsz, maxsz);
+}
+
+static void do_gvec_vshllts(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ unsigned ovece = vece + 1;
+ unsigned ibits = vece == MO_8 ? 8 : 16;
+ if (shift == 0) {
+ tcg_gen_gvec_sari(ovece, dofs, aofs, ibits, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_andi(ovece, dofs, aofs,
+ ovece == MO_16 ? 0xff00 : 0xffff0000, oprsz, maxsz);
+ tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz);
+ }
+}
+
+static void do_gvec_vshlltu(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ unsigned ovece = vece + 1;
+ unsigned ibits = vece == MO_8 ? 8 : 16;
+ if (shift == 0) {
+ tcg_gen_gvec_shri(ovece, dofs, aofs, ibits, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_andi(ovece, dofs, aofs,
+ ovece == MO_16 ? 0xff00 : 0xffff0000, oprsz, maxsz);
+ tcg_gen_gvec_shri(ovece, dofs, dofs, ibits - shift, oprsz, maxsz);
+ }
+}
+
+DO_VSHLL(VSHLL_BS, vshllbs)
+DO_VSHLL(VSHLL_BU, vshllbu)
+DO_VSHLL(VSHLL_TS, vshllts)
+DO_VSHLL(VSHLL_TU, vshlltu)
+
+#define DO_2SHIFT_N(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2shift *a) \
+ { \
+ static MVEGenTwoOpShiftFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ }; \
+ return do_2shift(s, a, fns[a->size], false); \
+ }
+
+DO_2SHIFT_N(VSHRNB, vshrnb)
+DO_2SHIFT_N(VSHRNT, vshrnt)
+DO_2SHIFT_N(VRSHRNB, vrshrnb)
+DO_2SHIFT_N(VRSHRNT, vrshrnt)
+DO_2SHIFT_N(VQSHRNB_S, vqshrnb_s)
+DO_2SHIFT_N(VQSHRNT_S, vqshrnt_s)
+DO_2SHIFT_N(VQSHRNB_U, vqshrnb_u)
+DO_2SHIFT_N(VQSHRNT_U, vqshrnt_u)
+DO_2SHIFT_N(VQSHRUNB, vqshrunb)
+DO_2SHIFT_N(VQSHRUNT, vqshrunt)
+DO_2SHIFT_N(VQRSHRNB_S, vqrshrnb_s)
+DO_2SHIFT_N(VQRSHRNT_S, vqrshrnt_s)
+DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u)
+DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u)
+DO_2SHIFT_N(VQRSHRUNB, vqrshrunb)
+DO_2SHIFT_N(VQRSHRUNT, vqrshrunt)
+
+static bool trans_VSHLC(DisasContext *s, arg_VSHLC *a)
+{
+ /*
+ * Whole Vector Left Shift with Carry. The carry is taken
+ * from a general purpose register and written back there.
+ * An imm of 0 means "shift by 32".
+ */
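+ /*
+ * For example (ignoring predication), VSHLC Qd, Rdm, #8 shifts the
+ * whole 128-bit Qd left by 8 bits, filling the bottom 8 bits from
+ * the low bits of Rdm and returning the 8 bits shifted out of the
+ * top in Rdm.
+ */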
+ TCGv_ptr qd;
+ TCGv_i32 rdm;
+
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+ return false;
+ }
+ if (a->rdm == 13 || a->rdm == 15) {
+ /* CONSTRAINED UNPREDICTABLE: we UNDEF */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ rdm = load_reg(s, a->rdm);
+ gen_helper_mve_vshlc(rdm, cpu_env, qd, rdm, tcg_constant_i32(a->imm));
+ store_reg(s, a->rdm, rdm);
+ tcg_temp_free_ptr(qd);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_vidup(DisasContext *s, arg_vidup *a, MVEGenVIDUPFn *fn)
+{
+ TCGv_ptr qd;
+ TCGv_i32 rn;
+
+ /*
+ * Vector increment/decrement with wrap and duplicate (VIDUP, VDDUP).
+ * This fills the vector with elements of successively increasing
+ * or decreasing values, starting from Rn.
+ */
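+ /*
+ * For example, with all lanes active, VIDUP.32 Qd, Rn, #4 with
+ * Rn == 10 fills the four word lanes with 10, 14, 18, 22 and writes
+ * 26 back to Rn.
+ */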
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+ return false;
+ }
+ if (a->size == MO_64) {
+ /* size 0b11 is another encoding */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ rn = load_reg(s, a->rn);
+ fn(rn, cpu_env, qd, rn, tcg_constant_i32(a->imm));
+ store_reg(s, a->rn, rn);
+ tcg_temp_free_ptr(qd);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_viwdup(DisasContext *s, arg_viwdup *a, MVEGenVIWDUPFn *fn)
+{
+ TCGv_ptr qd;
+ TCGv_i32 rn, rm;
+
+ /*
+ * Vector increment/decrement with wrap and duplicate (VIWDUP, VDWDUP).
+ * This fills the vector with elements of successively increasing
+ * or decreasing values, starting from Rn. Rm specifies a point where
+ * the count wraps back around to 0. The updated offset is written back
+ * to Rn.
+ */
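+ /*
+ * For example, with all lanes active, VIWDUP.16 Qd, Rn, Rm, #1 with
+ * Rn == 0 and Rm == 4 fills the eight halfword lanes with
+ * 0, 1, 2, 3, 0, 1, 2, 3, wrapping back to 0 whenever the offset
+ * reaches Rm.
+ */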
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+ return false;
+ }
+ if (!fn || a->rm == 13 || a->rm == 15) {
+ /*
+ * size 0b11 is another encoding; Rm == 13 is UNPREDICTABLE;
+ * Rm == 15 is VIDUP, VDDUP.
+ */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ rn = load_reg(s, a->rn);
+ rm = load_reg(s, a->rm);
+ fn(rn, cpu_env, qd, rn, rm, tcg_constant_i32(a->imm));
+ store_reg(s, a->rn, rn);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_i32(rm);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VIDUP(DisasContext *s, arg_vidup *a)
+{
+ static MVEGenVIDUPFn * const fns[] = {
+ gen_helper_mve_vidupb,
+ gen_helper_mve_viduph,
+ gen_helper_mve_vidupw,
+ NULL,
+ };
+ return do_vidup(s, a, fns[a->size]);
+}
+
+static bool trans_VDDUP(DisasContext *s, arg_vidup *a)
+{
+ static MVEGenVIDUPFn * const fns[] = {
+ gen_helper_mve_vidupb,
+ gen_helper_mve_viduph,
+ gen_helper_mve_vidupw,
+ NULL,
+ };
+ /* VDDUP is just like VIDUP but with a negative immediate */
+ a->imm = -a->imm;
+ return do_vidup(s, a, fns[a->size]);
+}
+
+static bool trans_VIWDUP(DisasContext *s, arg_viwdup *a)
+{
+ static MVEGenVIWDUPFn * const fns[] = {
+ gen_helper_mve_viwdupb,
+ gen_helper_mve_viwduph,
+ gen_helper_mve_viwdupw,
+ NULL,
+ };
+ return do_viwdup(s, a, fns[a->size]);
+}
+
+static bool trans_VDWDUP(DisasContext *s, arg_viwdup *a)
+{
+ static MVEGenVIWDUPFn * const fns[] = {
+ gen_helper_mve_vdwdupb,
+ gen_helper_mve_vdwduph,
+ gen_helper_mve_vdwdupw,
+ NULL,
+ };
+ return do_viwdup(s, a, fns[a->size]);
+}
+
+static bool do_vcmp(DisasContext *s, arg_vcmp *a, MVEGenCmpFn *fn)
+{
+ TCGv_ptr qn, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qn, qm);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+ if (a->mask) {
+ /* VPT */
+ gen_vpst(s, a->mask);
+ }
+ /* This insn updates predication bits */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_vcmp_scalar(DisasContext *s, arg_vcmp_scalar *a,
+ MVEGenScalarCmpFn *fn)
+{
+ TCGv_ptr qn;
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_mve, s) || !fn || a->rm == 13) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qn = mve_qreg_ptr(a->qn);
+ if (a->rm == 15) {
+ /* Encoding Rm=0b1111 means "constant zero" */
+ rm = tcg_constant_i32(0);
+ } else {
+ rm = load_reg(s, a->rm);
+ }
+ fn(cpu_env, qn, rm);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_i32(rm);
+ if (a->mask) {
+ /* VPT */
+ gen_vpst(s, a->mask);
+ }
+ /* This insn updates predication bits */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_VCMP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \
+ { \
+ static MVEGenCmpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_vcmp(s, a, fns[a->size]); \
+ } \
+ static bool trans_##INSN##_scalar(DisasContext *s, \
+ arg_vcmp_scalar *a) \
+ { \
+ static MVEGenScalarCmpFn * const fns[] = { \
+ gen_helper_mve_##FN##_scalarb, \
+ gen_helper_mve_##FN##_scalarh, \
+ gen_helper_mve_##FN##_scalarw, \
+ NULL, \
+ }; \
+ return do_vcmp_scalar(s, a, fns[a->size]); \
+ }
+
+DO_VCMP(VCMPEQ, vcmpeq)
+DO_VCMP(VCMPNE, vcmpne)
+DO_VCMP(VCMPCS, vcmpcs)
+DO_VCMP(VCMPHI, vcmphi)
+DO_VCMP(VCMPGE, vcmpge)
+DO_VCMP(VCMPLT, vcmplt)
+DO_VCMP(VCMPGT, vcmpgt)
+DO_VCMP(VCMPLE, vcmple)
+
+#define DO_VCMP_FP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \
+ { \
+ static MVEGenCmpFn * const fns[] = { \
+ NULL, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_vcmp(s, a, fns[a->size]); \
+ } \
+ static bool trans_##INSN##_scalar(DisasContext *s, \
+ arg_vcmp_scalar *a) \
+ { \
+ static MVEGenScalarCmpFn * const fns[] = { \
+ NULL, \
+ gen_helper_mve_##FN##_scalarh, \
+ gen_helper_mve_##FN##_scalars, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_vcmp_scalar(s, a, fns[a->size]); \
+ }
+
+DO_VCMP_FP(VCMPEQ_fp, vfcmpeq)
+DO_VCMP_FP(VCMPNE_fp, vfcmpne)
+DO_VCMP_FP(VCMPGE_fp, vfcmpge)
+DO_VCMP_FP(VCMPLT_fp, vfcmplt)
+DO_VCMP_FP(VCMPGT_fp, vfcmpgt)
+DO_VCMP_FP(VCMPLE_fp, vfcmple)
+
+static bool do_vmaxv(DisasContext *s, arg_vmaxv *a, MVEGenVADDVFn fn)
+{
+ /*
+ * MIN/MAX operations across a vector: compute the min or
+ * max of the initial value in a general purpose register
+ * and all the elements in the vector, and store it back
+ * into the general purpose register.
+ */
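+ /*
+ * For example, with all lanes active, VMAXV.S8 Rda, Qm leaves in Rda
+ * the maximum of the initial Rda value and the sixteen signed byte
+ * elements of Qm.
+ */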
+ TCGv_ptr qm;
+ TCGv_i32 rda;
+
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) ||
+ !fn || a->rda == 13 || a->rda == 15) {
+ /* Rda cases are UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qm = mve_qreg_ptr(a->qm);
+ rda = load_reg(s, a->rda);
+ fn(rda, cpu_env, qm, rda);
+ store_reg(s, a->rda, rda);
+ tcg_temp_free_ptr(qm);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_VMAXV(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \
+ { \
+ static MVEGenVADDVFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_vmaxv(s, a, fns[a->size]); \
+ }
+
+DO_VMAXV(VMAXV_S, vmaxvs)
+DO_VMAXV(VMAXV_U, vmaxvu)
+DO_VMAXV(VMAXAV, vmaxav)
+DO_VMAXV(VMINV_S, vminvs)
+DO_VMAXV(VMINV_U, vminvu)
+DO_VMAXV(VMINAV, vminav)
+
+#define DO_VMAXV_FP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \
+ { \
+ static MVEGenVADDVFn * const fns[] = { \
+ NULL, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##s, \
+ NULL, \
+ }; \
+ if (!dc_isar_feature(aa32_mve_fp, s)) { \
+ return false; \
+ } \
+ return do_vmaxv(s, a, fns[a->size]); \
+ }
+
+DO_VMAXV_FP(VMAXNMV, vmaxnmv)
+DO_VMAXV_FP(VMINNMV, vminnmv)
+DO_VMAXV_FP(VMAXNMAV, vmaxnmav)
+DO_VMAXV_FP(VMINNMAV, vminnmav)
+
+static bool do_vabav(DisasContext *s, arg_vabav *a, MVEGenVABAVFn *fn)
+{
+ /* Absolute difference accumulated across vector */
+ TCGv_ptr qn, qm;
+ TCGv_i32 rda;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qm | a->qn) ||
+ !fn || a->rda == 13 || a->rda == 15) {
+ /* Rda cases are UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qm = mve_qreg_ptr(a->qm);
+ qn = mve_qreg_ptr(a->qn);
+ rda = load_reg(s, a->rda);
+ fn(rda, cpu_env, qn, qm, rda);
+ store_reg(s, a->rda, rda);
+ tcg_temp_free_ptr(qm);
+ tcg_temp_free_ptr(qn);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_VABAV(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_vabav *a) \
+ { \
+ static MVEGenVABAVFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_vabav(s, a, fns[a->size]); \
+ }
+
+DO_VABAV(VABAV_S, vabavs)
+DO_VABAV(VABAV_U, vabavu)
+
+static bool trans_VMOV_to_2gp(DisasContext *s, arg_VMOV_to_2gp *a)
+{
+ /*
+ * VMOV two 32-bit vector lanes to two general-purpose registers.
+ * This insn is not predicated but it is subject to beat-wise
+ * execution if it is not in an IT block. For us this means
+ * only that if PSR.ECI says we should not be executing the beat
+ * corresponding to the lane of the vector register being accessed
+ * then we should skip performing the move, and that we need to do
+ * the usual check for bad ECI state and advance of ECI state.
+ * (If PSR.ECI is non-zero then we cannot be in an IT block.)
+ */
+ TCGv_i32 tmp;
+ int vd;
+
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) ||
+ a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15 ||
+ a->rt == a->rt2) {
+ /* Rt/Rt2 cases are UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Convert Qreg index to Dreg for read_neon_element32() etc */
+ vd = a->qd * 2;
+
+ if (!mve_skip_vmov(s, vd, a->idx, MO_32)) {
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, vd, a->idx, MO_32);
+ store_reg(s, a->rt, tmp);
+ }
+ if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) {
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, vd + 1, a->idx, MO_32);
+ store_reg(s, a->rt2, tmp);
+ }
+
+ mve_update_and_store_eci(s);
+ return true;
+}
+
+static bool trans_VMOV_from_2gp(DisasContext *s, arg_VMOV_to_2gp *a)
+{
+ /*
+ * VMOV two general-purpose registers to two 32-bit vector lanes.
+ * This insn is not predicated but it is subject to beat-wise
+ * execution if it is not in an IT block. For us this means
+ * only that if PSR.ECI says we should not be executing the beat
+ * corresponding to the lane of the vector register being accessed
+ * then we should skip performing the move, and that we need to do
+ * the usual check for bad ECI state and advance of ECI state.
+ * (If PSR.ECI is non-zero then we cannot be in an IT block.)
+ */
+ TCGv_i32 tmp;
+ int vd;
+
+ if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) ||
+ a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15) {
+ /* Rt/Rt2 cases are UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Convert Qreg idx to Dreg for read_neon_element32() etc */
+ vd = a->qd * 2;
+
+ if (!mve_skip_vmov(s, vd, a->idx, MO_32)) {
+ tmp = load_reg(s, a->rt);
+ write_neon_element32(tmp, vd, a->idx, MO_32);
+ tcg_temp_free_i32(tmp);
+ }
+ if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) {
+ tmp = load_reg(s, a->rt2);
+ write_neon_element32(tmp, vd + 1, a->idx, MO_32);
+ tcg_temp_free_i32(tmp);
+ }
+
+ mve_update_and_store_eci(s);
+ return true;
+}
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
new file mode 100644
index 0000000..4016339
--- /dev/null
+++ b/target/arm/tcg/translate-neon.c
@@ -0,0 +1,4064 @@
+/*
+ * ARM translation: AArch32 Neon instructions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2005-2007 CodeSourcery
+ * Copyright (c) 2007 OpenedHand, Ltd.
+ * Copyright (c) 2020 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+/* Include the generated Neon decoder */
+#include "decode-neon-dp.c.inc"
+#include "decode-neon-ls.c.inc"
+#include "decode-neon-shared.c.inc"
+
+static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
+ return ret;
+}
+
+static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
+{
+ long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
+
+ switch (mop) {
+ case MO_UB:
+ tcg_gen_ld8u_i32(var, cpu_env, offset);
+ break;
+ case MO_UW:
+ tcg_gen_ld16u_i32(var, cpu_env, offset);
+ break;
+ case MO_UL:
+ tcg_gen_ld_i32(var, cpu_env, offset);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
+{
+ long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
+
+ switch (mop) {
+ case MO_UB:
+ tcg_gen_ld8u_i64(var, cpu_env, offset);
+ break;
+ case MO_UW:
+ tcg_gen_ld16u_i64(var, cpu_env, offset);
+ break;
+ case MO_UL:
+ tcg_gen_ld32u_i64(var, cpu_env, offset);
+ break;
+ case MO_UQ:
+ tcg_gen_ld_i64(var, cpu_env, offset);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
+{
+ long offset = neon_element_offset(reg, ele, size);
+
+ switch (size) {
+ case MO_8:
+ tcg_gen_st8_i32(var, cpu_env, offset);
+ break;
+ case MO_16:
+ tcg_gen_st16_i32(var, cpu_env, offset);
+ break;
+ case MO_32:
+ tcg_gen_st_i32(var, cpu_env, offset);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
+{
+ long offset = neon_element_offset(reg, ele, size);
+
+ switch (size) {
+ case MO_8:
+ tcg_gen_st8_i64(var, cpu_env, offset);
+ break;
+ case MO_16:
+ tcg_gen_st16_i64(var, cpu_env, offset);
+ break;
+ case MO_32:
+ tcg_gen_st32_i64(var, cpu_env, offset);
+ break;
+ case MO_64:
+ tcg_gen_st_i64(var, cpu_env, offset);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
+ int data, gen_helper_gvec_4 *fn_gvec)
+{
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
+ return false;
+ }
+
+ /*
+ * UNDEF accesses to odd registers for each bit of Q.
+ * Q will be 0b111 for all Q-reg instructions, and otherwise
+ * indicates which of Vd/Vn/Vm are Q registers when we have
+ * mixed Q- and D-reg inputs.
+ */
+ if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ int opr_sz = q ? 16 : 8;
+ tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
+ vfp_reg_offset(1, vn),
+ vfp_reg_offset(1, vm),
+ vfp_reg_offset(1, vd),
+ opr_sz, opr_sz, data, fn_gvec);
+ return true;
+}
+
+static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
+ int data, ARMFPStatusFlavour fp_flavour,
+ gen_helper_gvec_4_ptr *fn_gvec_ptr)
+{
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
+ return false;
+ }
+
+ /*
+ * UNDEF accesses to odd registers for each bit of Q.
+ * Q will be 0b111 for all Q-reg instructions, and otherwise
+ * indicates which of Vd/Vn/Vm are Q registers when we have
+ * mixed Q- and D-reg inputs.
+ */
+ if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ int opr_sz = q ? 16 : 8;
+ TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
+
+ tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
+ vfp_reg_offset(1, vn),
+ vfp_reg_offset(1, vm),
+ vfp_reg_offset(1, vd),
+ fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
+{
+ if (!dc_isar_feature(aa32_vcma, s)) {
+ return false;
+ }
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
+ FPST_STD_F16, gen_helper_gvec_fcmlah);
+ }
+ return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
+ FPST_STD, gen_helper_gvec_fcmlas);
+}
+
+static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
+{
+ int opr_sz;
+ TCGv_ptr fpst;
+ gen_helper_gvec_3_ptr *fn_gvec_ptr;
+
+ if (!dc_isar_feature(aa32_vcma, s)
+ || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vn | a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ opr_sz = (1 + a->q) * 8;
+ fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
+ fn_gvec_ptr = (a->size == MO_16) ?
+ gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+ vfp_reg_offset(1, a->vn),
+ vfp_reg_offset(1, a->vm),
+ fpst, opr_sz, opr_sz, a->rot,
+ fn_gvec_ptr);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
+{
+ if (!dc_isar_feature(aa32_dp, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_sdot_b);
+}
+
+static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
+{
+ if (!dc_isar_feature(aa32_dp, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_udot_b);
+}
+
+static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_usdot_b);
+}
+
+static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_bfdot);
+}
+
+static bool trans_VFML(DisasContext *s, arg_VFML *a)
+{
+ int opr_sz;
+
+ if (!dc_isar_feature(aa32_fhm, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ opr_sz = (1 + a->q) * 8;
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+ vfp_reg_offset(a->q, a->vn),
+ vfp_reg_offset(a->q, a->vm),
+ cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
+ gen_helper_gvec_fmlal_a32);
+ return true;
+}
+
+static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
+{
+ int data = (a->index << 2) | a->rot;
+
+ if (!dc_isar_feature(aa32_vcma, s)) {
+ return false;
+ }
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
+ FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
+ }
+ return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
+ FPST_STD, gen_helper_gvec_fcmlas_idx);
+}
+
+static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
+{
+ if (!dc_isar_feature(aa32_dp, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_sdot_idx_b);
+}
+
+static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
+{
+ if (!dc_isar_feature(aa32_dp, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_udot_idx_b);
+}
+
+static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_usdot_idx_b);
+}
+
+static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_sudot_idx_b);
+}
+
+static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_bfdot_idx);
+}
+
+static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
+{
+ int opr_sz;
+
+ if (!dc_isar_feature(aa32_fhm, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
+ return false;
+ }
+
+ if (a->vd & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ opr_sz = (1 + a->q) * 8;
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+ vfp_reg_offset(a->q, a->vn),
+ vfp_reg_offset(a->q, a->rm),
+ cpu_env, opr_sz, opr_sz,
+ (a->index << 2) | a->s, /* is_2 == 0 */
+ gen_helper_gvec_fmlal_idx_a32);
+ return true;
+}
+
+static struct {
+ int nregs;
+ int interleave;
+ int spacing;
+} const neon_ls_element_type[11] = {
+ {1, 4, 1},
+ {1, 4, 2},
+ {4, 1, 1},
+ {2, 2, 2},
+ {1, 3, 1},
+ {1, 3, 2},
+ {3, 1, 1},
+ {1, 1, 1},
+ {1, 2, 1},
+ {1, 2, 2},
+ {2, 1, 1}
+};
+
+static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
+ int stride)
+{
+ if (rm != 15) {
+ TCGv_i32 base;
+
+ base = load_reg(s, rn);
+ if (rm == 13) {
+ tcg_gen_addi_i32(base, base, stride);
+ } else {
+ TCGv_i32 index;
+ index = load_reg(s, rm);
+ tcg_gen_add_i32(base, base, index);
+ tcg_temp_free_i32(index);
+ }
+ store_reg(s, rn, base);
+ }
+}
+
+static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
+{
+ /* Neon load/store multiple structures */
+ int nregs, interleave, spacing, reg, n;
+ MemOp mop, align, endian;
+ int mmu_idx = get_mem_index(s);
+ int size = a->size;
+ TCGv_i64 tmp64;
+ TCGv_i32 addr;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+ if (a->itype > 10) {
+ return false;
+ }
+ /* Catch UNDEF cases for bad values of align field */
+ switch (a->itype & 0xc) {
+ case 4:
+ if (a->align >= 2) {
+ return false;
+ }
+ break;
+ case 8:
+ if (a->align == 3) {
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+ nregs = neon_ls_element_type[a->itype].nregs;
+ interleave = neon_ls_element_type[a->itype].interleave;
+ spacing = neon_ls_element_type[a->itype].spacing;
+ if (size == 3 && (interleave | spacing) != 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* For our purposes, bytes are always little-endian. */
+ endian = s->be_data;
+ if (size == 0) {
+ endian = MO_LE;
+ }
+
+ /* Enforce alignment requested by the instruction */
+ if (a->align) {
+ align = pow2_align(a->align + 2); /* 4 << a->align bytes */
+ } else {
+ align = s->align_mem ? MO_ALIGN : 0;
+ }
+
+ /*
+ * Consecutive little-endian elements from a single register
+ * can be promoted to a larger little-endian operation.
+ */
+ if (interleave == 1 && endian == MO_LE) {
+ /* Retain any natural alignment. */
+ if (align == MO_ALIGN) {
+ align = pow2_align(size);
+ }
+ size = 3;
+ }
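+ /*
+ * For example, a VLD1.8 of a single D register (interleave 1) is then
+ * performed as one 64-bit little-endian load rather than eight
+ * byte loads.
+ */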
+
+ tmp64 = tcg_temp_new_i64();
+ addr = tcg_temp_new_i32();
+ load_reg_var(s, addr, a->rn);
+
+ mop = endian | size | align;
+ for (reg = 0; reg < nregs; reg++) {
+ for (n = 0; n < 8 >> size; n++) {
+ int xs;
+ for (xs = 0; xs < interleave; xs++) {
+ int tt = a->vd + reg + spacing * xs;
+
+ if (a->l) {
+ gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
+ neon_store_element64(tt, n, size, tmp64);
+ } else {
+ neon_load_element64(tmp64, tt, n, size);
+ gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
+ }
+ tcg_gen_addi_i32(addr, addr, 1 << size);
+
+ /* Subsequent memory operations inherit alignment */
+ mop &= ~MO_AMASK;
+ }
+ }
+ }
+ tcg_temp_free_i32(addr);
+ tcg_temp_free_i64(tmp64);
+
+ gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
+ return true;
+}
+
+static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
+{
+ /* Neon load single structure to all lanes */
+ int reg, stride, vec_size;
+ int vd = a->vd;
+ int size = a->size;
+ int nregs = a->n + 1;
+ TCGv_i32 addr, tmp;
+ MemOp mop, align;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ align = 0;
+ if (size == 3) {
+ if (nregs != 4 || a->a == 0) {
+ return false;
+ }
+ /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
+ size = MO_32;
+ align = MO_ALIGN_16;
+ } else if (a->a) {
+ switch (nregs) {
+ case 1:
+ if (size == 0) {
+ return false;
+ }
+ align = MO_ALIGN;
+ break;
+ case 2:
+ align = pow2_align(size + 1);
+ break;
+ case 3:
+ return false;
+ case 4:
+ if (size == 2) {
+ align = pow2_align(3);
+ } else {
+ align = pow2_align(size + 2);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * VLD1 to all lanes: T bit indicates how many Dregs to write.
+ * VLD2/3/4 to all lanes: T bit indicates register stride.
+ */
+ stride = a->t ? 2 : 1;
+ vec_size = nregs == 1 ? stride * 8 : 8;
+ mop = size | align;
+ tmp = tcg_temp_new_i32();
+ addr = tcg_temp_new_i32();
+ load_reg_var(s, addr, a->rn);
+ for (reg = 0; reg < nregs; reg++) {
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
+ if ((vd & 1) && vec_size == 16) {
+ /*
+ * We cannot write 16 bytes at once because the
+ * destination is unaligned.
+ */
+ tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
+ 8, 8, tmp);
+ tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
+ neon_full_reg_offset(vd), 8, 8);
+ } else {
+ tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
+ vec_size, vec_size, tmp);
+ }
+ tcg_gen_addi_i32(addr, addr, 1 << size);
+ vd += stride;
+
+ /* Subsequent memory operations inherit alignment */
+ mop &= ~MO_AMASK;
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(addr);
+
+ gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
+
+ return true;
+}
+
+static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
+{
+ /* Neon load/store single structure to one lane */
+ int reg;
+ int nregs = a->n + 1;
+ int vd = a->vd;
+ TCGv_i32 addr, tmp;
+ MemOp mop;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ /* Catch the UNDEF cases. This is unavoidably a bit messy. */
+ switch (nregs) {
+ case 1:
+ if (a->stride != 1) {
+ return false;
+ }
+ if (((a->align & (1 << a->size)) != 0) ||
+ (a->size == 2 && (a->align == 1 || a->align == 2))) {
+ return false;
+ }
+ break;
+ case 2:
+ if (a->size == 2 && (a->align & 2) != 0) {
+ return false;
+ }
+ break;
+ case 3:
+ if (a->align != 0) {
+ return false;
+ }
+ break;
+ case 4:
+ if (a->size == 2 && a->align == 3) {
+ return false;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ if ((vd + a->stride * (nregs - 1)) > 31) {
+ /*
+ * Attempts to write off the end of the register file are
+ * UNPREDICTABLE; we choose to UNDEF because otherwise we would
+ * access off the end of the array that holds the register data.
+ */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Pick up SCTLR settings */
+ mop = finalize_memop(s, a->size);
+
+ if (a->align) {
+ MemOp align_op;
+
+ switch (nregs) {
+ case 1:
+ /* For VLD1, use natural alignment. */
+ align_op = MO_ALIGN;
+ break;
+ case 2:
+ /* For VLD2, use double alignment. */
+ align_op = pow2_align(a->size + 1);
+ break;
+ case 4:
+ if (a->size == MO_32) {
+ /*
+ * For VLD4.32, align = 1 is double alignment, align = 2 is
+ * quad alignment; align = 3 is rejected above.
+ */
+ align_op = pow2_align(a->size + a->align);
+ } else {
+ /* For VLD4.8 and VLD4.16, we want quad alignment. */
+ align_op = pow2_align(a->size + 2);
+ }
+ break;
+ default:
+ /* For VLD3, the alignment field is zero and rejected above. */
+ g_assert_not_reached();
+ }
+
+ mop = (mop & ~MO_AMASK) | align_op;
+ }
+
+ tmp = tcg_temp_new_i32();
+ addr = tcg_temp_new_i32();
+ load_reg_var(s, addr, a->rn);
+
+ for (reg = 0; reg < nregs; reg++) {
+ if (a->l) {
+ gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
+ neon_store_element(vd, a->reg_idx, a->size, tmp);
+ } else { /* Store */
+ neon_load_element(tmp, vd, a->reg_idx, a->size);
+ gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
+ }
+ vd += a->stride;
+ tcg_gen_addi_i32(addr, addr, 1 << a->size);
+
+ /* Subsequent memory operations inherit alignment */
+ mop &= ~MO_AMASK;
+ }
+ tcg_temp_free_i32(addr);
+ tcg_temp_free_i32(tmp);
+
+ gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
+
+ return true;
+}
+
+static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
+{
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_full_reg_offset(a->vd);
+ int rn_ofs = neon_full_reg_offset(a->vn);
+ int rm_ofs = neon_full_reg_offset(a->vm);
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vn | a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
+ return true;
+}
+
+#define DO_3SAME(INSN, FUNC) \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ return do_3same(s, a, FUNC); \
+ }
+
+DO_3SAME(VADD, tcg_gen_gvec_add)
+DO_3SAME(VSUB, tcg_gen_gvec_sub)
+DO_3SAME(VAND, tcg_gen_gvec_and)
+DO_3SAME(VBIC, tcg_gen_gvec_andc)
+DO_3SAME(VORR, tcg_gen_gvec_or)
+DO_3SAME(VORN, tcg_gen_gvec_orc)
+DO_3SAME(VEOR, tcg_gen_gvec_xor)
+DO_3SAME(VSHL_S, gen_gvec_sshl)
+DO_3SAME(VSHL_U, gen_gvec_ushl)
+DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
+DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
+DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
+DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
+
+/* These insns are all gvec_bitsel but with the inputs in various orders. */
+#define DO_3SAME_BITSEL(INSN, O1, O2, O3) \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \
+ } \
+ DO_3SAME(INSN, gen_##INSN##_3s)
+
+DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
+DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
+DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
+
+#define DO_3SAME_NO_SZ_3(INSN, FUNC) \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size == 3) { \
+ return false; \
+ } \
+ return do_3same(s, a, FUNC); \
+ }
+
+DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
+DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
+DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
+DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
+DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
+DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
+DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
+DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
+DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
+DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
+DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
+DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
+
+#define DO_3SAME_CMP(INSN, COND) \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
+ } \
+ DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
+
+DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
+DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
+DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
+DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
+DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
+
+#define WRAP_OOL_FN(WRAPNAME, FUNC) \
+ static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \
+ uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \
+ { \
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
+ }
+
+WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
+
+static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
+{
+ if (a->size != 0) {
+ return false;
+ }
+ return do_3same(s, a, gen_VMUL_p_3s);
+}
+
+#define DO_VQRDMLAH(INSN, FUNC) \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (!dc_isar_feature(aa32_rdm, s)) { \
+ return false; \
+ } \
+ if (a->size != 1 && a->size != 2) { \
+ return false; \
+ } \
+ return do_3same(s, a, FUNC); \
+ }
+
+DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
+DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
+
+#define DO_SHA1(NAME, FUNC) \
+ WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
+ static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (!dc_isar_feature(aa32_sha1, s)) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##NAME##_3s); \
+ }
+
+DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
+DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
+DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
+DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
+
+#define DO_SHA2(NAME, FUNC) \
+ WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
+ static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (!dc_isar_feature(aa32_sha2, s)) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##NAME##_3s); \
+ }
+
+DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
+DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
+DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
+
+#define DO_3SAME_64(INSN, FUNC) \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static const GVecGen3 op = { .fni8 = FUNC }; \
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \
+ } \
+ DO_3SAME(INSN, gen_##INSN##_3s)
+
+#define DO_3SAME_64_ENV(INSN, FUNC) \
+ static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \
+ { \
+ FUNC(d, cpu_env, n, m); \
+ } \
+ DO_3SAME_64(INSN, gen_##INSN##_elt)
+
+DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
+DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
+DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
+DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
+DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
+DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
+
+#define DO_3SAME_32(INSN, FUNC) \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static const GVecGen3 ops[4] = { \
+ { .fni4 = gen_helper_neon_##FUNC##8 }, \
+ { .fni4 = gen_helper_neon_##FUNC##16 }, \
+ { .fni4 = gen_helper_neon_##FUNC##32 }, \
+ { 0 }, \
+ }; \
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
+ } \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size > 2) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##INSN##_3s); \
+ }
+
+/*
+ * Some helper functions need to be passed the cpu_env. In order
+ * to use those with the gvec APIs like tcg_gen_gvec_3() we need
+ * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
+ * and which call a NeonGenTwoOpEnvFn().
+ */
+#define WRAP_ENV_FN(WRAPNAME, FUNC) \
+ static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \
+ { \
+ FUNC(d, cpu_env, n, m); \
+ }
+
+#define DO_3SAME_32_ENV(INSN, FUNC) \
+ WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \
+ WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \
+ WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static const GVecGen3 ops[4] = { \
+ { .fni4 = gen_##INSN##_tramp8 }, \
+ { .fni4 = gen_##INSN##_tramp16 }, \
+ { .fni4 = gen_##INSN##_tramp32 }, \
+ { 0 }, \
+ }; \
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
+ } \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size > 2) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##INSN##_3s); \
+ }
+
+DO_3SAME_32(VHADD_S, hadd_s)
+DO_3SAME_32(VHADD_U, hadd_u)
+DO_3SAME_32(VHSUB_S, hsub_s)
+DO_3SAME_32(VHSUB_U, hsub_u)
+DO_3SAME_32(VRHADD_S, rhadd_s)
+DO_3SAME_32(VRHADD_U, rhadd_u)
+DO_3SAME_32(VRSHL_S, rshl_s)
+DO_3SAME_32(VRSHL_U, rshl_u)
+
+DO_3SAME_32_ENV(VQSHL_S, qshl_s)
+DO_3SAME_32_ENV(VQSHL_U, qshl_u)
+DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
+DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
+
+static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
+{
+ /* Operations handled pairwise 32 bits at a time */
+ TCGv_i32 tmp, tmp2, tmp3;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->size == 3) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ assert(a->q == 0); /* enforced by decode patterns */
+
+ /*
+ * Note that we have to be careful not to clobber the source operands
+ * in the "vm == vd" case by storing the result of the first pass too
+ * early. Since Q is 0 there are always just two passes, so instead
+ * of a complicated loop over each pass we just unroll.
+ */
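+ /*
+ * For example, VPADD.I32 Dd, Dn, Dm computes
+ * Dd[0] = Dn[0] + Dn[1] and Dd[1] = Dm[0] + Dm[1].
+ */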
+ tmp = tcg_temp_new_i32();
+ tmp2 = tcg_temp_new_i32();
+ tmp3 = tcg_temp_new_i32();
+
+ read_neon_element32(tmp, a->vn, 0, MO_32);
+ read_neon_element32(tmp2, a->vn, 1, MO_32);
+ fn(tmp, tmp, tmp2);
+
+ read_neon_element32(tmp3, a->vm, 0, MO_32);
+ read_neon_element32(tmp2, a->vm, 1, MO_32);
+ fn(tmp3, tmp3, tmp2);
+
+ write_neon_element32(tmp, a->vd, 0, MO_32);
+ write_neon_element32(tmp3, a->vd, 1, MO_32);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp3);
+ return true;
+}
+
+#define DO_3SAME_PAIR(INSN, func) \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ static NeonGenTwoOpFn * const fns[] = { \
+ gen_helper_neon_##func##8, \
+ gen_helper_neon_##func##16, \
+ gen_helper_neon_##func##32, \
+ }; \
+ if (a->size > 2) { \
+ return false; \
+ } \
+ return do_3same_pair(s, a, fns[a->size]); \
+ }
+
+/* 32-bit pairwise ops end up the same as the elementwise versions. */
+#define gen_helper_neon_pmax_s32 tcg_gen_smax_i32
+#define gen_helper_neon_pmax_u32 tcg_gen_umax_i32
+#define gen_helper_neon_pmin_s32 tcg_gen_smin_i32
+#define gen_helper_neon_pmin_u32 tcg_gen_umin_i32
+#define gen_helper_neon_padd_u32 tcg_gen_add_i32
+
+DO_3SAME_PAIR(VPMAX_S, pmax_s)
+DO_3SAME_PAIR(VPMIN_S, pmin_s)
+DO_3SAME_PAIR(VPMAX_U, pmax_u)
+DO_3SAME_PAIR(VPMIN_U, pmin_u)
+DO_3SAME_PAIR(VPADD, padd_u)
+
+#define DO_3SAME_VQDMULH(INSN, FUNC) \
+ WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \
+ WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \
+ static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static const GVecGen3 ops[2] = { \
+ { .fni4 = gen_##INSN##_tramp16 }, \
+ { .fni4 = gen_##INSN##_tramp32 }, \
+ }; \
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
+ } \
+ static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size != 1 && a->size != 2) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##INSN##_3s); \
+ }
+
+DO_3SAME_VQDMULH(VQDMULH, qdmulh)
+DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
+
+#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
+ static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ TCGv_ptr fpst = fpstatus_ptr(FPST); \
+ tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
+ oprsz, maxsz, 0, FUNC); \
+ tcg_temp_free_ptr(fpst); \
+ }
+
+#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \
+ WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \
+ WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \
+ static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##INSN##_fp16_3s); \
+ } \
+ return do_3same(s, a, gen_##INSN##_fp32_3s); \
+ }
+
+
+DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
+DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
+DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
+DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
+DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
+DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
+DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
+DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
+DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
+DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
+DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
+DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
+DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
+DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
+DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
+DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
+DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
+
+WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
+WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
+WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
+WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
+
+static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_3same(s, a, gen_VMAXNM_fp16_3s);
+ }
+ return do_3same(s, a, gen_VMAXNM_fp32_3s);
+}
+
+static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_3same(s, a, gen_VMINNM_fp16_3s);
+ }
+ return do_3same(s, a, gen_VMINNM_fp32_3s);
+}
+
+static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
+ gen_helper_gvec_3_ptr *fn)
+{
+ /* FP pairwise operations */
+ TCGv_ptr fpstatus;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ assert(a->q == 0); /* enforced by decode patterns */
+
+
+ fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+ vfp_reg_offset(1, a->vn),
+ vfp_reg_offset(1, a->vm),
+ fpstatus, 8, 8, 0, fn);
+ tcg_temp_free_ptr(fpstatus);
+
+ return true;
+}
+
+/*
+ * For all the functions using this macro, size == MO_16 means fp16,
+ * which requires the fp16 arithmetic extension to be implemented.
+ */
+#define DO_3S_FP_PAIR(INSN,FUNC) \
+ static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
+ { \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_3same_fp_pair(s, a, FUNC##h); \
+ } \
+ return do_3same_fp_pair(s, a, FUNC##s); \
+ }
+
+DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
+DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
+DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
+
+static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
+{
+ /* Handle a 2-reg-shift insn which can be vectorized. */
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_full_reg_offset(a->vd);
+ int rm_ofs = neon_full_reg_offset(a->vm);
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
+ return true;
+}
+
+#define DO_2SH(INSN, FUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ return do_vector_2sh(s, a, FUNC); \
+ } \
+
+DO_2SH(VSHL, tcg_gen_gvec_shli)
+DO_2SH(VSLI, gen_gvec_sli)
+DO_2SH(VSRI, gen_gvec_sri)
+DO_2SH(VSRA_S, gen_gvec_ssra)
+DO_2SH(VSRA_U, gen_gvec_usra)
+DO_2SH(VRSHR_S, gen_gvec_srshr)
+DO_2SH(VRSHR_U, gen_gvec_urshr)
+DO_2SH(VRSRA_S, gen_gvec_srsra)
+DO_2SH(VRSRA_U, gen_gvec_ursra)
+
+static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+ /* Signed shift out of range results in all-sign-bits */
+ a->shift = MIN(a->shift, (8 << a->size) - 1);
+ return do_vector_2sh(s, a, tcg_gen_gvec_sari);
+}
+
+static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
+}
+
+static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+ /* Shift out of range is architecturally valid and results in zero. */
+ if (a->shift >= (8 << a->size)) {
+ return do_vector_2sh(s, a, gen_zero_rd_2sh);
+ } else {
+ return do_vector_2sh(s, a, tcg_gen_gvec_shri);
+ }
+}
+
+static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwo64OpEnvFn *fn)
+{
+ /*
+ * 2-reg-and-shift operations, size == 3 case, where the
+ * function needs to be passed cpu_env.
+ */
+ TCGv_i64 constimm;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * To avoid excessive duplication of ops we implement shift
+ * by immediate using the variable shift operations.
+ */
+ constimm = tcg_constant_i64(dup_const(a->size, a->shift));
+
+ for (pass = 0; pass < a->q + 1; pass++) {
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ read_neon_element64(tmp, a->vm, pass, MO_64);
+ fn(tmp, cpu_env, tmp, constimm);
+ write_neon_element64(tmp, a->vd, pass, MO_64);
+ tcg_temp_free_i64(tmp);
+ }
+ return true;
+}
+
+static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwoOpEnvFn *fn)
+{
+ /*
+ * 2-reg-and-shift operations, size < 3 case, where the
+ * helper needs to be passed cpu_env.
+ */
+ TCGv_i32 constimm, tmp;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * To avoid excessive duplication of ops we implement shift
+ * by immediate using the variable shift operations.
+ */
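+ /*
+ * For example, a byte-sized shift by 3 uses dup_const(MO_8, 3), which
+ * replicates the count into every byte of constimm so that each lane
+ * of the variable-shift helper sees the same shift amount.
+ */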
+ constimm = tcg_constant_i32(dup_const(a->size, a->shift));
+ tmp = tcg_temp_new_i32();
+
+ for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+ read_neon_element32(tmp, a->vm, pass, MO_32);
+ fn(tmp, cpu_env, tmp, constimm);
+ write_neon_element32(tmp, a->vd, pass, MO_32);
+ }
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+#define DO_2SHIFT_ENV(INSN, FUNC) \
+ static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
+ } \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ static NeonGenTwoOpEnvFn * const fns[] = { \
+ gen_helper_neon_##FUNC##8, \
+ gen_helper_neon_##FUNC##16, \
+ gen_helper_neon_##FUNC##32, \
+ }; \
+ assert(a->size < ARRAY_SIZE(fns)); \
+ return do_2shift_env_32(s, a, fns[a->size]); \
+ }
+
+DO_2SHIFT_ENV(VQSHLU, qshlu_s)
+DO_2SHIFT_ENV(VQSHL_U, qshl_u)
+DO_2SHIFT_ENV(VQSHL_S, qshl_s)
+
+static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwo64OpFn *shiftfn,
+ NeonGenNarrowEnvFn *narrowfn)
+{
+ /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
+ TCGv_i64 constimm, rm1, rm2;
+ TCGv_i32 rd;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vm & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This is always a right shift, and the shiftfn is always a
+ * left-shift helper, which thus needs the negated shift count.
+ */
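+ /*
+ * For example, a VSHRN with 64-bit source elements and shift 16 passes
+ * -16 to gen_ushl_i64, which treats a negative count as a right shift.
+ */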
+ constimm = tcg_constant_i64(-a->shift);
+ rm1 = tcg_temp_new_i64();
+ rm2 = tcg_temp_new_i64();
+ rd = tcg_temp_new_i32();
+
+ /* Load both inputs first to avoid potential overwrite if rm == rd */
+ read_neon_element64(rm1, a->vm, 0, MO_64);
+ read_neon_element64(rm2, a->vm, 1, MO_64);
+
+ shiftfn(rm1, rm1, constimm);
+ narrowfn(rd, cpu_env, rm1);
+ write_neon_element32(rd, a->vd, 0, MO_32);
+
+ shiftfn(rm2, rm2, constimm);
+ narrowfn(rd, cpu_env, rm2);
+ write_neon_element32(rd, a->vd, 1, MO_32);
+
+ tcg_temp_free_i32(rd);
+ tcg_temp_free_i64(rm1);
+ tcg_temp_free_i64(rm2);
+
+ return true;
+}
+
+static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwoOpFn *shiftfn,
+ NeonGenNarrowEnvFn *narrowfn)
+{
+ /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
+ TCGv_i32 constimm, rm1, rm2, rm3, rm4;
+ TCGv_i64 rtmp;
+ uint32_t imm;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vm & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This is always a right shift, and the shiftfn is always a
+ * left-shift helper, which thus needs the negated shift count
+ * duplicated into each lane of the immediate value.
+ */
+ if (a->size == 1) {
+ imm = (uint16_t)(-a->shift);
+ imm |= imm << 16;
+ } else {
+ /* size == 2 */
+ imm = -a->shift;
+ }
+ constimm = tcg_constant_i32(imm);
+
+ /* Load all inputs first to avoid potential overwrite */
+ rm1 = tcg_temp_new_i32();
+ rm2 = tcg_temp_new_i32();
+ rm3 = tcg_temp_new_i32();
+ rm4 = tcg_temp_new_i32();
+ read_neon_element32(rm1, a->vm, 0, MO_32);
+ read_neon_element32(rm2, a->vm, 1, MO_32);
+ read_neon_element32(rm3, a->vm, 2, MO_32);
+ read_neon_element32(rm4, a->vm, 3, MO_32);
+ rtmp = tcg_temp_new_i64();
+
+ shiftfn(rm1, rm1, constimm);
+ shiftfn(rm2, rm2, constimm);
+
+ tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
+ tcg_temp_free_i32(rm2);
+
+ narrowfn(rm1, cpu_env, rtmp);
+ write_neon_element32(rm1, a->vd, 0, MO_32);
+ tcg_temp_free_i32(rm1);
+
+ shiftfn(rm3, rm3, constimm);
+ shiftfn(rm4, rm4, constimm);
+
+ tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
+ tcg_temp_free_i32(rm4);
+
+ narrowfn(rm3, cpu_env, rtmp);
+ tcg_temp_free_i64(rtmp);
+ write_neon_element32(rm3, a->vd, 1, MO_32);
+ tcg_temp_free_i32(rm3);
+ return true;
+}
+
+#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \
+ }
+#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
+ }
+
+static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ tcg_gen_extrl_i64_i32(dest, src);
+}
+
+static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ gen_helper_neon_narrow_u16(dest, src);
+}
+
+static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ gen_helper_neon_narrow_u8(dest, src);
+}
+
+DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
+DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
+DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
+
+DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
+DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
+DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
+
+DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
+DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
+DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
+
+DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
+DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
+DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
+DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
+DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
+DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
+
+DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
+DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
+DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
+
+DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
+DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
+DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
+
+DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
+DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
+DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
+
+static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
+ NeonGenWidenFn *widenfn, bool u)
+{
+ TCGv_i64 tmp;
+ TCGv_i32 rm0, rm1;
+ uint64_t widen_mask = 0;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This is a widen-and-shift operation. The shift is always less
+ * than the width of the source type, so after widening the input
+ * vector we can simply shift the whole 64-bit widened register,
+ * and then clear the potential overflow bits resulting from left
+ * bits of the narrow input appearing as right bits of the left
+ * neighbour narrow input. Calculate a mask of bits to clear.
+ */
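+ /*
+ * For example, for a signed byte source with shift == 3 each widened
+ * lane is 16 bits wide and widen_mask is 0xff >> (8 - 3) == 0x07,
+ * replicated into every halfword lane; after the 64-bit left shift
+ * those low 3 bits of each lane hold bits leaked in from the
+ * neighbouring element and are cleared below.
+ */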
+ if ((a->shift != 0) && (a->size < 2 || u)) {
+ int esize = 8 << a->size;
+ widen_mask = MAKE_64BIT_MASK(0, esize);
+ widen_mask >>= esize - a->shift;
+ widen_mask = dup_const(a->size + 1, widen_mask);
+ }
+
+ rm0 = tcg_temp_new_i32();
+ rm1 = tcg_temp_new_i32();
+ read_neon_element32(rm0, a->vm, 0, MO_32);
+ read_neon_element32(rm1, a->vm, 1, MO_32);
+ tmp = tcg_temp_new_i64();
+
+ widenfn(tmp, rm0);
+ tcg_temp_free_i32(rm0);
+ if (a->shift != 0) {
+ tcg_gen_shli_i64(tmp, tmp, a->shift);
+ tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
+ }
+ write_neon_element64(tmp, a->vd, 0, MO_64);
+
+ widenfn(tmp, rm1);
+ tcg_temp_free_i32(rm1);
+ if (a->shift != 0) {
+ tcg_gen_shli_i64(tmp, tmp, a->shift);
+ tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
+ }
+ write_neon_element64(tmp, a->vd, 1, MO_64);
+ tcg_temp_free_i64(tmp);
+ return true;
+}
+
+static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_s8,
+ gen_helper_neon_widen_s16,
+ tcg_gen_ext_i32_i64,
+ };
+ return do_vshll_2sh(s, a, widenfn[a->size], false);
+}
+
+static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_u8,
+ gen_helper_neon_widen_u16,
+ tcg_gen_extu_i32_i64,
+ };
+ return do_vshll_2sh(s, a, widenfn[a->size], true);
+}
+
+static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
+ gen_helper_gvec_2_ptr *fn)
+{
+ /* FP operations in 2-reg-and-shift group */
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_full_reg_offset(a->vd);
+ int rm_ofs = neon_full_reg_offset(a->vm);
+ TCGv_ptr fpst;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+#define DO_FP_2SH(INSN, FUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { \
+ return do_fp_2sh(s, a, FUNC); \
+ }
+
+DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
+DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
+DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
+DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
+
+DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
+DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
+DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
+DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
+
+static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
+ GVecGen2iFn *fn)
+{
+ uint64_t imm;
+ int reg_ofs, vec_size;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ reg_ofs = neon_full_reg_offset(a->vd);
+ vec_size = a->q ? 16 : 8;
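+ /* Expand the 8-bit immediate into the replicated 64-bit constant (AdvSIMDExpandImm). */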
+ imm = asimd_imm_const(a->imm, a->cmode, a->op);
+
+ fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
+ return true;
+}
+
+static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
+}
+
+static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
+{
+ /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
+ GVecGen2iFn *fn;
+
+ if ((a->cmode & 1) && a->cmode < 12) {
+ /* for op=1, the imm will be inverted, so BIC becomes AND. */
+ fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
+ } else {
+ /* There is one unallocated cmode/op combination in this space */
+ if (a->cmode == 15 && a->op == 1) {
+ return false;
+ }
+ fn = gen_VMOV_1r;
+ }
+ return do_1reg_imm(s, a, fn);
+}
+
+static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
+ NeonGenWidenFn *widenfn,
+ NeonGenTwo64OpFn *opfn,
+ int src1_mop, int src2_mop)
+{
+ /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
+ TCGv_i64 rn0_64, rn1_64, rm_64;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* size == 3 case, which is an entirely different insn group */
+ return false;
+ }
+
+ if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rn0_64 = tcg_temp_new_i64();
+ rn1_64 = tcg_temp_new_i64();
+ rm_64 = tcg_temp_new_i64();
+
+ if (src1_mop >= 0) {
+ read_neon_element64(rn0_64, a->vn, 0, src1_mop);
+ } else {
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vn, 0, MO_32);
+ widenfn(rn0_64, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+ if (src2_mop >= 0) {
+ read_neon_element64(rm_64, a->vm, 0, src2_mop);
+ } else {
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vm, 0, MO_32);
+ widenfn(rm_64, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+
+ opfn(rn0_64, rn0_64, rm_64);
+
+ /*
+ * Load second pass inputs before storing the first pass result, to
+ * avoid incorrect results if a narrow input overlaps with the result.
+ */
+ if (src1_mop >= 0) {
+ read_neon_element64(rn1_64, a->vn, 1, src1_mop);
+ } else {
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vn, 1, MO_32);
+ widenfn(rn1_64, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+ if (src2_mop >= 0) {
+ read_neon_element64(rm_64, a->vm, 1, src2_mop);
+ } else {
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vm, 1, MO_32);
+ widenfn(rm_64, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+
+ write_neon_element64(rn0_64, a->vd, 0, MO_64);
+
+ opfn(rn1_64, rn1_64, rm_64);
+ write_neon_element64(rn1_64, a->vd, 1, MO_64);
+
+ tcg_temp_free_i64(rn0_64);
+ tcg_temp_free_i64(rn1_64);
+ tcg_temp_free_i64(rm_64);
+
+ return true;
+}
+
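+ /*
+ * For 32-bit elements the narrow operand can be read directly with a
+ * sign- or zero-extending 64-bit load (MO_32 | SIGN), so no widenfn is
+ * needed; narrow_mop == -1 selects the widenfn path for the 8- and
+ * 16-bit cases. The wide operand of VADDW/VSUBW is always read MO_UQ.
+ */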
+#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \
+ static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
+ { \
+ static NeonGenWidenFn * const widenfn[] = { \
+ gen_helper_neon_widen_##S##8, \
+ gen_helper_neon_widen_##S##16, \
+ NULL, NULL, \
+ }; \
+ static NeonGenTwo64OpFn * const addfn[] = { \
+ gen_helper_neon_##OP##l_u16, \
+ gen_helper_neon_##OP##l_u32, \
+ tcg_gen_##OP##_i64, \
+ NULL, \
+ }; \
+ int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \
+ return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \
+ SRC1WIDE ? MO_UQ : narrow_mop, \
+ narrow_mop); \
+ }
+
+DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
+DO_PREWIDEN(VADDL_U, u, add, false, 0)
+DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
+DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
+DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
+DO_PREWIDEN(VADDW_U, u, add, true, 0)
+DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
+DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
+
+static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
+ NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
+{
+ /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
+ TCGv_i64 rn_64, rm_64;
+ TCGv_i32 rd0, rd1;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn || !narrowfn) {
+ /* size == 3 case, which is an entirely different insn group */
+ return false;
+ }
+
+ if ((a->vn | a->vm) & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rn_64 = tcg_temp_new_i64();
+ rm_64 = tcg_temp_new_i64();
+ rd0 = tcg_temp_new_i32();
+ rd1 = tcg_temp_new_i32();
+
+ read_neon_element64(rn_64, a->vn, 0, MO_64);
+ read_neon_element64(rm_64, a->vm, 0, MO_64);
+
+ opfn(rn_64, rn_64, rm_64);
+
+ narrowfn(rd0, rn_64);
+
+ read_neon_element64(rn_64, a->vn, 1, MO_64);
+ read_neon_element64(rm_64, a->vm, 1, MO_64);
+
+ opfn(rn_64, rn_64, rm_64);
+
+ narrowfn(rd1, rn_64);
+
+ write_neon_element32(rd0, a->vd, 0, MO_32);
+ write_neon_element32(rd1, a->vd, 1, MO_32);
+
+ tcg_temp_free_i32(rd0);
+ tcg_temp_free_i32(rd1);
+ tcg_temp_free_i64(rn_64);
+ tcg_temp_free_i64(rm_64);
+
+ return true;
+}
+
+#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \
+ static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
+ { \
+ static NeonGenTwo64OpFn * const addfn[] = { \
+ gen_helper_neon_##OP##l_u16, \
+ gen_helper_neon_##OP##l_u32, \
+ tcg_gen_##OP##_i64, \
+ NULL, \
+ }; \
+ static NeonGenNarrowFn * const narrowfn[] = { \
+ gen_helper_neon_##NARROWTYPE##_high_u8, \
+ gen_helper_neon_##NARROWTYPE##_high_u16, \
+ EXTOP, \
+ NULL, \
+ }; \
+ return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \
+ }
+
+static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
+{
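+ /* Add 1 << 31 so that extracting the high half rounds to nearest rather than truncating. */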
+ tcg_gen_addi_i64(rn, rn, 1u << 31);
+ tcg_gen_extrh_i64_i32(rd, rn);
+}
+
+DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
+DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
+DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
+DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
+
+static bool do_long_3d(DisasContext *s, arg_3diff *a,
+ NeonGenTwoOpWidenFn *opfn,
+ NeonGenTwo64OpFn *accfn)
+{
+ /*
+ * 3-regs different lengths, long operations.
+ * These perform an operation on two inputs that returns a double-width
+ * result, and then possibly perform an accumulation operation of
+ * that result into the double-width destination.
+ */
+ TCGv_i64 rd0, rd1, tmp;
+ TCGv_i32 rn, rm;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* size == 3 case, which is an entirely different insn group */
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rd0 = tcg_temp_new_i64();
+ rd1 = tcg_temp_new_i64();
+
+ rn = tcg_temp_new_i32();
+ rm = tcg_temp_new_i32();
+ read_neon_element32(rn, a->vn, 0, MO_32);
+ read_neon_element32(rm, a->vm, 0, MO_32);
+ opfn(rd0, rn, rm);
+
+ read_neon_element32(rn, a->vn, 1, MO_32);
+ read_neon_element32(rm, a->vm, 1, MO_32);
+ opfn(rd1, rn, rm);
+ tcg_temp_free_i32(rn);
+ tcg_temp_free_i32(rm);
+
+ /* Don't store results until after all loads: they might overlap */
+ if (accfn) {
+ tmp = tcg_temp_new_i64();
+ read_neon_element64(tmp, a->vd, 0, MO_64);
+ accfn(rd0, tmp, rd0);
+ read_neon_element64(tmp, a->vd, 1, MO_64);
+ accfn(rd1, tmp, rd1);
+ tcg_temp_free_i64(tmp);
+ }
+
+ write_neon_element64(rd0, a->vd, 0, MO_64);
+ write_neon_element64(rd1, a->vd, 1, MO_64);
+ tcg_temp_free_i64(rd0);
+ tcg_temp_free_i64(rd1);
+
+ return true;
+}
+
+static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_abdl_s16,
+ gen_helper_neon_abdl_s32,
+ gen_helper_neon_abdl_s64,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_abdl_u16,
+ gen_helper_neon_abdl_u32,
+ gen_helper_neon_abdl_u64,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_abdl_s16,
+ gen_helper_neon_abdl_s32,
+ gen_helper_neon_abdl_s64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const addfn[] = {
+ gen_helper_neon_addl_u16,
+ gen_helper_neon_addl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
+}
+
+static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_abdl_u16,
+ gen_helper_neon_abdl_u32,
+ gen_helper_neon_abdl_u64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const addfn[] = {
+ gen_helper_neon_addl_u16,
+ gen_helper_neon_addl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
+}
+
+static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
+{
+ TCGv_i32 lo = tcg_temp_new_i32();
+ TCGv_i32 hi = tcg_temp_new_i32();
+
+ tcg_gen_muls2_i32(lo, hi, rn, rm);
+ tcg_gen_concat_i32_i64(rd, lo, hi);
+
+ tcg_temp_free_i32(lo);
+ tcg_temp_free_i32(hi);
+}
+
+static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
+{
+ TCGv_i32 lo = tcg_temp_new_i32();
+ TCGv_i32 hi = tcg_temp_new_i32();
+
+ tcg_gen_mulu2_i32(lo, hi, rn, rm);
+ tcg_gen_concat_i32_i64(rd, lo, hi);
+
+ tcg_temp_free_i32(lo);
+ tcg_temp_free_i32(hi);
+}
+
+static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_mull_s8,
+ gen_helper_neon_mull_s16,
+ gen_mull_s32,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ gen_helper_neon_mull_u8,
+ gen_helper_neon_mull_u16,
+ gen_mull_u32,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], NULL);
+}
+
+#define DO_VMLAL(INSN,MULL,ACC) \
+ static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
+ { \
+ static NeonGenTwoOpWidenFn * const opfn[] = { \
+ gen_helper_neon_##MULL##8, \
+ gen_helper_neon_##MULL##16, \
+ gen_##MULL##32, \
+ NULL, \
+ }; \
+ static NeonGenTwo64OpFn * const accfn[] = { \
+ gen_helper_neon_##ACC##l_u16, \
+ gen_helper_neon_##ACC##l_u32, \
+ tcg_gen_##ACC##_i64, \
+ NULL, \
+ }; \
+ return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \
+ }
+
+DO_VMLAL(VMLAL_S,mull_s,add)
+DO_VMLAL(VMLAL_U,mull_u,add)
+DO_VMLAL(VMLSL_S,mull_s,sub)
+DO_VMLAL(VMLSL_U,mull_u,sub)
+
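+ /* The doubling of the product in VQDMULL is done as a saturating add of the product to itself. */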
+static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
+{
+ gen_helper_neon_mull_s16(rd, rn, rm);
+ gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
+}
+
+static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
+{
+ gen_mull_s32(rd, rn, rm);
+ gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
+}
+
+static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], NULL);
+}
+
+static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
+}
+
+static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
+}
+
+static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLAL_acc_16,
+ gen_VQDMLAL_acc_32,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ gen_helper_neon_negl_u32(rm, rm);
+ gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
+}
+
+static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ tcg_gen_neg_i64(rm, rm);
+ gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
+}
+
+static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLSL_acc_16,
+ gen_VQDMLSL_acc_32,
+ NULL,
+ };
+
+ return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
+{
+ gen_helper_gvec_3 *fn_gvec;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
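+ /* size 0 is the 8x8->16 polynomial VMULL.P8; size 2 is the 64x64->128 VMULL.P64, which also needs the PMULL extension. */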
+ switch (a->size) {
+ case 0:
+ fn_gvec = gen_helper_neon_pmull_h;
+ break;
+ case 2:
+ if (!dc_isar_feature(aa32_pmull, s)) {
+ return false;
+ }
+ fn_gvec = gen_helper_gvec_pmull_q;
+ break;
+ default:
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
+ neon_full_reg_offset(a->vn),
+ neon_full_reg_offset(a->vm),
+ 16, 16, 0, fn_gvec);
+ return true;
+}
+
+static void gen_neon_dup_low16(TCGv_i32 var)
+{
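+ /* e.g. var = 0x1234abcd -> 0xabcdabcd: duplicate the low halfword into both halves. */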
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_ext16u_i32(var, var);
+ tcg_gen_shli_i32(tmp, var, 16);
+ tcg_gen_or_i32(var, var, tmp);
+ tcg_temp_free_i32(tmp);
+}
+
+static void gen_neon_dup_high16(TCGv_i32 var)
+{
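+ /* e.g. var = 0xabcd1234 -> 0xabcdabcd: duplicate the high halfword into both halves. */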
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_andi_i32(var, var, 0xffff0000);
+ tcg_gen_shri_i32(tmp, var, 16);
+ tcg_gen_or_i32(var, var, tmp);
+ tcg_temp_free_i32(tmp);
+}
+
+static inline TCGv_i32 neon_get_scalar(int size, int reg)
+{
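+ /*
+ * reg is the M:Vm field (see do_2scalar_fp_vec below): the low bits
+ * select the D register and the remaining bits select which element
+ * of that register is the scalar.
+ */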
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ if (size == MO_16) {
+ read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
+ if (reg & 8) {
+ gen_neon_dup_high16(tmp);
+ } else {
+ gen_neon_dup_low16(tmp);
+ }
+ } else {
+ read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
+ }
+ return tmp;
+}
+
+static bool do_2scalar(DisasContext *s, arg_2scalar *a,
+ NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
+{
+ /*
+ * Two registers and a scalar: perform an operation between
+ * the input elements and the scalar, and then possibly
+ * perform an accumulation operation of that result into the
+ * destination.
+ */
+ TCGv_i32 scalar, tmp;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
+
+ if (a->q && ((a->vd | a->vn) & 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ scalar = neon_get_scalar(a->size, a->vm);
+ tmp = tcg_temp_new_i32();
+
+ for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+ read_neon_element32(tmp, a->vn, pass, MO_32);
+ opfn(tmp, tmp, scalar);
+ if (accfn) {
+ TCGv_i32 rd = tcg_temp_new_i32();
+ read_neon_element32(rd, a->vd, pass, MO_32);
+ accfn(tmp, rd, tmp);
+ tcg_temp_free_i32(rd);
+ }
+ write_neon_element32(tmp, a->vd, pass, MO_32);
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(scalar);
+ return true;
+}
+
+static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mul_u16,
+ tcg_gen_mul_i32,
+ NULL,
+ };
+
+ return do_2scalar(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mul_u16,
+ tcg_gen_mul_i32,
+ NULL,
+ };
+ static NeonGenTwoOpFn * const accfn[] = {
+ NULL,
+ gen_helper_neon_add_u16,
+ tcg_gen_add_i32,
+ NULL,
+ };
+
+ return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mul_u16,
+ tcg_gen_mul_i32,
+ NULL,
+ };
+ static NeonGenTwoOpFn * const accfn[] = {
+ NULL,
+ gen_helper_neon_sub_u16,
+ tcg_gen_sub_i32,
+ NULL,
+ };
+
+ return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
+ gen_helper_gvec_3_ptr *fn)
+{
+ /* Two registers and a scalar, using gvec */
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_full_reg_offset(a->vd);
+ int rn_ofs = neon_full_reg_offset(a->vn);
+ int rm_ofs;
+ int idx;
+ TCGv_ptr fpstatus;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!fn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
+
+ if (a->q && ((a->vd | a->vn) & 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* a->vm is M:Vm, which encodes both register and index */
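+ /* e.g. for MO_32 this leaves idx = M, a->vm = Vm<3:0>; for MO_16, idx = M:Vm<3>, a->vm = Vm<2:0>. */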
+ idx = extract32(a->vm, a->size + 2, 2);
+ a->vm = extract32(a->vm, 0, a->size + 2);
+ rm_ofs = neon_full_reg_offset(a->vm);
+
+ fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
+ vec_size, vec_size, idx, fn);
+ tcg_temp_free_ptr(fpstatus);
+ return true;
+}
+
+#define DO_VMUL_F_2sc(NAME, FUNC) \
+ static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \
+ { \
+ static gen_helper_gvec_3_ptr * const opfn[] = { \
+ NULL, \
+ gen_helper_##FUNC##_h, \
+ gen_helper_##FUNC##_s, \
+ NULL, \
+ }; \
+ if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_2scalar_fp_vec(s, a, opfn[a->size]); \
+ }
+
+DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
+DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
+DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
+
+WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
+WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
+WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
+WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
+
+static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpFn * const opfn[] = {
+ NULL,
+ gen_VQDMULH_16,
+ gen_VQDMULH_32,
+ NULL,
+ };
+
+ return do_2scalar(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpFn * const opfn[] = {
+ NULL,
+ gen_VQRDMULH_16,
+ gen_VQRDMULH_32,
+ NULL,
+ };
+
+ return do_2scalar(s, a, opfn[a->size], NULL);
+}
+
+static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
+ NeonGenThreeOpEnvFn *opfn)
+{
+ /*
+ * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
+ * performs a kind of fused op-then-accumulate using a helper
+ * function that takes all of rd, rn and the scalar at once.
+ */
+ TCGv_i32 scalar, rn, rd;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_rdm, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
+
+ if (a->q && ((a->vd | a->vn) & 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ scalar = neon_get_scalar(a->size, a->vm);
+ rn = tcg_temp_new_i32();
+ rd = tcg_temp_new_i32();
+
+ for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+ read_neon_element32(rn, a->vn, pass, MO_32);
+ read_neon_element32(rd, a->vd, pass, MO_32);
+ opfn(rd, cpu_env, rn, scalar, rd);
+ write_neon_element32(rd, a->vd, pass, MO_32);
+ }
+ tcg_temp_free_i32(rn);
+ tcg_temp_free_i32(rd);
+ tcg_temp_free_i32(scalar);
+
+ return true;
+}
+
+static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenThreeOpEnvFn *opfn[] = {
+ NULL,
+ gen_helper_neon_qrdmlah_s16,
+ gen_helper_neon_qrdmlah_s32,
+ NULL,
+ };
+ return do_vqrdmlah_2sc(s, a, opfn[a->size]);
+}
+
+static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenThreeOpEnvFn *opfn[] = {
+ NULL,
+ gen_helper_neon_qrdmlsh_s16,
+ gen_helper_neon_qrdmlsh_s32,
+ NULL,
+ };
+ return do_vqrdmlah_2sc(s, a, opfn[a->size]);
+}
+
+static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
+ NeonGenTwoOpWidenFn *opfn,
+ NeonGenTwo64OpFn *accfn)
+{
+ /*
+ * Two registers and a scalar, long operations: perform an
+ * operation on the input elements and the scalar which produces
+ * a double-width result, and then possibly perform an accumulation
+ * operation of that result into the destination.
+ */
+ TCGv_i32 scalar, rn;
+ TCGv_i64 rn0_64, rn1_64;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ scalar = neon_get_scalar(a->size, a->vm);
+
+ /* Load all inputs before writing any outputs, in case of overlap */
+ rn = tcg_temp_new_i32();
+ read_neon_element32(rn, a->vn, 0, MO_32);
+ rn0_64 = tcg_temp_new_i64();
+ opfn(rn0_64, rn, scalar);
+
+ read_neon_element32(rn, a->vn, 1, MO_32);
+ rn1_64 = tcg_temp_new_i64();
+ opfn(rn1_64, rn, scalar);
+ tcg_temp_free_i32(rn);
+ tcg_temp_free_i32(scalar);
+
+ if (accfn) {
+ TCGv_i64 t64 = tcg_temp_new_i64();
+ read_neon_element64(t64, a->vd, 0, MO_64);
+ accfn(rn0_64, t64, rn0_64);
+ read_neon_element64(t64, a->vd, 1, MO_64);
+ accfn(rn1_64, t64, rn1_64);
+ tcg_temp_free_i64(t64);
+ }
+
+ write_neon_element64(rn0_64, a->vd, 0, MO_64);
+ write_neon_element64(rn1_64, a->vd, 1, MO_64);
+ tcg_temp_free_i64(rn0_64);
+ tcg_temp_free_i64(rn1_64);
+ return true;
+}
+
+static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mull_s16,
+ gen_mull_s32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mull_u16,
+ gen_mull_u32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+#define DO_VMLAL_2SC(INSN, MULL, ACC) \
+ static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \
+ { \
+ static NeonGenTwoOpWidenFn * const opfn[] = { \
+ NULL, \
+ gen_helper_neon_##MULL##16, \
+ gen_##MULL##32, \
+ NULL, \
+ }; \
+ static NeonGenTwo64OpFn * const accfn[] = { \
+ NULL, \
+ gen_helper_neon_##ACC##l_u32, \
+ tcg_gen_##ACC##_i64, \
+ NULL, \
+ }; \
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \
+ }
+
+DO_VMLAL_2SC(VMLAL_S, mull_s, add)
+DO_VMLAL_2SC(VMLAL_U, mull_u, add)
+DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
+DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
+
+static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLAL_acc_16,
+ gen_VQDMLAL_acc_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLSL_acc_16,
+ gen_VQDMLSL_acc_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vn | a->vm | a->vd) & a->q) {
+ return false;
+ }
+
+ if (a->imm > 7 && !a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (!a->q) {
+ /* Extract 64 bits from <Vm:Vn> */
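+ /* e.g. imm == 3: dest = bytes 3..7 of Vn followed by bytes 0..2 of Vm. */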
+ TCGv_i64 left, right, dest;
+
+ left = tcg_temp_new_i64();
+ right = tcg_temp_new_i64();
+ dest = tcg_temp_new_i64();
+
+ read_neon_element64(right, a->vn, 0, MO_64);
+ read_neon_element64(left, a->vm, 0, MO_64);
+ tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
+ write_neon_element64(dest, a->vd, 0, MO_64);
+
+ tcg_temp_free_i64(left);
+ tcg_temp_free_i64(right);
+ tcg_temp_free_i64(dest);
+ } else {
+ /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
+ TCGv_i64 left, middle, right, destleft, destright;
+
+ left = tcg_temp_new_i64();
+ middle = tcg_temp_new_i64();
+ right = tcg_temp_new_i64();
+ destleft = tcg_temp_new_i64();
+ destright = tcg_temp_new_i64();
+
+ if (a->imm < 8) {
+ read_neon_element64(right, a->vn, 0, MO_64);
+ read_neon_element64(middle, a->vn, 1, MO_64);
+ tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
+ read_neon_element64(left, a->vm, 0, MO_64);
+ tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
+ } else {
+ read_neon_element64(right, a->vn, 1, MO_64);
+ read_neon_element64(middle, a->vm, 0, MO_64);
+ tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
+ read_neon_element64(left, a->vm, 1, MO_64);
+ tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
+ }
+
+ write_neon_element64(destright, a->vd, 0, MO_64);
+ write_neon_element64(destleft, a->vd, 1, MO_64);
+
+ tcg_temp_free_i64(destright);
+ tcg_temp_free_i64(destleft);
+ tcg_temp_free_i64(right);
+ tcg_temp_free_i64(middle);
+ tcg_temp_free_i64(left);
+ }
+ return true;
+}
+
+static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
+{
+ TCGv_i64 val, def;
+ TCGv_i32 desc;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vn + a->len + 1) > 32) {
+ /*
+ * This is UNPREDICTABLE; we choose to UNDEF to avoid the
+ * helper function running off the end of the register file.
+ */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
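+ /* Pack the first table register (vn) and the table length minus one (len) into the descriptor for the helper. */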
+ desc = tcg_constant_i32((a->vn << 2) | a->len);
+ def = tcg_temp_new_i64();
+ if (a->op) {
+ read_neon_element64(def, a->vd, 0, MO_64);
+ } else {
+ tcg_gen_movi_i64(def, 0);
+ }
+ val = tcg_temp_new_i64();
+ read_neon_element64(val, a->vm, 0, MO_64);
+
+ gen_helper_neon_tbl(val, cpu_env, desc, val, def);
+ write_neon_element64(val, a->vd, 0, MO_64);
+
+ tcg_temp_free_i64(def);
+ tcg_temp_free_i64(val);
+ return true;
+}
+
+static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
+ neon_element_offset(a->vm, a->index, a->size),
+ a->q ? 16 : 8, a->q ? 16 : 8);
+ return true;
+}
+
+static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
+{
+ int pass, half;
+ TCGv_i32 tmp[2];
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (a->size == 3) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp[0] = tcg_temp_new_i32();
+ tmp[1] = tcg_temp_new_i32();
+
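+ /*
+ * Reverse each 64-bit doubleword: element-reverse each 32-bit half
+ * as needed for the element size, then write the two halves back in
+ * swapped order.
+ */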
+ for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
+ for (half = 0; half < 2; half++) {
+ read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
+ switch (a->size) {
+ case 0:
+ tcg_gen_bswap32_i32(tmp[half], tmp[half]);
+ break;
+ case 1:
+ gen_swap_half(tmp[half], tmp[half]);
+ break;
+ case 2:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
+ write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
+ }
+
+ tcg_temp_free_i32(tmp[0]);
+ tcg_temp_free_i32(tmp[1]);
+ return true;
+}
+
+static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
+ NeonGenWidenFn *widenfn,
+ NeonGenTwo64OpFn *opfn,
+ NeonGenTwo64OpFn *accfn)
+{
+ /*
+ * Pairwise long operations: widen both halves of the pair,
+ * combine the pairs with the opfn, and then possibly accumulate
+ * into the destination with the accfn.
+ */
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (!widenfn) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ for (pass = 0; pass < a->q + 1; pass++) {
+ TCGv_i32 tmp;
+ TCGv_i64 rm0_64, rm1_64, rd_64;
+
+ rm0_64 = tcg_temp_new_i64();
+ rm1_64 = tcg_temp_new_i64();
+ rd_64 = tcg_temp_new_i64();
+
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vm, pass * 2, MO_32);
+ widenfn(rm0_64, tmp);
+ read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
+ widenfn(rm1_64, tmp);
+ tcg_temp_free_i32(tmp);
+
+ opfn(rd_64, rm0_64, rm1_64);
+ tcg_temp_free_i64(rm0_64);
+ tcg_temp_free_i64(rm1_64);
+
+ if (accfn) {
+ TCGv_i64 tmp64 = tcg_temp_new_i64();
+ read_neon_element64(tmp64, a->vd, pass, MO_64);
+ accfn(rd_64, tmp64, rd_64);
+ tcg_temp_free_i64(tmp64);
+ }
+ write_neon_element64(rd_64, a->vd, pass, MO_64);
+ tcg_temp_free_i64(rd_64);
+ }
+ return true;
+}
+
+static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_s8,
+ gen_helper_neon_widen_s16,
+ tcg_gen_ext_i32_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const opfn[] = {
+ gen_helper_neon_paddl_u16,
+ gen_helper_neon_paddl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
+}
+
+static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_u8,
+ gen_helper_neon_widen_u16,
+ tcg_gen_extu_i32_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const opfn[] = {
+ gen_helper_neon_paddl_u16,
+ gen_helper_neon_paddl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
+}
+
+static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_s8,
+ gen_helper_neon_widen_s16,
+ tcg_gen_ext_i32_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const opfn[] = {
+ gen_helper_neon_paddl_u16,
+ gen_helper_neon_paddl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ gen_helper_neon_addl_u16,
+ gen_helper_neon_addl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
+ accfn[a->size]);
+}
+
+static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenWidenFn * const widenfn[] = {
+ gen_helper_neon_widen_u8,
+ gen_helper_neon_widen_u16,
+ tcg_gen_extu_i32_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const opfn[] = {
+ gen_helper_neon_paddl_u16,
+ gen_helper_neon_paddl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ gen_helper_neon_addl_u16,
+ gen_helper_neon_addl_u32,
+ tcg_gen_add_i64,
+ NULL,
+ };
+
+ return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
+ accfn[a->size]);
+}
+
+typedef void ZipFn(TCGv_ptr, TCGv_ptr);
+
+static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
+ ZipFn *fn)
+{
+ TCGv_ptr pd, pm;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (!fn) {
+ /* Bad size or size/q combination */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ pd = vfp_reg_ptr(true, a->vd);
+ pm = vfp_reg_ptr(true, a->vm);
+ fn(pd, pm);
+ tcg_temp_free_ptr(pd);
+ tcg_temp_free_ptr(pm);
+ return true;
+}
+
+static bool trans_VUZP(DisasContext *s, arg_2misc *a)
+{
+ static ZipFn * const fn[2][4] = {
+ {
+ gen_helper_neon_unzip8,
+ gen_helper_neon_unzip16,
+ NULL,
+ NULL,
+ }, {
+ gen_helper_neon_qunzip8,
+ gen_helper_neon_qunzip16,
+ gen_helper_neon_qunzip32,
+ NULL,
+ }
+ };
+ return do_zip_uzp(s, a, fn[a->q][a->size]);
+}
+
+static bool trans_VZIP(DisasContext *s, arg_2misc *a)
+{
+ static ZipFn * const fn[2][4] = {
+ {
+ gen_helper_neon_zip8,
+ gen_helper_neon_zip16,
+ NULL,
+ NULL,
+ }, {
+ gen_helper_neon_qzip8,
+ gen_helper_neon_qzip16,
+ gen_helper_neon_qzip32,
+ NULL,
+ }
+ };
+ return do_zip_uzp(s, a, fn[a->q][a->size]);
+}
+
+static bool do_vmovn(DisasContext *s, arg_2misc *a,
+ NeonGenNarrowEnvFn *narrowfn)
+{
+ TCGv_i64 rm;
+ TCGv_i32 rd0, rd1;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vm & 1) {
+ return false;
+ }
+
+ if (!narrowfn) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rm = tcg_temp_new_i64();
+ rd0 = tcg_temp_new_i32();
+ rd1 = tcg_temp_new_i32();
+
+ read_neon_element64(rm, a->vm, 0, MO_64);
+ narrowfn(rd0, cpu_env, rm);
+ read_neon_element64(rm, a->vm, 1, MO_64);
+ narrowfn(rd1, cpu_env, rm);
+ write_neon_element32(rd0, a->vd, 0, MO_32);
+ write_neon_element32(rd1, a->vd, 1, MO_32);
+ tcg_temp_free_i32(rd0);
+ tcg_temp_free_i32(rd1);
+ tcg_temp_free_i64(rm);
+ return true;
+}
+
+#define DO_VMOVN(INSN, FUNC) \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ static NeonGenNarrowEnvFn * const narrowfn[] = { \
+ FUNC##8, \
+ FUNC##16, \
+ FUNC##32, \
+ NULL, \
+ }; \
+ return do_vmovn(s, a, narrowfn[a->size]); \
+ }
+
+DO_VMOVN(VMOVN, gen_neon_narrow_u)
+DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
+DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
+DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
+
+static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
+{
+ TCGv_i32 rm0, rm1;
+ TCGv_i64 rd;
+ static NeonGenWidenFn * const widenfns[] = {
+ gen_helper_neon_widen_u8,
+ gen_helper_neon_widen_u16,
+ tcg_gen_extu_i32_i64,
+ NULL,
+ };
+ NeonGenWidenFn *widenfn = widenfns[a->size];
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
+ if (!widenfn) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rd = tcg_temp_new_i64();
+ rm0 = tcg_temp_new_i32();
+ rm1 = tcg_temp_new_i32();
+
+ read_neon_element32(rm0, a->vm, 0, MO_32);
+ read_neon_element32(rm1, a->vm, 1, MO_32);
+
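+ /*
+ * This is the VSHLL form whose shift equals the element size: widen
+ * (always zero-extending; the low half of each result element is zero
+ * anyway) and shift the whole 64-bit value left by 8 << size. Because
+ * the unsigned widen leaves the top half of every lane clear, no
+ * cross-lane masking (as in do_vshll_2sh) is needed after the shift.
+ */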
+ widenfn(rd, rm0);
+ tcg_gen_shli_i64(rd, rd, 8 << a->size);
+ write_neon_element64(rd, a->vd, 0, MO_64);
+ widenfn(rd, rm1);
+ tcg_gen_shli_i64(rd, rd, 8 << a->size);
+ write_neon_element64(rd, a->vd, 1, MO_64);
+
+ tcg_temp_free_i64(rd);
+ tcg_temp_free_i32(rm0);
+ tcg_temp_free_i32(rm1);
+ return true;
+}
+
+static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i64 tmp;
+ TCGv_i32 dst0, dst1;
+
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm & 1) || (a->size != 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
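+ /* Each bfcvt_pair call converts one 64-bit element (two f32 values) into a pair of bfloat16 results packed in 32 bits. */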
+ fpst = fpstatus_ptr(FPST_STD);
+ tmp = tcg_temp_new_i64();
+ dst0 = tcg_temp_new_i32();
+ dst1 = tcg_temp_new_i32();
+
+ read_neon_element64(tmp, a->vm, 0, MO_64);
+ gen_helper_bfcvt_pair(dst0, tmp, fpst);
+
+ read_neon_element64(tmp, a->vm, 1, MO_64);
+ gen_helper_bfcvt_pair(dst1, tmp, fpst);
+
+ write_neon_element32(dst0, a->vd, 0, MO_32);
+ write_neon_element32(dst1, a->vd, 1, MO_32);
+
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i32(dst0);
+ tcg_temp_free_i32(dst1);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp, tmp, tmp2, tmp3;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
+ !dc_isar_feature(aa32_fp16_spconv, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm & 1) || (a->size != 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
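+ /*
+ * Convert the four f32 inputs to f16 and pack them pairwise:
+ * elements 0/1 into word 0 of Dd and elements 2/3 into word 1, with
+ * the lower-numbered element in the low halfword of each word.
+ */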
+ fpst = fpstatus_ptr(FPST_STD);
+ ahp = get_ahp_flag();
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vm, 0, MO_32);
+ gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
+ tmp2 = tcg_temp_new_i32();
+ read_neon_element32(tmp2, a->vm, 1, MO_32);
+ gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
+ tcg_gen_shli_i32(tmp2, tmp2, 16);
+ tcg_gen_or_i32(tmp2, tmp2, tmp);
+ read_neon_element32(tmp, a->vm, 2, MO_32);
+ gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
+ tmp3 = tcg_temp_new_i32();
+ read_neon_element32(tmp3, a->vm, 3, MO_32);
+ write_neon_element32(tmp2, a->vd, 0, MO_32);
+ tcg_temp_free_i32(tmp2);
+ gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
+ tcg_gen_shli_i32(tmp3, tmp3, 16);
+ tcg_gen_or_i32(tmp3, tmp3, tmp);
+ write_neon_element32(tmp3, a->vd, 1, MO_32);
+ tcg_temp_free_i32(tmp3);
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(ahp);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp, tmp, tmp2, tmp3;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
+ !dc_isar_feature(aa32_fp16_spconv, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vd & 1) || (a->size != 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_STD);
+ ahp = get_ahp_flag();
+ tmp3 = tcg_temp_new_i32();
+ tmp2 = tcg_temp_new_i32();
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vm, 0, MO_32);
+ read_neon_element32(tmp2, a->vm, 1, MO_32);
+ tcg_gen_ext16u_i32(tmp3, tmp);
+ gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
+ write_neon_element32(tmp3, a->vd, 0, MO_32);
+ tcg_gen_shri_i32(tmp, tmp, 16);
+ gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
+ write_neon_element32(tmp, a->vd, 1, MO_32);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_ext16u_i32(tmp3, tmp2);
+ gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
+ write_neon_element32(tmp3, a->vd, 2, MO_32);
+ tcg_temp_free_i32(tmp3);
+ tcg_gen_shri_i32(tmp2, tmp2, 16);
+ gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
+ write_neon_element32(tmp2, a->vd, 3, MO_32);
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(ahp);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
+{
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_full_reg_offset(a->vd);
+ int rm_ofs = neon_full_reg_offset(a->vm);
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->size == 3) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
+
+ return true;
+}
+
+#define DO_2MISC_VEC(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ return do_2misc_vec(s, a, FN); \
+ }
+
+DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
+DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
+DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
+DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
+DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
+DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
+DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
+
+static bool trans_VMVN(DisasContext *s, arg_2misc *a)
+{
+ if (a->size != 0) {
+ return false;
+ }
+ return do_2misc_vec(s, a, tcg_gen_gvec_not);
+}
+
+#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \
+ static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, uint32_t oprsz, \
+ uint32_t maxsz) \
+ { \
+ tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \
+ DATA, FUNC); \
+ }
+
+#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \
+ static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, uint32_t oprsz, \
+ uint32_t maxsz) \
+ { \
+ tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \
+ }
+
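+ /* For the shared AES helpers, DATA == 1 selects the decrypt variant (AESD, AESIMC). */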
+WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
+WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
+WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
+WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
+WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
+WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
+WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
+
+#define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \
+ return false; \
+ } \
+ return do_2misc_vec(s, a, gen_##INSN); \
+ }
+
+DO_2M_CRYPTO(AESE, aa32_aes, 0)
+DO_2M_CRYPTO(AESD, aa32_aes, 0)
+DO_2M_CRYPTO(AESMC, aa32_aes, 0)
+DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
+DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
+DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
+DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
+
+static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
+{
+ TCGv_i32 tmp;
+ int pass;
+
+ /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!fn) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+ read_neon_element32(tmp, a->vm, pass, MO_32);
+ fn(tmp, tmp);
+ write_neon_element32(tmp, a->vd, pass, MO_32);
+ }
+ tcg_temp_free_i32(tmp);
+
+ return true;
+}
+
+static bool trans_VREV32(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenOneOpFn * const fn[] = {
+ tcg_gen_bswap32_i32,
+ gen_swap_half,
+ NULL,
+ NULL,
+ };
+ return do_2misc(s, a, fn[a->size]);
+}
+
+static bool trans_VREV16(DisasContext *s, arg_2misc *a)
+{
+ if (a->size != 0) {
+ return false;
+ }
+ return do_2misc(s, a, gen_rev16);
+}
+
+static bool trans_VCLS(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenOneOpFn * const fn[] = {
+ gen_helper_neon_cls_s8,
+ gen_helper_neon_cls_s16,
+ gen_helper_neon_cls_s32,
+ NULL,
+ };
+ return do_2misc(s, a, fn[a->size]);
+}
+
+static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
+{
+ tcg_gen_clzi_i32(rd, rm, 32);
+}
+
+static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenOneOpFn * const fn[] = {
+ gen_helper_neon_clz_u8,
+ gen_helper_neon_clz_u16,
+ do_VCLZ_32,
+ NULL,
+ };
+ return do_2misc(s, a, fn[a->size]);
+}
+
+static bool trans_VCNT(DisasContext *s, arg_2misc *a)
+{
+ if (a->size != 0) {
+ return false;
+ }
+ return do_2misc(s, a, gen_helper_neon_cnt_u8);
+}
+
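+ /* FP VABS just clears the sign bit of each element, so a plain vector ANDI suffices (no fp status needed). */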
+static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
+ vece == MO_16 ? 0x7fff : 0x7fffffff,
+ oprsz, maxsz);
+}
+
+static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
+{
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ } else if (a->size != MO_32) {
+ return false;
+ }
+ return do_2misc_vec(s, a, gen_VABS_F);
+}
+
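+ /* FP VNEG just flips the sign bit of each element, so a plain vector XORI suffices. */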
+static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
+ vece == MO_16 ? 0x8000 : 0x80000000,
+ oprsz, maxsz);
+}
+
+static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
+{
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ } else if (a->size != MO_32) {
+ return false;
+ }
+ return do_2misc_vec(s, a, gen_VNEG_F);
+}
+
+static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
+{
+ if (a->size != 2) {
+ return false;
+ }
+ return do_2misc(s, a, gen_helper_recpe_u32);
+}
+
+static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
+{
+ if (a->size != 2) {
+ return false;
+ }
+ return do_2misc(s, a, gen_helper_rsqrte_u32);
+}
+
+#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
+ static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \
+ { \
+ FUNC(d, cpu_env, m); \
+ }
+
+WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
+WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
+WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
+WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
+WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
+WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
+
+static bool trans_VQABS(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenOneOpFn * const fn[] = {
+ gen_VQABS_s8,
+ gen_VQABS_s16,
+ gen_VQABS_s32,
+ NULL,
+ };
+ return do_2misc(s, a, fn[a->size]);
+}
+
+static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
+{
+ static NeonGenOneOpFn * const fn[] = {
+ gen_VQNEG_s8,
+ gen_VQNEG_s16,
+ gen_VQNEG_s32,
+ NULL,
+ };
+ return do_2misc(s, a, fn[a->size]);
+}
+
+#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
+ static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static gen_helper_gvec_2_ptr * const fns[4] = { \
+ NULL, HFUNC, SFUNC, NULL, \
+ }; \
+ TCGv_ptr fpst; \
+ fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \
+ fns[vece]); \
+ tcg_temp_free_ptr(fpst); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ } else if (a->size != MO_32) { \
+ return false; \
+ } \
+ return do_2misc_vec(s, a, gen_##INSN); \
+ }
+
+DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
+DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
+DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
+DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
+DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
+DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
+DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
+DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
+DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
+DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
+DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
+
+DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
+
+static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+ return trans_VRINTX_impl(s, a);
+}
+
+#define DO_VEC_RMODE(INSN, RMODE, OP) \
+ static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static gen_helper_gvec_2_ptr * const fns[4] = { \
+ NULL, \
+ gen_helper_gvec_##OP##h, \
+ gen_helper_gvec_##OP##s, \
+ NULL, \
+ }; \
+ TCGv_ptr fpst; \
+ fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \
+ arm_rmode_to_sf(RMODE), fns[vece]); \
+ tcg_temp_free_ptr(fpst); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \
+ return false; \
+ } \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ } else if (a->size != MO_32) { \
+ return false; \
+ } \
+ return do_2misc_vec(s, a, gen_##INSN); \
+ }
+
+DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
+DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
+DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
+DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
+DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
+DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
+
+DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
+DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
+DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
+DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
+DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
+
+static bool trans_VSWP(DisasContext *s, arg_2misc *a)
+{
+ TCGv_i64 rm, rd;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->size != 0) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ rm = tcg_temp_new_i64();
+ rd = tcg_temp_new_i64();
+ for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
+ read_neon_element64(rm, a->vm, pass, MO_64);
+ read_neon_element64(rd, a->vd, pass, MO_64);
+ write_neon_element64(rm, a->vd, pass, MO_64);
+ write_neon_element64(rd, a->vm, pass, MO_64);
+ }
+ tcg_temp_free_i64(rm);
+ tcg_temp_free_i64(rd);
+
+ return true;
+}
+
+static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
+{
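+ /* e.g. bytes (msb first) t0 = {a3,a2,a1,a0}, t1 = {b3,b2,b1,b0} become t0 = {a2,b2,a0,b0}, t1 = {a3,b3,a1,b1}. */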
+ TCGv_i32 rd, tmp;
+
+ rd = tcg_temp_new_i32();
+ tmp = tcg_temp_new_i32();
+
+ tcg_gen_shli_i32(rd, t0, 8);
+ tcg_gen_andi_i32(rd, rd, 0xff00ff00);
+ tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
+ tcg_gen_or_i32(rd, rd, tmp);
+
+ tcg_gen_shri_i32(t1, t1, 8);
+ tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
+ tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
+ tcg_gen_or_i32(t1, t1, tmp);
+ tcg_gen_mov_i32(t0, rd);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(rd);
+}
+
+static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 rd, tmp;
+
+ rd = tcg_temp_new_i32();
+ tmp = tcg_temp_new_i32();
+
+ tcg_gen_shli_i32(rd, t0, 16);
+ tcg_gen_andi_i32(tmp, t1, 0xffff);
+ tcg_gen_or_i32(rd, rd, tmp);
+ tcg_gen_shri_i32(t1, t1, 16);
+ tcg_gen_andi_i32(tmp, t0, 0xffff0000);
+ tcg_gen_or_i32(t1, t1, tmp);
+ tcg_gen_mov_i32(t0, rd);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(rd);
+}
+
+static bool trans_VTRN(DisasContext *s, arg_2misc *a)
+{
+ TCGv_i32 tmp, tmp2;
+ int pass;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vd | a->vm) & a->q) {
+ return false;
+ }
+
+ if (a->size == 3) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ tmp2 = tcg_temp_new_i32();
+ if (a->size == MO_32) {
+ for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
+ read_neon_element32(tmp, a->vm, pass, MO_32);
+ read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
+ write_neon_element32(tmp2, a->vm, pass, MO_32);
+ write_neon_element32(tmp, a->vd, pass + 1, MO_32);
+ }
+ } else {
+ for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+ read_neon_element32(tmp, a->vm, pass, MO_32);
+ read_neon_element32(tmp2, a->vd, pass, MO_32);
+ if (a->size == MO_8) {
+ gen_neon_trn_u8(tmp, tmp2);
+ } else {
+ gen_neon_trn_u16(tmp, tmp2);
+ }
+ write_neon_element32(tmp2, a->vm, pass, MO_32);
+ write_neon_element32(tmp, a->vd, pass, MO_32);
+ }
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(tmp2);
+ return true;
+}
+
+static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_smmla_b);
+}
+
+static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_ummla_b);
+}
+
+static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
+{
+ if (!dc_isar_feature(aa32_i8mm, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_usmmla_b);
+}
+
+static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_bfmmla);
+}
+
+static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
+ gen_helper_gvec_bfmlal);
+}
+
+static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
+ (a->index << 1) | a->q, FPST_STD,
+ gen_helper_gvec_bfmlal_idx);
+}
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
new file mode 100644
index 0000000..7b87a9d
--- /dev/null
+++ b/target/arm/tcg/translate-sme.c
@@ -0,0 +1,373 @@
+/*
+ * AArch64 SME translation
+ *
+ * Copyright (c) 2022 Linaro, Ltd
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "translate.h"
+#include "exec/helper-gen.h"
+#include "translate-a64.h"
+#include "fpu/softfloat.h"
+
+
+/*
+ * Include the generated decoder.
+ */
+
+#include "decode-sme.c.inc"
+
+
+/*
+ * Resolve tile.size[index] to a host pointer, where tile and index
+ * are always decoded together, dependent on the element size.
+ */
+static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
+ int tile_index, bool vertical)
+{
+ int tile = tile_index >> (4 - esz);
+ int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
+ int pos, len, offset;
+ TCGv_i32 tmp;
+ TCGv_ptr addr;
+
+ /* Compute the final index, which is Rs+imm. */
+ tmp = tcg_temp_new_i32();
+ tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
+ tcg_gen_addi_i32(tmp, tmp, index);
+
+ /* Prepare a power-of-two modulo via extraction of @len bits. */
+ len = ctz32(streaming_vec_reg_size(s)) - esz;
+
+ if (vertical) {
+ /*
+ * Compute the byte offset of the index within the tile:
+ * (index % (svl / size)) * size
+ * = (index % (svl >> esz)) << esz
+ * Perform the power-of-two modulo via extraction of the low @len bits.
+ * Perform the multiply by shifting left by @pos bits.
+ * Perform these operations simultaneously via deposit into zero.
+ */
+ pos = esz;
+ tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
+
+ /*
+ * For big-endian, adjust the indexed column byte offset within
+ * the uint64_t host words that make up env->zarray[].
+ */
+ if (HOST_BIG_ENDIAN && esz < MO_64) {
+ tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
+ }
+ } else {
+ /*
+ * Compute the byte offset of the index within the tile:
+ * (index % (svl / size)) * (size * sizeof(row))
+ * = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
+ */
+ pos = esz + ctz32(sizeof(ARMVectorReg));
+ tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
+
+ /* Row slices are always aligned and need no endian adjustment. */
+ }
+
+ /* The tile byte offset within env->zarray is the row. */
+ offset = tile * sizeof(ARMVectorReg);
+
+ /* Include the byte offset of zarray to make this relative to env. */
+ offset += offsetof(CPUARMState, zarray);
+ tcg_gen_addi_i32(tmp, tmp, offset);
+
+ /* Add the byte offset to env to produce the final pointer. */
+ addr = tcg_temp_new_ptr();
+ tcg_gen_ext_i32_ptr(addr, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_add_ptr(addr, addr, cpu_env);
+
+ return addr;
+}
+
+static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
+{
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (sme_za_enabled_check(s)) {
+ gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
+ tcg_constant_i32(streaming_vec_reg_size(s)));
+ }
+ return true;
+}
+
+static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
+{
+ static gen_helper_gvec_4 * const h_fns[5] = {
+ gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
+ gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
+ gen_helper_sve_sel_zpzz_q
+ };
+ static gen_helper_gvec_3 * const cz_fns[5] = {
+ gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
+ gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
+ gen_helper_sme_mova_cz_q,
+ };
+ static gen_helper_gvec_3 * const zc_fns[5] = {
+ gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
+ gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
+ gen_helper_sme_mova_zc_q,
+ };
+
+ TCGv_ptr t_za, t_zr, t_pg;
+ TCGv_i32 t_desc;
+ int svl;
+
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
+ t_zr = vec_full_reg_ptr(s, a->zr);
+ t_pg = pred_full_reg_ptr(s, a->pg);
+
+ svl = streaming_vec_reg_size(s);
+ t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
+
+ if (a->v) {
+ /* Vertical slice -- use sme mova helpers. */
+ if (a->to_vec) {
+ zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
+ } else {
+ cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
+ }
+ } else {
+ /* Horizontal slice -- reuse sve sel helpers. */
+ if (a->to_vec) {
+ h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
+ } else {
+ h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
+ }
+ }
+
+ tcg_temp_free_ptr(t_za);
+ tcg_temp_free_ptr(t_zr);
+ tcg_temp_free_ptr(t_pg);
+
+ return true;
+}
+
+static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
+{
+ typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
+
+ /*
+ * Indexed by [esz][be][v][mte][st], which is (except for load/store)
+ * also the order in which the elements appear in the function names,
+ * and so how we must concatenate the pieces.
+ */
+
+#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
+#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) }
+#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) }
+#define FN_END(L, B) { FN_HV(L), FN_HV(B) }
+
+ static GenLdSt1 * const fns[5][2][2][2][2] = {
+ FN_END(b, b),
+ FN_END(h_le, h_be),
+ FN_END(s_le, s_be),
+ FN_END(d_le, d_be),
+ FN_END(q_le, q_be),
+ };
+
+#undef FN_LS
+#undef FN_MTE
+#undef FN_HV
+#undef FN_END
+
+ TCGv_ptr t_za, t_pg;
+ TCGv_i64 addr;
+ int svl, desc = 0;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
+ t_pg = pred_full_reg_ptr(s, a->pg);
+ addr = tcg_temp_new_i64();
+
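+ /* The effective address is Xn|SP + (Xm << esz). */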
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+
+ if (mte) {
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+ desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
+ desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
+ desc <<= SVE_MTEDESC_SHIFT;
+ } else {
+ addr = clean_data_tbi(s, addr);
+ }
+ svl = streaming_vec_reg_size(s);
+ desc = simd_desc(svl, svl, desc);
+
+ fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
+ tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_za);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_i64(addr);
+ return true;
+}
+
+typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
+
+static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
+{
+ int svl = streaming_vec_reg_size(s);
+ int imm = a->imm;
+ TCGv_ptr base;
+
+ if (!sme_za_enabled_check(s)) {
+ return true;
+ }
+
+ /* ZA[n] equates to ZA0H.B[n]. */
+ base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
+
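+ /* The same immediate both selects the ZA vector and scales the memory offset by SVL. */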
+ fn(s, base, 0, svl, a->rn, imm * svl);
+
+ tcg_temp_free_ptr(base);
+ return true;
+}
+
+TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
+TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
+
+static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
+ gen_helper_gvec_4 *fn)
+{
+ int svl = streaming_vec_reg_size(s);
+ uint32_t desc = simd_desc(svl, svl, 0);
+ TCGv_ptr za, zn, pn, pm;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ /* Sum XZR+zad to find ZAd. */
+ za = get_tile_rowcol(s, esz, 31, a->zad, false);
+ zn = vec_full_reg_ptr(s, a->zn);
+ pn = pred_full_reg_ptr(s, a->pn);
+ pm = pred_full_reg_ptr(s, a->pm);
+
+ fn(za, zn, pn, pm, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(za);
+ tcg_temp_free_ptr(zn);
+ tcg_temp_free_ptr(pn);
+ tcg_temp_free_ptr(pm);
+ return true;
+}
+
+TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
+TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
+TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
+TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
+
+static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
+ gen_helper_gvec_5 *fn)
+{
+ int svl = streaming_vec_reg_size(s);
+ uint32_t desc = simd_desc(svl, svl, a->sub);
+ TCGv_ptr za, zn, zm, pn, pm;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ /* Sum XZR+zad to find ZAd. */
+ za = get_tile_rowcol(s, esz, 31, a->zad, false);
+ zn = vec_full_reg_ptr(s, a->zn);
+ zm = vec_full_reg_ptr(s, a->zm);
+ pn = pred_full_reg_ptr(s, a->pn);
+ pm = pred_full_reg_ptr(s, a->pm);
+
+ fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(za);
+ tcg_temp_free_ptr(zn);
+ tcg_temp_free_ptr(zm);
+ tcg_temp_free_ptr(pn);
+ tcg_temp_free_ptr(pm);
+ return true;
+}
+
+static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
+ gen_helper_gvec_5_ptr *fn)
+{
+ int svl = streaming_vec_reg_size(s);
+ uint32_t desc = simd_desc(svl, svl, a->sub);
+ TCGv_ptr za, zn, zm, pn, pm, fpst;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ /* Sum XZR+zad to find ZAd. */
+ za = get_tile_rowcol(s, esz, 31, a->zad, false);
+ zn = vec_full_reg_ptr(s, a->zn);
+ zm = vec_full_reg_ptr(s, a->zm);
+ pn = pred_full_reg_ptr(s, a->pn);
+ pm = pred_full_reg_ptr(s, a->pm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+
+ fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(za);
+ tcg_temp_free_ptr(zn);
+ tcg_temp_free_ptr(zm);
+ tcg_temp_free_ptr(pn);
+ tcg_temp_free_ptr(pm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
+TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
+TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)
+
+/* TODO: FEAT_EBF16 */
+TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)
+
+TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
+TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
+TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
+TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)
+
+TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
+TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
+TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
+TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
new file mode 100644
index 0000000..621a2ab
--- /dev/null
+++ b/target/arm/tcg/translate-sve.c
@@ -0,0 +1,7583 @@
+/*
+ * AArch64 SVE translation
+ *
+ * Copyright (c) 2018 Linaro, Ltd
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "qemu/log.h"
+#include "arm_ldst.h"
+#include "translate.h"
+#include "internals.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+#include "exec/log.h"
+#include "translate-a64.h"
+#include "fpu/softfloat.h"
+
+
+typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t,
+ TCGv_i64, uint32_t, uint32_t);
+
+typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_ptr, TCGv_i32);
+
+typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32);
+typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i64, TCGv_i32);
+
+/*
+ * Helpers for extracting complex instruction fields.
+ */
+
+/* See e.g. ASR (immediate, predicated).
+ * Returns -1 for unallocated encoding; diagnose later.
+ */
+static int tszimm_esz(DisasContext *s, int x)
+{
+ x >>= 3; /* discard imm3 */
+ return 31 - clz32(x);
+}
+
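+/* The immediate encodes 2 * esize - shift, so recover the shift as (16 << esz) - x. */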
+static int tszimm_shr(DisasContext *s, int x)
+{
+ return (16 << tszimm_esz(s, x)) - x;
+}
+
+/* See e.g. LSL (immediate, predicated). */
+static int tszimm_shl(DisasContext *s, int x)
+{
+ return x - (8 << tszimm_esz(s, x));
+}
+
+/* The SH bit is in bit 8. Extract the low 8 and shift. */
+static inline int expand_imm_sh8s(DisasContext *s, int x)
+{
+ return (int8_t)x << (x & 0x100 ? 8 : 0);
+}
+
+static inline int expand_imm_sh8u(DisasContext *s, int x)
+{
+ return (uint8_t)x << (x & 0x100 ? 8 : 0);
+}
+
+/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype)
+ * with unsigned data. C.f. SVE Memory Contiguous Load Group.
+ */
+static inline int msz_dtype(DisasContext *s, int msz)
+{
+ static const uint8_t dtype[4] = { 0, 5, 10, 15 };
+ return dtype[msz];
+}
+
+/*
+ * Include the generated decoder.
+ */
+
+#include "decode-sve.c.inc"
+
+/*
+ * Implement all of the translator functions referenced by the decoder.
+ */
+
+/* Invoke an out-of-line helper on 2 Zregs. */
+static bool gen_gvec_ool_zz(DisasContext *s, gen_helper_gvec_2 *fn,
+ int rd, int rn, int data)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn,
+ int rd, int rn, int data,
+ ARMFPStatusFlavour flavour)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(flavour);
+
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ status, vsz, vsz, data, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn,
+ arg_rr_esz *a, int data)
+{
+ return gen_gvec_fpst_zz(s, fn, a->rd, a->rn, data,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+}
+
+/* Invoke an out-of-line helper on 3 Zregs. */
+static bool gen_gvec_ool_zzz(DisasContext *s, gen_helper_gvec_3 *fn,
+ int rd, int rn, int rm, int data)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_ool_arg_zzz(DisasContext *s, gen_helper_gvec_3 *fn,
+ arg_rrr_esz *a, int data)
+{
+ return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, data);
+}
+
+/* Invoke an out-of-line helper on 3 Zregs, plus float_status. */
+static bool gen_gvec_fpst_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn,
+ int rd, int rn, int rm,
+ int data, ARMFPStatusFlavour flavour)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(flavour);
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ status, vsz, vsz, data, fn);
+
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn,
+ arg_rrr_esz *a, int data)
+{
+ return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+}
+
+/* Invoke an out-of-line helper on 4 Zregs. */
+static bool gen_gvec_ool_zzzz(DisasContext *s, gen_helper_gvec_4 *fn,
+ int rd, int rn, int rm, int ra, int data)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, ra),
+ vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_ool_arg_zzzz(DisasContext *s, gen_helper_gvec_4 *fn,
+ arg_rrrr_esz *a, int data)
+{
+ return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data);
+}
+
+static bool gen_gvec_ool_arg_zzxz(DisasContext *s, gen_helper_gvec_4 *fn,
+ arg_rrxr_esz *a)
+{
+ return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index);
+}
+
+/* Invoke an out-of-line helper on 4 Zregs, plus a pointer. */
+static bool gen_gvec_ptr_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn,
+ int rd, int rn, int rm, int ra,
+ int data, TCGv_ptr ptr)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, ra),
+ ptr, vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn,
+ int rd, int rn, int rm, int ra,
+ int data, ARMFPStatusFlavour flavour)
+{
+ TCGv_ptr status = fpstatus_ptr(flavour);
+ bool ret = gen_gvec_ptr_zzzz(s, fn, rd, rn, rm, ra, data, status);
+ tcg_temp_free_ptr(status);
+ return ret;
+}
+
+/* Invoke an out-of-line helper on 4 Zregs, 1 Preg, plus fpst. */
+static bool gen_gvec_fpst_zzzzp(DisasContext *s, gen_helper_gvec_5_ptr *fn,
+ int rd, int rn, int rm, int ra, int pg,
+ int data, ARMFPStatusFlavour flavour)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(flavour);
+
+ tcg_gen_gvec_5_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ vec_full_reg_offset(s, ra),
+ pred_full_reg_offset(s, pg),
+ status, vsz, vsz, data, fn);
+
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+/* Invoke an out-of-line helper on 2 Zregs and a predicate. */
+static bool gen_gvec_ool_zzp(DisasContext *s, gen_helper_gvec_3 *fn,
+ int rd, int rn, int pg, int data)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ pred_full_reg_offset(s, pg),
+ vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_ool_arg_zpz(DisasContext *s, gen_helper_gvec_3 *fn,
+ arg_rpr_esz *a, int data)
+{
+ return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, data);
+}
+
+static bool gen_gvec_ool_arg_zpzi(DisasContext *s, gen_helper_gvec_3 *fn,
+ arg_rpri_esz *a)
+{
+ return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, a->imm);
+}
+
+static bool gen_gvec_fpst_zzp(DisasContext *s, gen_helper_gvec_3_ptr *fn,
+ int rd, int rn, int pg, int data,
+ ARMFPStatusFlavour flavour)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(flavour);
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ pred_full_reg_offset(s, pg),
+ status, vsz, vsz, data, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_arg_zpz(DisasContext *s, gen_helper_gvec_3_ptr *fn,
+ arg_rpr_esz *a, int data,
+ ARMFPStatusFlavour flavour)
+{
+ return gen_gvec_fpst_zzp(s, fn, a->rd, a->rn, a->pg, data, flavour);
+}
+
+/* Invoke an out-of-line helper on 3 Zregs and a predicate. */
+static bool gen_gvec_ool_zzzp(DisasContext *s, gen_helper_gvec_4 *fn,
+ int rd, int rn, int rm, int pg, int data)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ pred_full_reg_offset(s, pg),
+ vsz, vsz, data, fn);
+ }
+ return true;
+}
+
+static bool gen_gvec_ool_arg_zpzz(DisasContext *s, gen_helper_gvec_4 *fn,
+ arg_rprr_esz *a, int data)
+{
+ return gen_gvec_ool_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, data);
+}
+
+/* Invoke an out-of-line helper on 3 Zregs and a predicate. */
+static bool gen_gvec_fpst_zzzp(DisasContext *s, gen_helper_gvec_4_ptr *fn,
+ int rd, int rn, int rm, int pg, int data,
+ ARMFPStatusFlavour flavour)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(flavour);
+
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ pred_full_reg_offset(s, pg),
+ status, vsz, vsz, data, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool gen_gvec_fpst_arg_zpzz(DisasContext *s, gen_helper_gvec_4_ptr *fn,
+ arg_rprr_esz *a)
+{
+ return gen_gvec_fpst_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, 0,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+}
+
+/* Invoke a vector expander on two Zregs and an immediate. */
+static bool gen_gvec_fn_zzi(DisasContext *s, GVecGen2iFn *gvec_fn,
+ int esz, int rd, int rn, uint64_t imm)
+{
+ if (gvec_fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ gvec_fn(esz, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn), imm, vsz, vsz);
+ }
+ return true;
+}
+
+static bool gen_gvec_fn_arg_zzi(DisasContext *s, GVecGen2iFn *gvec_fn,
+ arg_rri_esz *a)
+{
+ if (a->esz < 0) {
+ /* Invalid tsz encoding -- see tszimm_esz. */
+ return false;
+ }
+ return gen_gvec_fn_zzi(s, gvec_fn, a->esz, a->rd, a->rn, a->imm);
+}
+
+/* Invoke a vector expander on three Zregs. */
+static bool gen_gvec_fn_zzz(DisasContext *s, GVecGen3Fn *gvec_fn,
+ int esz, int rd, int rn, int rm)
+{
+ if (gvec_fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ gvec_fn(esz, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), vsz, vsz);
+ }
+ return true;
+}
+
+static bool gen_gvec_fn_arg_zzz(DisasContext *s, GVecGen3Fn *fn,
+ arg_rrr_esz *a)
+{
+ return gen_gvec_fn_zzz(s, fn, a->esz, a->rd, a->rn, a->rm);
+}
+
+/* Invoke a vector expander on four Zregs. */
+static bool gen_gvec_fn_arg_zzzz(DisasContext *s, GVecGen4Fn *gvec_fn,
+ arg_rrrr_esz *a)
+{
+ if (gvec_fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ gvec_fn(a->esz, vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vec_full_reg_offset(s, a->ra), vsz, vsz);
+ }
+ return true;
+}
+
+/* Invoke a vector move on two Zregs. */
+static bool do_mov_z(DisasContext *s, int rd, int rn)
+{
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_mov(MO_8, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn), vsz, vsz);
+ }
+ return true;
+}
+
+/* Initialize a Zreg with replications of a 64-bit immediate. */
+static void do_dupi_z(DisasContext *s, int rd, uint64_t word)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word);
+}
+
+/* Invoke a vector expander on three Pregs. */
+static bool gen_gvec_fn_ppp(DisasContext *s, GVecGen3Fn *gvec_fn,
+ int rd, int rn, int rm)
+{
+ if (sve_access_check(s)) {
+ unsigned psz = pred_gvec_reg_size(s);
+ gvec_fn(MO_64, pred_full_reg_offset(s, rd),
+ pred_full_reg_offset(s, rn),
+ pred_full_reg_offset(s, rm), psz, psz);
+ }
+ return true;
+}
+
+/* Invoke a vector move on two Pregs. */
+static bool do_mov_p(DisasContext *s, int rd, int rn)
+{
+ if (sve_access_check(s)) {
+ unsigned psz = pred_gvec_reg_size(s);
+ tcg_gen_gvec_mov(MO_8, pred_full_reg_offset(s, rd),
+ pred_full_reg_offset(s, rn), psz, psz);
+ }
+ return true;
+}
+
+/* Set the cpu flags as per a return from an SVE helper. */
+static void do_pred_flags(TCGv_i32 t)
+{
+ tcg_gen_mov_i32(cpu_NF, t);
+ tcg_gen_andi_i32(cpu_ZF, t, 2);
+ tcg_gen_andi_i32(cpu_CF, t, 1);
+ tcg_gen_movi_i32(cpu_VF, 0);
+}
+
+/* Subroutines computing the ARM PredTest pseudofunction. */
+static void do_predtest1(TCGv_i64 d, TCGv_i64 g)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ gen_helper_sve_predtest1(t, d, g);
+ do_pred_flags(t);
+ tcg_temp_free_i32(t);
+}
+
+static void do_predtest(DisasContext *s, int dofs, int gofs, int words)
+{
+ TCGv_ptr dptr = tcg_temp_new_ptr();
+ TCGv_ptr gptr = tcg_temp_new_ptr();
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_addi_ptr(dptr, cpu_env, dofs);
+ tcg_gen_addi_ptr(gptr, cpu_env, gofs);
+
+ gen_helper_sve_predtest(t, dptr, gptr, tcg_constant_i32(words));
+ tcg_temp_free_ptr(dptr);
+ tcg_temp_free_ptr(gptr);
+
+ do_pred_flags(t);
+ tcg_temp_free_i32(t);
+}
+
+/* For each element size, the bits within a predicate word that are active. */
+const uint64_t pred_esz_masks[5] = {
+ 0xffffffffffffffffull, 0x5555555555555555ull,
+ 0x1111111111111111ull, 0x0101010101010101ull,
+ 0x0001000100010001ull,
+};
+
+static bool trans_INVALID(DisasContext *s, arg_INVALID *a)
+{
+ unallocated_encoding(s);
+ return true;
+}
+
+/*
+ *** SVE Logical - Unpredicated Group
+ */
+
+TRANS_FEAT(AND_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_and, a)
+TRANS_FEAT(ORR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_or, a)
+TRANS_FEAT(EOR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_xor, a)
+TRANS_FEAT(BIC_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_andc, a)
+
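+/*
+ * XAR with 8-bit elements: rotate each byte of n ^ m right by sh,
+ * synthesized with shifts and masks since there is no per-byte rotate op.
+ */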
+static void gen_xar8_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+ uint64_t mask = dup_const(MO_8, 0xff >> sh);
+
+ tcg_gen_xor_i64(t, n, m);
+ tcg_gen_shri_i64(d, t, sh);
+ tcg_gen_shli_i64(t, t, 8 - sh);
+ tcg_gen_andi_i64(d, d, mask);
+ tcg_gen_andi_i64(t, t, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_xar16_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+ uint64_t mask = dup_const(MO_16, 0xffff >> sh);
+
+ tcg_gen_xor_i64(t, n, m);
+ tcg_gen_shri_i64(d, t, sh);
+ tcg_gen_shli_i64(t, t, 16 - sh);
+ tcg_gen_andi_i64(d, d, mask);
+ tcg_gen_andi_i64(t, t, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_xar_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, int32_t sh)
+{
+ tcg_gen_xor_i32(d, n, m);
+ tcg_gen_rotri_i32(d, d, sh);
+}
+
+static void gen_xar_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh)
+{
+ tcg_gen_xor_i64(d, n, m);
+ tcg_gen_rotri_i64(d, d, sh);
+}
+
+static void gen_xar_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, int64_t sh)
+{
+ tcg_gen_xor_vec(vece, d, n, m);
+ tcg_gen_rotri_vec(vece, d, d, sh);
+}
+
+void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, int64_t shift,
+ uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop[] = { INDEX_op_rotli_vec, 0 };
+ static const GVecGen3i ops[4] = {
+ { .fni8 = gen_xar8_i64,
+ .fniv = gen_xar_vec,
+ .fno = gen_helper_sve2_xar_b,
+ .opt_opc = vecop,
+ .vece = MO_8 },
+ { .fni8 = gen_xar16_i64,
+ .fniv = gen_xar_vec,
+ .fno = gen_helper_sve2_xar_h,
+ .opt_opc = vecop,
+ .vece = MO_16 },
+ { .fni4 = gen_xar_i32,
+ .fniv = gen_xar_vec,
+ .fno = gen_helper_sve2_xar_s,
+ .opt_opc = vecop,
+ .vece = MO_32 },
+ { .fni8 = gen_xar_i64,
+ .fniv = gen_xar_vec,
+ .fno = gen_helper_gvec_xar_d,
+ .opt_opc = vecop,
+ .vece = MO_64 }
+ };
+ int esize = 8 << vece;
+
+ /* The SVE2 range is 1 .. esize; the AdvSIMD range is 0 .. esize-1. */
+ tcg_debug_assert(shift >= 0);
+ tcg_debug_assert(shift <= esize);
+ shift &= esize - 1;
+
+ if (shift == 0) {
+ /* xar with no rotate devolves to xor. */
+ tcg_gen_gvec_xor(vece, rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz);
+ } else {
+ tcg_gen_gvec_3i(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz,
+ shift, &ops[vece]);
+ }
+}
+
+static bool trans_XAR(DisasContext *s, arg_rrri_esz *a)
+{
+ if (a->esz < 0 || !dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ gen_gvec_xar(a->esz, vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm), a->imm, vsz, vsz);
+ }
+ return true;
+}
+
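+/* EOR3: three-way exclusive OR, d = n ^ m ^ k. */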
+static void gen_eor3_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k)
+{
+ tcg_gen_xor_i64(d, n, m);
+ tcg_gen_xor_i64(d, d, k);
+}
+
+static void gen_eor3_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec k)
+{
+ tcg_gen_xor_vec(vece, d, n, m);
+ tcg_gen_xor_vec(vece, d, d, k);
+}
+
+static void gen_eor3(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_eor3_i64,
+ .fniv = gen_eor3_vec,
+ .fno = gen_helper_sve2_eor3,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
+}
+
+TRANS_FEAT(EOR3, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_eor3, a)
+
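+/* BCAX: bitwise clear and exclusive OR, d = n ^ (m & ~k). */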
+static void gen_bcax_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k)
+{
+ tcg_gen_andc_i64(d, m, k);
+ tcg_gen_xor_i64(d, d, n);
+}
+
+static void gen_bcax_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec k)
+{
+ tcg_gen_andc_vec(vece, d, m, k);
+ tcg_gen_xor_vec(vece, d, d, n);
+}
+
+static void gen_bcax(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_bcax_i64,
+ .fniv = gen_bcax_vec,
+ .fno = gen_helper_sve2_bcax,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
+}
+
+TRANS_FEAT(BCAX, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bcax, a)
+
+static void gen_bsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ /* BSL differs from the generic bitsel in argument ordering. */
+ tcg_gen_gvec_bitsel(vece, d, a, n, m, oprsz, maxsz);
+}
+
+TRANS_FEAT(BSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl, a)
+
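+/* BSL1N: as BSL but with the first operand inverted, d = (~n & k) | (m & ~k). */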
+static void gen_bsl1n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k)
+{
+ tcg_gen_andc_i64(n, k, n);
+ tcg_gen_andc_i64(m, m, k);
+ tcg_gen_or_i64(d, n, m);
+}
+
+static void gen_bsl1n_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec k)
+{
+ if (TCG_TARGET_HAS_bitsel_vec) {
+ tcg_gen_not_vec(vece, n, n);
+ tcg_gen_bitsel_vec(vece, d, k, n, m);
+ } else {
+ tcg_gen_andc_vec(vece, n, k, n);
+ tcg_gen_andc_vec(vece, m, m, k);
+ tcg_gen_or_vec(vece, d, n, m);
+ }
+}
+
+static void gen_bsl1n(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_bsl1n_i64,
+ .fniv = gen_bsl1n_vec,
+ .fno = gen_helper_sve2_bsl1n,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
+}
+
+TRANS_FEAT(BSL1N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl1n, a)
+
+static void gen_bsl2n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k)
+{
+ /*
+ * Z[dn] = (n & k) | (~m & ~k)
+ *       = (n & k) | ~(m | k)
+ */
+ tcg_gen_and_i64(n, n, k);
+ if (TCG_TARGET_HAS_orc_i64) {
+ tcg_gen_or_i64(m, m, k);
+ tcg_gen_orc_i64(d, n, m);
+ } else {
+ tcg_gen_nor_i64(m, m, k);
+ tcg_gen_or_i64(d, n, m);
+ }
+}
+
+static void gen_bsl2n_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec k)
+{
+ if (TCG_TARGET_HAS_bitsel_vec) {
+ tcg_gen_not_vec(vece, m, m);
+ tcg_gen_bitsel_vec(vece, d, k, n, m);
+ } else {
+ tcg_gen_and_vec(vece, n, n, k);
+ tcg_gen_or_vec(vece, m, m, k);
+ tcg_gen_orc_vec(vece, d, n, m);
+ }
+}
+
+static void gen_bsl2n(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_bsl2n_i64,
+ .fniv = gen_bsl2n_vec,
+ .fno = gen_helper_sve2_bsl2n,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
+}
+
+TRANS_FEAT(BSL2N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl2n, a)
+
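+/* NBSL: the inverse of BSL, d = ~((n & k) | (m & ~k)). */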
+static void gen_nbsl_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k)
+{
+ tcg_gen_and_i64(n, n, k);
+ tcg_gen_andc_i64(m, m, k);
+ tcg_gen_nor_i64(d, n, m);
+}
+
+static void gen_nbsl_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec k)
+{
+ tcg_gen_bitsel_vec(vece, d, k, n, m);
+ tcg_gen_not_vec(vece, d, d);
+}
+
+static void gen_nbsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_nbsl_i64,
+ .fniv = gen_nbsl_vec,
+ .fno = gen_helper_sve2_nbsl,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
+}
+
+TRANS_FEAT(NBSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_nbsl, a)
+
+/*
+ *** SVE Integer Arithmetic - Unpredicated Group
+ */
+
+TRANS_FEAT(ADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_add, a)
+TRANS_FEAT(SUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sub, a)
+TRANS_FEAT(SQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ssadd, a)
+TRANS_FEAT(SQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sssub, a)
+TRANS_FEAT(UQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_usadd, a)
+TRANS_FEAT(UQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ussub, a)
+
+/*
+ *** SVE Integer Arithmetic - Binary Predicated Group
+ */
+
+/* Select active elements from Zn and inactive elements from Zm,
+ * storing the result in Zd.
+ */
+static bool do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz)
+{
+ static gen_helper_gvec_4 * const fns[4] = {
+ gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
+ gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d
+ };
+ return gen_gvec_ool_zzzp(s, fns[esz], rd, rn, rm, pg, 0);
+}
+
+#define DO_ZPZZ(NAME, FEAT, name) \
+ static gen_helper_gvec_4 * const name##_zpzz_fns[4] = { \
+ gen_helper_##name##_zpzz_b, gen_helper_##name##_zpzz_h, \
+ gen_helper_##name##_zpzz_s, gen_helper_##name##_zpzz_d, \
+ }; \
+ TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpzz, \
+ name##_zpzz_fns[a->esz], a, 0)
+
+DO_ZPZZ(AND_zpzz, aa64_sve, sve_and)
+DO_ZPZZ(EOR_zpzz, aa64_sve, sve_eor)
+DO_ZPZZ(ORR_zpzz, aa64_sve, sve_orr)
+DO_ZPZZ(BIC_zpzz, aa64_sve, sve_bic)
+
+DO_ZPZZ(ADD_zpzz, aa64_sve, sve_add)
+DO_ZPZZ(SUB_zpzz, aa64_sve, sve_sub)
+
+DO_ZPZZ(SMAX_zpzz, aa64_sve, sve_smax)
+DO_ZPZZ(UMAX_zpzz, aa64_sve, sve_umax)
+DO_ZPZZ(SMIN_zpzz, aa64_sve, sve_smin)
+DO_ZPZZ(UMIN_zpzz, aa64_sve, sve_umin)
+DO_ZPZZ(SABD_zpzz, aa64_sve, sve_sabd)
+DO_ZPZZ(UABD_zpzz, aa64_sve, sve_uabd)
+
+DO_ZPZZ(MUL_zpzz, aa64_sve, sve_mul)
+DO_ZPZZ(SMULH_zpzz, aa64_sve, sve_smulh)
+DO_ZPZZ(UMULH_zpzz, aa64_sve, sve_umulh)
+
+DO_ZPZZ(ASR_zpzz, aa64_sve, sve_asr)
+DO_ZPZZ(LSR_zpzz, aa64_sve, sve_lsr)
+DO_ZPZZ(LSL_zpzz, aa64_sve, sve_lsl)
+
+static gen_helper_gvec_4 * const sdiv_fns[4] = {
+ NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d
+};
+TRANS_FEAT(SDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, sdiv_fns[a->esz], a, 0)
+
+static gen_helper_gvec_4 * const udiv_fns[4] = {
+ NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d
+};
+TRANS_FEAT(UDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, udiv_fns[a->esz], a, 0)
+
+TRANS_FEAT(SEL_zpzz, aa64_sve, do_sel_z, a->rd, a->rn, a->rm, a->pg, a->esz)
+
+/*
+ *** SVE Integer Arithmetic - Unary Predicated Group
+ */
+
+#define DO_ZPZ(NAME, FEAT, name) \
+ static gen_helper_gvec_3 * const name##_fns[4] = { \
+ gen_helper_##name##_b, gen_helper_##name##_h, \
+ gen_helper_##name##_s, gen_helper_##name##_d, \
+ }; \
+ TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpz, name##_fns[a->esz], a, 0)
+
+DO_ZPZ(CLS, aa64_sve, sve_cls)
+DO_ZPZ(CLZ, aa64_sve, sve_clz)
+DO_ZPZ(CNT_zpz, aa64_sve, sve_cnt_zpz)
+DO_ZPZ(CNOT, aa64_sve, sve_cnot)
+DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz)
+DO_ZPZ(ABS, aa64_sve, sve_abs)
+DO_ZPZ(NEG, aa64_sve, sve_neg)
+DO_ZPZ(RBIT, aa64_sve, sve_rbit)
+
+static gen_helper_gvec_3 * const fabs_fns[4] = {
+ NULL, gen_helper_sve_fabs_h,
+ gen_helper_sve_fabs_s, gen_helper_sve_fabs_d,
+};
+TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, fabs_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const fneg_fns[4] = {
+ NULL, gen_helper_sve_fneg_h,
+ gen_helper_sve_fneg_s, gen_helper_sve_fneg_d,
+};
+TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, fneg_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const sxtb_fns[4] = {
+ NULL, gen_helper_sve_sxtb_h,
+ gen_helper_sve_sxtb_s, gen_helper_sve_sxtb_d,
+};
+TRANS_FEAT(SXTB, aa64_sve, gen_gvec_ool_arg_zpz, sxtb_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const uxtb_fns[4] = {
+ NULL, gen_helper_sve_uxtb_h,
+ gen_helper_sve_uxtb_s, gen_helper_sve_uxtb_d,
+};
+TRANS_FEAT(UXTB, aa64_sve, gen_gvec_ool_arg_zpz, uxtb_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const sxth_fns[4] = {
+ NULL, NULL, gen_helper_sve_sxth_s, gen_helper_sve_sxth_d
+};
+TRANS_FEAT(SXTH, aa64_sve, gen_gvec_ool_arg_zpz, sxth_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const uxth_fns[4] = {
+ NULL, NULL, gen_helper_sve_uxth_s, gen_helper_sve_uxth_d
+};
+TRANS_FEAT(UXTH, aa64_sve, gen_gvec_ool_arg_zpz, uxth_fns[a->esz], a, 0)
+
+TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz,
+ a->esz == 3 ? gen_helper_sve_sxtw_d : NULL, a, 0)
+TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz,
+ a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0)
+
+/*
+ *** SVE Integer Reduction Group
+ */
+
+typedef void gen_helper_gvec_reduc(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_i32);
+static bool do_vpz_ool(DisasContext *s, arg_rpr_esz *a,
+ gen_helper_gvec_reduc *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_zn, t_pg;
+ TCGv_i32 desc;
+ TCGv_i64 temp;
+
+ if (fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+ temp = tcg_temp_new_i64();
+ t_zn = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ fn(temp, t_zn, t_pg, desc);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_pg);
+
+ write_fp_dreg(s, a->rd, temp);
+ tcg_temp_free_i64(temp);
+ return true;
+}
+
+#define DO_VPZ(NAME, name) \
+ static gen_helper_gvec_reduc * const name##_fns[4] = { \
+ gen_helper_sve_##name##_b, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, do_vpz_ool, a, name##_fns[a->esz])
+
+DO_VPZ(ORV, orv)
+DO_VPZ(ANDV, andv)
+DO_VPZ(EORV, eorv)
+
+DO_VPZ(UADDV, uaddv)
+DO_VPZ(SMAXV, smaxv)
+DO_VPZ(UMAXV, umaxv)
+DO_VPZ(SMINV, sminv)
+DO_VPZ(UMINV, uminv)
+
+static gen_helper_gvec_reduc * const saddv_fns[4] = {
+ gen_helper_sve_saddv_b, gen_helper_sve_saddv_h,
+ gen_helper_sve_saddv_s, NULL
+};
+TRANS_FEAT(SADDV, aa64_sve, do_vpz_ool, a, saddv_fns[a->esz])
+
+#undef DO_VPZ
+
+/*
+ *** SVE Shift by Immediate - Predicated Group
+ */
+
+/*
+ * Copy Zn into Zd, storing zeros into inactive elements.
+ * If invert, store zeros into the active elements.
+ */
+static bool do_movz_zpz(DisasContext *s, int rd, int rn, int pg,
+ int esz, bool invert)
+{
+ static gen_helper_gvec_3 * const fns[4] = {
+ gen_helper_sve_movz_b, gen_helper_sve_movz_h,
+ gen_helper_sve_movz_s, gen_helper_sve_movz_d,
+ };
+ return gen_gvec_ool_zzp(s, fns[esz], rd, rn, pg, invert);
+}
+
+static bool do_shift_zpzi(DisasContext *s, arg_rpri_esz *a, bool asr,
+ gen_helper_gvec_3 * const fns[4])
+{
+ int max;
+
+ if (a->esz < 0) {
+ /* Invalid tsz encoding -- see tszimm_esz. */
+ return false;
+ }
+
+ /*
+ * Shift by element size is architecturally valid.
+ * For arithmetic right-shift, it's the same as by one less.
+ * For logical shifts and ASRD, it is a zeroing operation.
+ */
+ max = 8 << a->esz;
+ if (a->imm >= max) {
+ if (asr) {
+ a->imm = max - 1;
+ } else {
+ return do_movz_zpz(s, a->rd, a->rd, a->pg, a->esz, true);
+ }
+ }
+ return gen_gvec_ool_arg_zpzi(s, fns[a->esz], a);
+}
+
+static gen_helper_gvec_3 * const asr_zpzi_fns[4] = {
+ gen_helper_sve_asr_zpzi_b, gen_helper_sve_asr_zpzi_h,
+ gen_helper_sve_asr_zpzi_s, gen_helper_sve_asr_zpzi_d,
+};
+TRANS_FEAT(ASR_zpzi, aa64_sve, do_shift_zpzi, a, true, asr_zpzi_fns)
+
+static gen_helper_gvec_3 * const lsr_zpzi_fns[4] = {
+ gen_helper_sve_lsr_zpzi_b, gen_helper_sve_lsr_zpzi_h,
+ gen_helper_sve_lsr_zpzi_s, gen_helper_sve_lsr_zpzi_d,
+};
+TRANS_FEAT(LSR_zpzi, aa64_sve, do_shift_zpzi, a, false, lsr_zpzi_fns)
+
+static gen_helper_gvec_3 * const lsl_zpzi_fns[4] = {
+ gen_helper_sve_lsl_zpzi_b, gen_helper_sve_lsl_zpzi_h,
+ gen_helper_sve_lsl_zpzi_s, gen_helper_sve_lsl_zpzi_d,
+};
+TRANS_FEAT(LSL_zpzi, aa64_sve, do_shift_zpzi, a, false, lsl_zpzi_fns)
+
+static gen_helper_gvec_3 * const asrd_fns[4] = {
+ gen_helper_sve_asrd_b, gen_helper_sve_asrd_h,
+ gen_helper_sve_asrd_s, gen_helper_sve_asrd_d,
+};
+TRANS_FEAT(ASRD, aa64_sve, do_shift_zpzi, a, false, asrd_fns)
+
+static gen_helper_gvec_3 * const sqshl_zpzi_fns[4] = {
+ gen_helper_sve2_sqshl_zpzi_b, gen_helper_sve2_sqshl_zpzi_h,
+ gen_helper_sve2_sqshl_zpzi_s, gen_helper_sve2_sqshl_zpzi_d,
+};
+TRANS_FEAT(SQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi,
+ a->esz < 0 ? NULL : sqshl_zpzi_fns[a->esz], a)
+
+static gen_helper_gvec_3 * const uqshl_zpzi_fns[4] = {
+ gen_helper_sve2_uqshl_zpzi_b, gen_helper_sve2_uqshl_zpzi_h,
+ gen_helper_sve2_uqshl_zpzi_s, gen_helper_sve2_uqshl_zpzi_d,
+};
+TRANS_FEAT(UQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi,
+ a->esz < 0 ? NULL : uqshl_zpzi_fns[a->esz], a)
+
+static gen_helper_gvec_3 * const srshr_fns[4] = {
+ gen_helper_sve2_srshr_b, gen_helper_sve2_srshr_h,
+ gen_helper_sve2_srshr_s, gen_helper_sve2_srshr_d,
+};
+TRANS_FEAT(SRSHR, aa64_sve2, gen_gvec_ool_arg_zpzi,
+ a->esz < 0 ? NULL : srshr_fns[a->esz], a)
+
+static gen_helper_gvec_3 * const urshr_fns[4] = {
+ gen_helper_sve2_urshr_b, gen_helper_sve2_urshr_h,
+ gen_helper_sve2_urshr_s, gen_helper_sve2_urshr_d,
+};
+TRANS_FEAT(URSHR, aa64_sve2, gen_gvec_ool_arg_zpzi,
+ a->esz < 0 ? NULL : urshr_fns[a->esz], a)
+
+static gen_helper_gvec_3 * const sqshlu_fns[4] = {
+ gen_helper_sve2_sqshlu_b, gen_helper_sve2_sqshlu_h,
+ gen_helper_sve2_sqshlu_s, gen_helper_sve2_sqshlu_d,
+};
+TRANS_FEAT(SQSHLU, aa64_sve2, gen_gvec_ool_arg_zpzi,
+ a->esz < 0 ? NULL : sqshlu_fns[a->esz], a)
+
+/*
+ *** SVE Bitwise Shift - Predicated Group
+ */
+
+#define DO_ZPZW(NAME, name) \
+ static gen_helper_gvec_4 * const name##_zpzw_fns[4] = { \
+ gen_helper_sve_##name##_zpzw_b, gen_helper_sve_##name##_zpzw_h, \
+ gen_helper_sve_##name##_zpzw_s, NULL \
+ }; \
+ TRANS_FEAT(NAME##_zpzw, aa64_sve, gen_gvec_ool_arg_zpzz, \
+ a->esz < 0 ? NULL : name##_zpzw_fns[a->esz], a, 0)
+
+DO_ZPZW(ASR, asr)
+DO_ZPZW(LSR, lsr)
+DO_ZPZW(LSL, lsl)
+
+#undef DO_ZPZW
+
+/*
+ *** SVE Bitwise Shift - Unpredicated Group
+ */
+
+static bool do_shift_imm(DisasContext *s, arg_rri_esz *a, bool asr,
+ void (*gvec_fn)(unsigned, uint32_t, uint32_t,
+ int64_t, uint32_t, uint32_t))
+{
+ if (a->esz < 0) {
+ /* Invalid tsz encoding -- see tszimm_esz. */
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ /* Shift by element size is architecturally valid. For
+ arithmetic right-shift, it's the same as by one less.
+ Otherwise it is a zeroing operation. */
+ if (a->imm >= 8 << a->esz) {
+ if (asr) {
+ a->imm = (8 << a->esz) - 1;
+ } else {
+ do_dupi_z(s, a->rd, 0);
+ return true;
+ }
+ }
+ gvec_fn(a->esz, vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz);
+ }
+ return true;
+}
+
+TRANS_FEAT(ASR_zzi, aa64_sve, do_shift_imm, a, true, tcg_gen_gvec_sari)
+TRANS_FEAT(LSR_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shri)
+TRANS_FEAT(LSL_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shli)
+
+#define DO_ZZW(NAME, name) \
+ static gen_helper_gvec_3 * const name##_zzw_fns[4] = { \
+ gen_helper_sve_##name##_zzw_b, gen_helper_sve_##name##_zzw_h, \
+ gen_helper_sve_##name##_zzw_s, NULL \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_arg_zzz, \
+ name##_zzw_fns[a->esz], a, 0)
+
+DO_ZZW(ASR_zzw, asr)
+DO_ZZW(LSR_zzw, lsr)
+DO_ZZW(LSL_zzw, lsl)
+
+#undef DO_ZZW
+
+/*
+ *** SVE Integer Multiply-Add Group
+ */
+
+static bool do_zpzzz_ool(DisasContext *s, arg_rprrr_esz *a,
+ gen_helper_gvec_5 *fn)
+{
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_5_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->ra),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ pred_full_reg_offset(s, a->pg),
+ vsz, vsz, 0, fn);
+ }
+ return true;
+}
+
+static gen_helper_gvec_5 * const mla_fns[4] = {
+ gen_helper_sve_mla_b, gen_helper_sve_mla_h,
+ gen_helper_sve_mla_s, gen_helper_sve_mla_d,
+};
+TRANS_FEAT(MLA, aa64_sve, do_zpzzz_ool, a, mla_fns[a->esz])
+
+static gen_helper_gvec_5 * const mls_fns[4] = {
+ gen_helper_sve_mls_b, gen_helper_sve_mls_h,
+ gen_helper_sve_mls_s, gen_helper_sve_mls_d,
+};
+TRANS_FEAT(MLS, aa64_sve, do_zpzzz_ool, a, mls_fns[a->esz])
+
+/*
+ *** SVE Index Generation Group
+ */
+
+static bool do_index(DisasContext *s, int esz, int rd,
+ TCGv_i64 start, TCGv_i64 incr)
+{
+ unsigned vsz;
+ TCGv_i32 desc;
+ TCGv_ptr t_zd;
+
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vsz = vec_full_reg_size(s);
+ desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+ t_zd = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd));
+ if (esz == 3) {
+ gen_helper_sve_index_d(t_zd, start, incr, desc);
+ } else {
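+ /* The B/H/S helpers take the start and increment as 32-bit values. */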
+ typedef void index_fn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
+ static index_fn * const fns[3] = {
+ gen_helper_sve_index_b,
+ gen_helper_sve_index_h,
+ gen_helper_sve_index_s,
+ };
+ TCGv_i32 s32 = tcg_temp_new_i32();
+ TCGv_i32 i32 = tcg_temp_new_i32();
+
+ tcg_gen_extrl_i64_i32(s32, start);
+ tcg_gen_extrl_i64_i32(i32, incr);
+ fns[esz](t_zd, s32, i32, desc);
+
+ tcg_temp_free_i32(s32);
+ tcg_temp_free_i32(i32);
+ }
+ tcg_temp_free_ptr(t_zd);
+ return true;
+}
+
+TRANS_FEAT(INDEX_ii, aa64_sve, do_index, a->esz, a->rd,
+ tcg_constant_i64(a->imm1), tcg_constant_i64(a->imm2))
+TRANS_FEAT(INDEX_ir, aa64_sve, do_index, a->esz, a->rd,
+ tcg_constant_i64(a->imm), cpu_reg(s, a->rm))
+TRANS_FEAT(INDEX_ri, aa64_sve, do_index, a->esz, a->rd,
+ cpu_reg(s, a->rn), tcg_constant_i64(a->imm))
+TRANS_FEAT(INDEX_rr, aa64_sve, do_index, a->esz, a->rd,
+ cpu_reg(s, a->rn), cpu_reg(s, a->rm))
+
+/*
+ *** SVE Stack Allocation Group
+ */
+
+static bool trans_ADDVL(DisasContext *s, arg_ADDVL *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+ TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+ tcg_gen_addi_i64(rd, rn, a->imm * vec_full_reg_size(s));
+ }
+ return true;
+}
+
+static bool trans_ADDSVL(DisasContext *s, arg_ADDSVL *a)
+{
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (sme_enabled_check(s)) {
+ TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+ TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+ tcg_gen_addi_i64(rd, rn, a->imm * streaming_vec_reg_size(s));
+ }
+ return true;
+}
+
+static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+ TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+ tcg_gen_addi_i64(rd, rn, a->imm * pred_full_reg_size(s));
+ }
+ return true;
+}
+
+static bool trans_ADDSPL(DisasContext *s, arg_ADDSPL *a)
+{
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (sme_enabled_check(s)) {
+ TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+ TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+ tcg_gen_addi_i64(rd, rn, a->imm * streaming_pred_reg_size(s));
+ }
+ return true;
+}
+
+static bool trans_RDVL(DisasContext *s, arg_RDVL *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+ tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s));
+ }
+ return true;
+}
+
+static bool trans_RDSVL(DisasContext *s, arg_RDSVL *a)
+{
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (sme_enabled_check(s)) {
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+ tcg_gen_movi_i64(reg, a->imm * streaming_vec_reg_size(s));
+ }
+ return true;
+}
+
+/*
+ *** SVE Compute Vector Address Group
+ */
+
+static bool do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn)
+{
+ return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, a->imm);
+}
+
+TRANS_FEAT_NONSTREAMING(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32)
+TRANS_FEAT_NONSTREAMING(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64)
+TRANS_FEAT_NONSTREAMING(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32)
+TRANS_FEAT_NONSTREAMING(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32)
+
+/*
+ *** SVE Integer Misc - Unpredicated Group
+ */
+
+static gen_helper_gvec_2 * const fexpa_fns[4] = {
+ NULL, gen_helper_sve_fexpa_h,
+ gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d,
+};
+TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz,
+ fexpa_fns[a->esz], a->rd, a->rn, 0)
+
+static gen_helper_gvec_3 * const ftssel_fns[4] = {
+ NULL, gen_helper_sve_ftssel_h,
+ gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d,
+};
+TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz,
+ ftssel_fns[a->esz], a, 0)
+
+/*
+ *** SVE Predicate Logical Operations Group
+ */
+
+static bool do_pppp_flags(DisasContext *s, arg_rprr_s *a,
+ const GVecGen4 *gvec_op)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned psz = pred_gvec_reg_size(s);
+ int dofs = pred_full_reg_offset(s, a->rd);
+ int nofs = pred_full_reg_offset(s, a->rn);
+ int mofs = pred_full_reg_offset(s, a->rm);
+ int gofs = pred_full_reg_offset(s, a->pg);
+
+ if (!a->s) {
+ tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op);
+ return true;
+ }
+
+ if (psz == 8) {
+ /* Do the operation and the flags generation in temps. */
+ TCGv_i64 pd = tcg_temp_new_i64();
+ TCGv_i64 pn = tcg_temp_new_i64();
+ TCGv_i64 pm = tcg_temp_new_i64();
+ TCGv_i64 pg = tcg_temp_new_i64();
+
+ tcg_gen_ld_i64(pn, cpu_env, nofs);
+ tcg_gen_ld_i64(pm, cpu_env, mofs);
+ tcg_gen_ld_i64(pg, cpu_env, gofs);
+
+ gvec_op->fni8(pd, pn, pm, pg);
+ tcg_gen_st_i64(pd, cpu_env, dofs);
+
+ do_predtest1(pd, pg);
+
+ tcg_temp_free_i64(pd);
+ tcg_temp_free_i64(pn);
+ tcg_temp_free_i64(pm);
+ tcg_temp_free_i64(pg);
+ } else {
+ /* The operation and flags generation is large. The computation
+ * of the flags depends on the original contents of the guarding
+ * predicate. If the destination overwrites the guarding predicate,
+ * then the easiest way to get this right is to save a copy.
+ */
+ int tofs = gofs;
+ if (a->rd == a->pg) {
+ tofs = offsetof(CPUARMState, vfp.preg_tmp);
+ tcg_gen_gvec_mov(0, tofs, gofs, psz, psz);
+ }
+
+ tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op);
+ do_predtest(s, dofs, tofs, psz / 8);
+ }
+ return true;
+}
+
+static void gen_and_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_and_i64(pd, pn, pm);
+ tcg_gen_and_i64(pd, pd, pg);
+}
+
+static void gen_and_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_and_vec(vece, pd, pn, pm);
+ tcg_gen_and_vec(vece, pd, pd, pg);
+}
+
+static bool trans_AND_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_and_pg_i64,
+ .fniv = gen_and_pg_vec,
+ .fno = gen_helper_sve_and_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!a->s) {
+ if (a->rn == a->rm) {
+ if (a->pg == a->rn) {
+ return do_mov_p(s, a->rd, a->rn);
+ }
+ return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->pg);
+ } else if (a->pg == a->rn || a->pg == a->rm) {
+ return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->rm);
+ }
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static void gen_bic_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_andc_i64(pd, pn, pm);
+ tcg_gen_and_i64(pd, pd, pg);
+}
+
+static void gen_bic_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_andc_vec(vece, pd, pn, pm);
+ tcg_gen_and_vec(vece, pd, pd, pg);
+}
+
+static bool trans_BIC_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_bic_pg_i64,
+ .fniv = gen_bic_pg_vec,
+ .fno = gen_helper_sve_bic_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!a->s && a->pg == a->rn) {
+ return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->rn, a->rm);
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static void gen_eor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_xor_i64(pd, pn, pm);
+ tcg_gen_and_i64(pd, pd, pg);
+}
+
+static void gen_eor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_xor_vec(vece, pd, pn, pm);
+ tcg_gen_and_vec(vece, pd, pd, pg);
+}
+
+static bool trans_EOR_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_eor_pg_i64,
+ .fniv = gen_eor_pg_vec,
+ .fno = gen_helper_sve_eor_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ /* Alias NOT (predicate) is EOR Pd.B, Pg/Z, Pn.B, Pg.B */
+ if (!a->s && a->pg == a->rm) {
+ return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->pg, a->rn);
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static bool trans_SEL_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ if (a->s || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned psz = pred_gvec_reg_size(s);
+ tcg_gen_gvec_bitsel(MO_8, pred_full_reg_offset(s, a->rd),
+ pred_full_reg_offset(s, a->pg),
+ pred_full_reg_offset(s, a->rn),
+ pred_full_reg_offset(s, a->rm), psz, psz);
+ }
+ return true;
+}
+
+static void gen_orr_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_or_i64(pd, pn, pm);
+ tcg_gen_and_i64(pd, pd, pg);
+}
+
+static void gen_orr_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_or_vec(vece, pd, pn, pm);
+ tcg_gen_and_vec(vece, pd, pd, pg);
+}
+
+static bool trans_ORR_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_orr_pg_i64,
+ .fniv = gen_orr_pg_vec,
+ .fno = gen_helper_sve_orr_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!a->s && a->pg == a->rn && a->rn == a->rm) {
+ return do_mov_p(s, a->rd, a->rn);
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static void gen_orn_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_orc_i64(pd, pn, pm);
+ tcg_gen_and_i64(pd, pd, pg);
+}
+
+static void gen_orn_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_orc_vec(vece, pd, pn, pm);
+ tcg_gen_and_vec(vece, pd, pd, pg);
+}
+
+static bool trans_ORN_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_orn_pg_i64,
+ .fniv = gen_orn_pg_vec,
+ .fno = gen_helper_sve_orn_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static void gen_nor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_or_i64(pd, pn, pm);
+ tcg_gen_andc_i64(pd, pg, pd);
+}
+
+static void gen_nor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_or_vec(vece, pd, pn, pm);
+ tcg_gen_andc_vec(vece, pd, pg, pd);
+}
+
+static bool trans_NOR_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_nor_pg_i64,
+ .fniv = gen_nor_pg_vec,
+ .fno = gen_helper_sve_nor_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+static void gen_nand_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg)
+{
+ tcg_gen_and_i64(pd, pn, pm);
+ tcg_gen_andc_i64(pd, pg, pd);
+}
+
+static void gen_nand_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn,
+ TCGv_vec pm, TCGv_vec pg)
+{
+ tcg_gen_and_vec(vece, pd, pn, pm);
+ tcg_gen_andc_vec(vece, pd, pg, pd);
+}
+
+static bool trans_NAND_pppp(DisasContext *s, arg_rprr_s *a)
+{
+ static const GVecGen4 op = {
+ .fni8 = gen_nand_pg_i64,
+ .fniv = gen_nand_pg_vec,
+ .fno = gen_helper_sve_nand_pppp,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ return do_pppp_flags(s, a, &op);
+}
+
+/*
+ *** SVE Predicate Misc Group
+ */
+
+static bool trans_PTEST(DisasContext *s, arg_PTEST *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int nofs = pred_full_reg_offset(s, a->rn);
+ int gofs = pred_full_reg_offset(s, a->pg);
+ int words = DIV_ROUND_UP(pred_full_reg_size(s), 8);
+
+ if (words == 1) {
+ TCGv_i64 pn = tcg_temp_new_i64();
+ TCGv_i64 pg = tcg_temp_new_i64();
+
+ tcg_gen_ld_i64(pn, cpu_env, nofs);
+ tcg_gen_ld_i64(pg, cpu_env, gofs);
+ do_predtest1(pn, pg);
+
+ tcg_temp_free_i64(pn);
+ tcg_temp_free_i64(pg);
+ } else {
+ do_predtest(s, nofs, gofs, words);
+ }
+ }
+ return true;
+}
+
+/* See the ARM pseudocode DecodePredCount. */
+static unsigned decode_pred_count(unsigned fullsz, int pattern, int esz)
+{
+ unsigned elements = fullsz >> esz;
+ unsigned bound;
+
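+    /*
+     * For example, a 256-bit vector with esz == MO_16 has 16 elements:
+     * POW2 and ALL give 16, VL7 gives 7, MUL3 gives 15, and VL32 gives 0
+     * because only 16 elements are available.
+     */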
+ switch (pattern) {
+ case 0x0: /* POW2 */
+ return pow2floor(elements);
+ case 0x1: /* VL1 */
+ case 0x2: /* VL2 */
+ case 0x3: /* VL3 */
+ case 0x4: /* VL4 */
+ case 0x5: /* VL5 */
+ case 0x6: /* VL6 */
+ case 0x7: /* VL7 */
+ case 0x8: /* VL8 */
+ bound = pattern;
+ break;
+ case 0x9: /* VL16 */
+ case 0xa: /* VL32 */
+ case 0xb: /* VL64 */
+ case 0xc: /* VL128 */
+ case 0xd: /* VL256 */
+ bound = 16 << (pattern - 9);
+ break;
+ case 0x1d: /* MUL4 */
+ return elements - elements % 4;
+ case 0x1e: /* MUL3 */
+ return elements - elements % 3;
+ case 0x1f: /* ALL */
+ return elements;
+ default: /* #uimm5 */
+ return 0;
+ }
+ return elements >= bound ? bound : 0;
+}
+
+/* This handles all of the predicate initialization instructions,
+ * PTRUE, PFALSE, SETFFR. For PFALSE, we will have set PAT == 32
+ * so that decode_pred_count returns 0. For SETFFR, we will have
+ * set RD == 16 == FFR.
+ */
+static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned ofs = pred_full_reg_offset(s, rd);
+ unsigned numelem, setsz, i;
+ uint64_t word, lastword;
+ TCGv_i64 t;
+
+ numelem = decode_pred_count(fullsz, pat, esz);
+
+ /* Determine what we must store into each bit, and how many. */
+ if (numelem == 0) {
+ lastword = word = 0;
+ setsz = fullsz;
+ } else {
+ setsz = numelem << esz;
+ lastword = word = pred_esz_masks[esz];
+ if (setsz % 64) {
+ lastword &= MAKE_64BIT_MASK(0, setsz % 64);
+ }
+ }
+
+ t = tcg_temp_new_i64();
+ if (fullsz <= 64) {
+ tcg_gen_movi_i64(t, lastword);
+ tcg_gen_st_i64(t, cpu_env, ofs);
+ goto done;
+ }
+
+ if (word == lastword) {
+ unsigned maxsz = size_for_gvec(fullsz / 8);
+ unsigned oprsz = size_for_gvec(setsz / 8);
+
+ if (oprsz * 8 == setsz) {
+ tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word);
+ goto done;
+ }
+ }
+
+ setsz /= 8;
+ fullsz /= 8;
+
+ tcg_gen_movi_i64(t, word);
+ for (i = 0; i < QEMU_ALIGN_DOWN(setsz, 8); i += 8) {
+ tcg_gen_st_i64(t, cpu_env, ofs + i);
+ }
+ if (lastword != word) {
+ tcg_gen_movi_i64(t, lastword);
+ tcg_gen_st_i64(t, cpu_env, ofs + i);
+ i += 8;
+ }
+ if (i < fullsz) {
+ tcg_gen_movi_i64(t, 0);
+ for (; i < fullsz; i += 8) {
+ tcg_gen_st_i64(t, cpu_env, ofs + i);
+ }
+ }
+
+ done:
+ tcg_temp_free_i64(t);
+
+ /* PTRUES */
+ if (setflag) {
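+        /*
+         * PTRUES sets the SVE predicate condition flags: N = first active,
+         * Z = no active, C = !(last active), V = 0.  The pattern is known
+         * at translate time, so these collapse to tests of WORD.
+         */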
+ tcg_gen_movi_i32(cpu_NF, -(word != 0));
+ tcg_gen_movi_i32(cpu_CF, word == 0);
+ tcg_gen_movi_i32(cpu_VF, 0);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ }
+ return true;
+}
+
+TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s)
+
+/* Note pat == 31 is #all, to set all elements. */
+TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve,
+ do_predset, 0, FFR_PRED_NUM, 31, false)
+
+/* Note pat == 32 is #unimp, to set no elements. */
+TRANS_FEAT(PFALSE, aa64_sve, do_predset, 0, a->rd, 32, false)
+
+static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a)
+{
+ /* The path through do_pppp_flags is complicated enough to want to avoid
+ * duplication. Frob the arguments into the form of a predicated AND.
+ */
+ arg_rprr_s alt_a = {
+ .rd = a->rd, .pg = a->pg, .s = a->s,
+ .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM,
+ };
+
+ s->is_nonstreaming = true;
+ return trans_AND_pppp(s, &alt_a);
+}
+
+TRANS_FEAT_NONSTREAMING(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM)
+TRANS_FEAT_NONSTREAMING(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn)
+
+static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a,
+ void (*gen_fn)(TCGv_i32, TCGv_ptr,
+ TCGv_ptr, TCGv_i32))
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ TCGv_ptr t_pd = tcg_temp_new_ptr();
+ TCGv_ptr t_pg = tcg_temp_new_ptr();
+ TCGv_i32 t;
+ unsigned desc = 0;
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s));
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+
+ tcg_gen_addi_ptr(t_pd, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->rn));
+ t = tcg_temp_new_i32();
+
+ gen_fn(t, t_pd, t_pg, tcg_constant_i32(desc));
+ tcg_temp_free_ptr(t_pd);
+ tcg_temp_free_ptr(t_pg);
+
+ do_pred_flags(t);
+ tcg_temp_free_i32(t);
+ return true;
+}
+
+TRANS_FEAT(PFIRST, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pfirst)
+TRANS_FEAT(PNEXT, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pnext)
+
+/*
+ *** SVE Element Count Group
+ */
+
+/* Perform an inline saturating addition of a 32-bit value within
+ * a 64-bit register. The second operand is known to be positive,
+ * which halves the comparisons we must perform to bound the result.
+ */
+static void do_sat_addsub_32(TCGv_i64 reg, TCGv_i64 val, bool u, bool d)
+{
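+    /*
+     * For example, an unsigned increment (u && !d) of 0xffffffff by 1
+     * exceeds UINT32_MAX and is clamped back by the smin below, while an
+     * unsigned decrement of 1 by 5 goes negative and is clamped to 0 by
+     * the smax.
+     */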
+ int64_t ibound;
+
+ /* Use normal 64-bit arithmetic to detect 32-bit overflow. */
+ if (u) {
+ tcg_gen_ext32u_i64(reg, reg);
+ } else {
+ tcg_gen_ext32s_i64(reg, reg);
+ }
+ if (d) {
+ tcg_gen_sub_i64(reg, reg, val);
+ ibound = (u ? 0 : INT32_MIN);
+ tcg_gen_smax_i64(reg, reg, tcg_constant_i64(ibound));
+ } else {
+ tcg_gen_add_i64(reg, reg, val);
+ ibound = (u ? UINT32_MAX : INT32_MAX);
+ tcg_gen_smin_i64(reg, reg, tcg_constant_i64(ibound));
+ }
+}
+
+/* Similarly with 64-bit values. */
+static void do_sat_addsub_64(TCGv_i64 reg, TCGv_i64 val, bool u, bool d)
+{
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t2;
+
+ if (u) {
+ if (d) {
+ tcg_gen_sub_i64(t0, reg, val);
+ t2 = tcg_constant_i64(0);
+ tcg_gen_movcond_i64(TCG_COND_LTU, reg, reg, val, t2, t0);
+ } else {
+ tcg_gen_add_i64(t0, reg, val);
+ t2 = tcg_constant_i64(-1);
+ tcg_gen_movcond_i64(TCG_COND_LTU, reg, t0, reg, t2, t0);
+ }
+ } else {
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ if (d) {
+ /* Detect signed overflow for subtraction. */
+ tcg_gen_xor_i64(t0, reg, val);
+ tcg_gen_sub_i64(t1, reg, val);
+ tcg_gen_xor_i64(reg, reg, t1);
+ tcg_gen_and_i64(t0, t0, reg);
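+            /*
+             * The sign bit of t0 is set iff the subtraction overflowed:
+             * reg and val had different signs and the result's sign
+             * differs from the original reg.
+             */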
+
+ /* Bound the result. */
+ tcg_gen_movi_i64(reg, INT64_MIN);
+ t2 = tcg_constant_i64(0);
+ tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, reg, t1);
+ } else {
+ /* Detect signed overflow for addition. */
+ tcg_gen_xor_i64(t0, reg, val);
+ tcg_gen_add_i64(reg, reg, val);
+ tcg_gen_xor_i64(t1, reg, val);
+ tcg_gen_andc_i64(t0, t1, t0);
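+            /*
+             * The sign bit of t0 is set iff the addition overflowed:
+             * reg and val had the same sign and the result's sign
+             * differs from val.
+             */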
+
+ /* Bound the result. */
+ tcg_gen_movi_i64(t1, INT64_MAX);
+ t2 = tcg_constant_i64(0);
+ tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, t1, reg);
+ }
+ tcg_temp_free_i64(t1);
+ }
+ tcg_temp_free_i64(t0);
+}
+
+/* Similarly with a vector and a scalar operand. */
+static void do_sat_addsub_vec(DisasContext *s, int esz, int rd, int rn,
+ TCGv_i64 val, bool u, bool d)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr dptr, nptr;
+ TCGv_i32 t32, desc;
+ TCGv_i64 t64;
+
+ dptr = tcg_temp_new_ptr();
+ nptr = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(dptr, cpu_env, vec_full_reg_offset(s, rd));
+ tcg_gen_addi_ptr(nptr, cpu_env, vec_full_reg_offset(s, rn));
+ desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+
+ switch (esz) {
+ case MO_8:
+ t32 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t32, val);
+ if (d) {
+ tcg_gen_neg_i32(t32, t32);
+ }
+ if (u) {
+ gen_helper_sve_uqaddi_b(dptr, nptr, t32, desc);
+ } else {
+ gen_helper_sve_sqaddi_b(dptr, nptr, t32, desc);
+ }
+ tcg_temp_free_i32(t32);
+ break;
+
+ case MO_16:
+ t32 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t32, val);
+ if (d) {
+ tcg_gen_neg_i32(t32, t32);
+ }
+ if (u) {
+ gen_helper_sve_uqaddi_h(dptr, nptr, t32, desc);
+ } else {
+ gen_helper_sve_sqaddi_h(dptr, nptr, t32, desc);
+ }
+ tcg_temp_free_i32(t32);
+ break;
+
+ case MO_32:
+ t64 = tcg_temp_new_i64();
+ if (d) {
+ tcg_gen_neg_i64(t64, val);
+ } else {
+ tcg_gen_mov_i64(t64, val);
+ }
+ if (u) {
+ gen_helper_sve_uqaddi_s(dptr, nptr, t64, desc);
+ } else {
+ gen_helper_sve_sqaddi_s(dptr, nptr, t64, desc);
+ }
+ tcg_temp_free_i64(t64);
+ break;
+
+ case MO_64:
+ if (u) {
+ if (d) {
+ gen_helper_sve_uqsubi_d(dptr, nptr, val, desc);
+ } else {
+ gen_helper_sve_uqaddi_d(dptr, nptr, val, desc);
+ }
+ } else if (d) {
+ t64 = tcg_temp_new_i64();
+ tcg_gen_neg_i64(t64, val);
+ gen_helper_sve_sqaddi_d(dptr, nptr, t64, desc);
+ tcg_temp_free_i64(t64);
+ } else {
+ gen_helper_sve_sqaddi_d(dptr, nptr, val, desc);
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_temp_free_ptr(dptr);
+ tcg_temp_free_ptr(nptr);
+}
+
+static bool trans_CNT_r(DisasContext *s, arg_CNT_r *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ tcg_gen_movi_i64(cpu_reg(s, a->rd), numelem * a->imm);
+ }
+ return true;
+}
+
+static bool trans_INCDEC_r(DisasContext *s, arg_incdec_cnt *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ int inc = numelem * a->imm * (a->d ? -1 : 1);
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+
+ tcg_gen_addi_i64(reg, reg, inc);
+ }
+ return true;
+}
+
+static bool trans_SINCDEC_r_32(DisasContext *s, arg_incdec_cnt *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ int inc = numelem * a->imm;
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+
+ /* Use normal 64-bit arithmetic to detect 32-bit overflow. */
+ if (inc == 0) {
+ if (a->u) {
+ tcg_gen_ext32u_i64(reg, reg);
+ } else {
+ tcg_gen_ext32s_i64(reg, reg);
+ }
+ } else {
+ do_sat_addsub_32(reg, tcg_constant_i64(inc), a->u, a->d);
+ }
+ return true;
+}
+
+static bool trans_SINCDEC_r_64(DisasContext *s, arg_incdec_cnt *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ int inc = numelem * a->imm;
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+
+ if (inc != 0) {
+ do_sat_addsub_64(reg, tcg_constant_i64(inc), a->u, a->d);
+ }
+ return true;
+}
+
+static bool trans_INCDEC_v(DisasContext *s, arg_incdec2_cnt *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ int inc = numelem * a->imm;
+
+ if (inc != 0) {
+ if (sve_access_check(s)) {
+ tcg_gen_gvec_adds(a->esz, vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ tcg_constant_i64(a->d ? -inc : inc),
+ fullsz, fullsz);
+ }
+ } else {
+ do_mov_z(s, a->rd, a->rn);
+ }
+ return true;
+}
+
+static bool trans_SINCDEC_v(DisasContext *s, arg_incdec2_cnt *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+
+ unsigned fullsz = vec_full_reg_size(s);
+ unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz);
+ int inc = numelem * a->imm;
+
+ if (inc != 0) {
+ if (sve_access_check(s)) {
+ do_sat_addsub_vec(s, a->esz, a->rd, a->rn,
+ tcg_constant_i64(inc), a->u, a->d);
+ }
+ } else {
+ do_mov_z(s, a->rd, a->rn);
+ }
+ return true;
+}
+
+/*
+ *** SVE Bitwise Immediate Group
+ */
+
+static bool do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn)
+{
+ uint64_t imm;
+ if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1),
+ extract32(a->dbm, 0, 6),
+ extract32(a->dbm, 6, 6))) {
+ return false;
+ }
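+    /*
+     * IMM now holds the replicated bitmask; the encoding is the same as
+     * the A64 logical immediates, e.g. {N=1, immr=0, imms=0} decodes to
+     * 0x0000000000000001.
+     */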
+ return gen_gvec_fn_zzi(s, gvec_fn, MO_64, a->rd, a->rn, imm);
+}
+
+TRANS_FEAT(AND_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_andi)
+TRANS_FEAT(ORR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_ori)
+TRANS_FEAT(EOR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_xori)
+
+static bool trans_DUPM(DisasContext *s, arg_DUPM *a)
+{
+ uint64_t imm;
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1),
+ extract32(a->dbm, 0, 6),
+ extract32(a->dbm, 6, 6))) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_dupi_z(s, a->rd, imm);
+ }
+ return true;
+}
+
+/*
+ *** SVE Integer Wide Immediate - Predicated Group
+ */
+
+/* Implement all merging copies. This is used for CPY (immediate),
+ * FCPY, CPY (scalar), CPY (SIMD&FP scalar).
+ */
+static void do_cpy_m(DisasContext *s, int esz, int rd, int rn, int pg,
+ TCGv_i64 val)
+{
+ typedef void gen_cpy(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
+ static gen_cpy * const fns[4] = {
+ gen_helper_sve_cpy_m_b, gen_helper_sve_cpy_m_h,
+ gen_helper_sve_cpy_m_s, gen_helper_sve_cpy_m_d,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+ TCGv_ptr t_zd = tcg_temp_new_ptr();
+ TCGv_ptr t_zn = tcg_temp_new_ptr();
+ TCGv_ptr t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd));
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, rn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+
+ fns[esz](t_zd, t_zn, t_pg, val, desc);
+
+ tcg_temp_free_ptr(t_zd);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_pg);
+}
+
+static bool trans_FCPY(DisasContext *s, arg_FCPY *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ /* Decode the VFP immediate. */
+ uint64_t imm = vfp_expand_imm(a->esz, a->imm);
+ do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(imm));
+ }
+ return true;
+}
+
+static bool trans_CPY_m_i(DisasContext *s, arg_rpri_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(a->imm));
+ }
+ return true;
+}
+
+static bool trans_CPY_z_i(DisasContext *s, arg_CPY_z_i *a)
+{
+ static gen_helper_gvec_2i * const fns[4] = {
+ gen_helper_sve_cpy_z_b, gen_helper_sve_cpy_z_h,
+ gen_helper_sve_cpy_z_s, gen_helper_sve_cpy_z_d,
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd),
+ pred_full_reg_offset(s, a->pg),
+ tcg_constant_i64(a->imm),
+ vsz, vsz, 0, fns[a->esz]);
+ }
+ return true;
+}
+
+/*
+ *** SVE Permute Extract Group
+ */
+
+static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned n_ofs = imm >= vsz ? 0 : imm;
+ unsigned n_siz = vsz - n_ofs;
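+    /*
+     * EXT extracts bytes starting at byte n_ofs of the concatenation
+     * Zn:Zm, so n_siz bytes come from the top of Zn and the remaining
+     * n_ofs bytes from the bottom of Zm.
+     */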
+ unsigned d = vec_full_reg_offset(s, rd);
+ unsigned n = vec_full_reg_offset(s, rn);
+ unsigned m = vec_full_reg_offset(s, rm);
+
+ /* Use host vector move insns if we have appropriate sizes
+ * and no unfortunate overlap.
+ */
+ if (m != d
+ && n_ofs == size_for_gvec(n_ofs)
+ && n_siz == size_for_gvec(n_siz)
+ && (d != n || n_siz <= n_ofs)) {
+ tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz);
+ if (n_ofs != 0) {
+ tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs);
+ }
+ } else {
+ tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext);
+ }
+ return true;
+}
+
+TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm)
+TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm)
+
+/*
+ *** SVE Permute - Unpredicated Group
+ */
+
+static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd),
+ vsz, vsz, cpu_reg_sp(s, a->rn));
+ }
+ return true;
+}
+
+static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if ((a->imm & 0x1f) == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned dofs = vec_full_reg_offset(s, a->rd);
+ unsigned esz, index;
+
+ esz = ctz32(a->imm);
+ index = a->imm >> (esz + 1);
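+        /*
+         * For example, imm == 0b0010100 gives esz == MO_32 and index == 2,
+         * i.e. broadcast element 2 of Zn at 32-bit element size.
+         */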
+
+ if ((index << esz) < vsz) {
+ unsigned nofs = vec_reg_offset(s, a->rn, index, esz);
+ tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz);
+ } else {
+ /*
+ * While dup_mem handles 128-bit elements, dup_imm does not.
+ * Thankfully element size doesn't matter for splatting zero.
+ */
+ tcg_gen_gvec_dup_imm(MO_64, dofs, vsz, vsz, 0);
+ }
+ }
+ return true;
+}
+
+static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val)
+{
+ typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
+ static gen_insr * const fns[4] = {
+ gen_helper_sve_insr_b, gen_helper_sve_insr_h,
+ gen_helper_sve_insr_s, gen_helper_sve_insr_d,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+ TCGv_ptr t_zd = tcg_temp_new_ptr();
+ TCGv_ptr t_zn = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn));
+
+ fns[a->esz](t_zd, t_zn, val, desc);
+
+ tcg_temp_free_ptr(t_zd);
+ tcg_temp_free_ptr(t_zn);
+}
+
+static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 t = tcg_temp_new_i64();
+ tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64));
+ do_insr_i64(s, a, t);
+ tcg_temp_free_i64(t);
+ }
+ return true;
+}
+
+static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_insr_i64(s, a, cpu_reg(s, a->rm));
+ }
+ return true;
+}
+
+static gen_helper_gvec_2 * const rev_fns[4] = {
+ gen_helper_sve_rev_b, gen_helper_sve_rev_h,
+ gen_helper_sve_rev_s, gen_helper_sve_rev_d
+};
+TRANS_FEAT(REV_v, aa64_sve, gen_gvec_ool_zz, rev_fns[a->esz], a->rd, a->rn, 0)
+
+static gen_helper_gvec_3 * const sve_tbl_fns[4] = {
+ gen_helper_sve_tbl_b, gen_helper_sve_tbl_h,
+ gen_helper_sve_tbl_s, gen_helper_sve_tbl_d
+};
+TRANS_FEAT(TBL, aa64_sve, gen_gvec_ool_arg_zzz, sve_tbl_fns[a->esz], a, 0)
+
+static gen_helper_gvec_4 * const sve2_tbl_fns[4] = {
+ gen_helper_sve2_tbl_b, gen_helper_sve2_tbl_h,
+ gen_helper_sve2_tbl_s, gen_helper_sve2_tbl_d
+};
+TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz],
+ a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0)
+
+static gen_helper_gvec_3 * const tbx_fns[4] = {
+ gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h,
+ gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d
+};
+TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0)
+
+static bool trans_UNPK(DisasContext *s, arg_UNPK *a)
+{
+ static gen_helper_gvec_2 * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h },
+ { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s },
+ { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d },
+ };
+
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn)
+ + (a->h ? vsz / 2 : 0),
+ vsz, vsz, 0, fns[a->esz][a->u]);
+ }
+ return true;
+}
+
+/*
+ *** SVE Permute - Predicates Group
+ */
+
+static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd,
+ gen_helper_gvec_3 *fn)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = pred_full_reg_size(s);
+
+ TCGv_ptr t_d = tcg_temp_new_ptr();
+ TCGv_ptr t_n = tcg_temp_new_ptr();
+ TCGv_ptr t_m = tcg_temp_new_ptr();
+ uint32_t desc = 0;
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+ desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd);
+
+ tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm));
+
+ fn(t_d, t_n, t_m, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_d);
+ tcg_temp_free_ptr(t_n);
+ tcg_temp_free_ptr(t_m);
+ return true;
+}
+
+static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd,
+ gen_helper_gvec_2 *fn)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = pred_full_reg_size(s);
+ TCGv_ptr t_d = tcg_temp_new_ptr();
+ TCGv_ptr t_n = tcg_temp_new_ptr();
+ uint32_t desc = 0;
+
+ tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn));
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+ desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd);
+
+ fn(t_d, t_n, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_d);
+ tcg_temp_free_ptr(t_n);
+ return true;
+}
+
+TRANS_FEAT(ZIP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_zip_p)
+TRANS_FEAT(ZIP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_zip_p)
+TRANS_FEAT(UZP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_uzp_p)
+TRANS_FEAT(UZP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_uzp_p)
+TRANS_FEAT(TRN1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_trn_p)
+TRANS_FEAT(TRN2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_trn_p)
+
+TRANS_FEAT(REV_p, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_rev_p)
+TRANS_FEAT(PUNPKLO, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_punpk_p)
+TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p)
+
+/*
+ *** SVE Permute - Interleaving Group
+ */
+
+static gen_helper_gvec_3 * const zip_fns[4] = {
+ gen_helper_sve_zip_b, gen_helper_sve_zip_h,
+ gen_helper_sve_zip_s, gen_helper_sve_zip_d,
+};
+TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ zip_fns[a->esz], a, 0)
+TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ zip_fns[a->esz], a, vec_full_reg_size(s) / 2)
+
+TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_zip_q, a, 0)
+TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_zip_q, a,
+ QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2)
+
+static gen_helper_gvec_3 * const uzp_fns[4] = {
+ gen_helper_sve_uzp_b, gen_helper_sve_uzp_h,
+ gen_helper_sve_uzp_s, gen_helper_sve_uzp_d,
+};
+
+TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ uzp_fns[a->esz], a, 0)
+TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ uzp_fns[a->esz], a, 1 << a->esz)
+
+TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_uzp_q, a, 0)
+TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_uzp_q, a, 16)
+
+static gen_helper_gvec_3 * const trn_fns[4] = {
+ gen_helper_sve_trn_b, gen_helper_sve_trn_h,
+ gen_helper_sve_trn_s, gen_helper_sve_trn_d,
+};
+
+TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ trn_fns[a->esz], a, 0)
+TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz,
+ trn_fns[a->esz], a, 1 << a->esz)
+
+TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_trn_q, a, 0)
+TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
+ gen_helper_sve2_trn_q, a, 16)
+
+/*
+ *** SVE Permute Vector - Predicated Group
+ */
+
+static gen_helper_gvec_3 * const compact_fns[4] = {
+ NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d
+};
+TRANS_FEAT_NONSTREAMING(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz,
+ compact_fns[a->esz], a, 0)
+
+/* Call the helper that computes the ARM LastActiveElement pseudocode
+ * function, scaled by the element size. This includes the not found
+ * indication; e.g. not found for esz=3 is -8.
+ */
+static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg)
+{
+ /* Predicate sizes may be smaller and cannot use simd_desc. We cannot
+ * round up, as we do elsewhere, because we need the exact size.
+ */
+ TCGv_ptr t_p = tcg_temp_new_ptr();
+ unsigned desc = 0;
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s));
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, esz);
+
+ tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg));
+
+ gen_helper_sve_last_active_element(ret, t_p, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_p);
+}
+
+/* Increment LAST to the offset of the next element in the vector,
+ * wrapping around to 0.
+ */
+static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz)
+{
+ unsigned vsz = vec_full_reg_size(s);
+
+ tcg_gen_addi_i32(last, last, 1 << esz);
+ if (is_power_of_2(vsz)) {
+ tcg_gen_andi_i32(last, last, vsz - 1);
+ } else {
+ TCGv_i32 max = tcg_constant_i32(vsz);
+ TCGv_i32 zero = tcg_constant_i32(0);
+ tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last);
+ }
+}
+
+/* If LAST < 0, set LAST to the offset of the last element in the vector. */
+static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz)
+{
+ unsigned vsz = vec_full_reg_size(s);
+
+ if (is_power_of_2(vsz)) {
+ tcg_gen_andi_i32(last, last, vsz - 1);
+ } else {
+ TCGv_i32 max = tcg_constant_i32(vsz - (1 << esz));
+ TCGv_i32 zero = tcg_constant_i32(0);
+ tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last);
+ }
+}
+
+/* Load an unsigned element of ESZ from BASE+OFS. */
+static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz)
+{
+ TCGv_i64 r = tcg_temp_new_i64();
+
+ switch (esz) {
+ case 0:
+ tcg_gen_ld8u_i64(r, base, ofs);
+ break;
+ case 1:
+ tcg_gen_ld16u_i64(r, base, ofs);
+ break;
+ case 2:
+ tcg_gen_ld32u_i64(r, base, ofs);
+ break;
+ case 3:
+ tcg_gen_ld_i64(r, base, ofs);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return r;
+}
+
+/* Load an unsigned element of ESZ from RM[LAST]. */
+static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last,
+ int rm, int esz)
+{
+ TCGv_ptr p = tcg_temp_new_ptr();
+ TCGv_i64 r;
+
+    /* Convert the offset within the vector into an offset within ENV.
+ * The final adjustment for the vector register base
+ * is added via constant offset to the load.
+ */
+#if HOST_BIG_ENDIAN
+ /* Adjust for element ordering. See vec_reg_offset. */
+ if (esz < 3) {
+ tcg_gen_xori_i32(last, last, 8 - (1 << esz));
+ }
+#endif
+ tcg_gen_ext_i32_ptr(p, last);
+ tcg_gen_add_ptr(p, p, cpu_env);
+
+ r = load_esz(p, vec_full_reg_offset(s, rm), esz);
+ tcg_temp_free_ptr(p);
+
+ return r;
+}
+
+/* Compute CLAST for a Zreg. */
+static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before)
+{
+ TCGv_i32 last;
+ TCGLabel *over;
+ TCGv_i64 ele;
+ unsigned vsz, esz = a->esz;
+
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ last = tcg_temp_local_new_i32();
+ over = gen_new_label();
+
+ find_last_active(s, last, esz, a->pg);
+
+ /* There is of course no movcond for a 2048-bit vector,
+ * so we must branch over the actual store.
+ */
+ tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over);
+
+ if (!before) {
+ incr_last_active(s, last, esz);
+ }
+
+ ele = load_last_active(s, last, a->rm, esz);
+ tcg_temp_free_i32(last);
+
+ vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele);
+ tcg_temp_free_i64(ele);
+
+ /* If this insn used MOVPRFX, we may need a second move. */
+ if (a->rd != a->rn) {
+ TCGLabel *done = gen_new_label();
+ tcg_gen_br(done);
+
+ gen_set_label(over);
+ do_mov_z(s, a->rd, a->rn);
+
+ gen_set_label(done);
+ } else {
+ gen_set_label(over);
+ }
+ return true;
+}
+
+TRANS_FEAT(CLASTA_z, aa64_sve, do_clast_vector, a, false)
+TRANS_FEAT(CLASTB_z, aa64_sve, do_clast_vector, a, true)
+
+/* Compute CLAST for a scalar. */
+static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm,
+ bool before, TCGv_i64 reg_val)
+{
+ TCGv_i32 last = tcg_temp_new_i32();
+ TCGv_i64 ele, cmp;
+
+ find_last_active(s, last, esz, pg);
+
+ /* Extend the original value of last prior to incrementing. */
+ cmp = tcg_temp_new_i64();
+ tcg_gen_ext_i32_i64(cmp, last);
+
+ if (!before) {
+ incr_last_active(s, last, esz);
+ }
+
+ /* The conceit here is that while last < 0 indicates not found, after
+ * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address
+ * from which we can load garbage. We then discard the garbage with
+ * a conditional move.
+ */
+ ele = load_last_active(s, last, rm, esz);
+ tcg_temp_free_i32(last);
+
+ tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, tcg_constant_i64(0),
+ ele, reg_val);
+
+ tcg_temp_free_i64(cmp);
+ tcg_temp_free_i64(ele);
+}
+
+/* Compute CLAST for a Vreg. */
+static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before)
+{
+ if (sve_access_check(s)) {
+ int esz = a->esz;
+ int ofs = vec_reg_offset(s, a->rd, 0, esz);
+ TCGv_i64 reg = load_esz(cpu_env, ofs, esz);
+
+ do_clast_scalar(s, esz, a->pg, a->rn, before, reg);
+ write_fp_dreg(s, a->rd, reg);
+ tcg_temp_free_i64(reg);
+ }
+ return true;
+}
+
+TRANS_FEAT(CLASTA_v, aa64_sve, do_clast_fp, a, false)
+TRANS_FEAT(CLASTB_v, aa64_sve, do_clast_fp, a, true)
+
+/* Compute CLAST for a Xreg. */
+static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before)
+{
+ TCGv_i64 reg;
+
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ reg = cpu_reg(s, a->rd);
+ switch (a->esz) {
+ case 0:
+ tcg_gen_ext8u_i64(reg, reg);
+ break;
+ case 1:
+ tcg_gen_ext16u_i64(reg, reg);
+ break;
+ case 2:
+ tcg_gen_ext32u_i64(reg, reg);
+ break;
+ case 3:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg);
+ return true;
+}
+
+TRANS_FEAT(CLASTA_r, aa64_sve, do_clast_general, a, false)
+TRANS_FEAT(CLASTB_r, aa64_sve, do_clast_general, a, true)
+
+/* Compute LAST for a scalar. */
+static TCGv_i64 do_last_scalar(DisasContext *s, int esz,
+ int pg, int rm, bool before)
+{
+ TCGv_i32 last = tcg_temp_new_i32();
+ TCGv_i64 ret;
+
+ find_last_active(s, last, esz, pg);
+ if (before) {
+ wrap_last_active(s, last, esz);
+ } else {
+ incr_last_active(s, last, esz);
+ }
+
+ ret = load_last_active(s, last, rm, esz);
+ tcg_temp_free_i32(last);
+ return ret;
+}
+
+/* Compute LAST for a Vreg. */
+static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before)
+{
+ if (sve_access_check(s)) {
+ TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before);
+ write_fp_dreg(s, a->rd, val);
+ tcg_temp_free_i64(val);
+ }
+ return true;
+}
+
+TRANS_FEAT(LASTA_v, aa64_sve, do_last_fp, a, false)
+TRANS_FEAT(LASTB_v, aa64_sve, do_last_fp, a, true)
+
+/* Compute LAST for a Xreg. */
+static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before)
+{
+ if (sve_access_check(s)) {
+ TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before);
+ tcg_gen_mov_i64(cpu_reg(s, a->rd), val);
+ tcg_temp_free_i64(val);
+ }
+ return true;
+}
+
+TRANS_FEAT(LASTA_r, aa64_sve, do_last_general, a, false)
+TRANS_FEAT(LASTB_r, aa64_sve, do_last_general, a, true)
+
+static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn));
+ }
+ return true;
+}
+
+static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int ofs = vec_reg_offset(s, a->rn, 0, a->esz);
+ TCGv_i64 t = load_esz(cpu_env, ofs, a->esz);
+ do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t);
+ tcg_temp_free_i64(t);
+ }
+ return true;
+}
+
+static gen_helper_gvec_3 * const revb_fns[4] = {
+ NULL, gen_helper_sve_revb_h,
+ gen_helper_sve_revb_s, gen_helper_sve_revb_d,
+};
+TRANS_FEAT(REVB, aa64_sve, gen_gvec_ool_arg_zpz, revb_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const revh_fns[4] = {
+ NULL, NULL, gen_helper_sve_revh_s, gen_helper_sve_revh_d,
+};
+TRANS_FEAT(REVH, aa64_sve, gen_gvec_ool_arg_zpz, revh_fns[a->esz], a, 0)
+
+TRANS_FEAT(REVW, aa64_sve, gen_gvec_ool_arg_zpz,
+ a->esz == 3 ? gen_helper_sve_revw_d : NULL, a, 0)
+
+TRANS_FEAT(REVD, aa64_sme, gen_gvec_ool_arg_zpz, gen_helper_sme_revd_q, a, 0)
+
+TRANS_FEAT(SPLICE, aa64_sve, gen_gvec_ool_arg_zpzz,
+ gen_helper_sve_splice, a, a->esz)
+
+TRANS_FEAT(SPLICE_sve2, aa64_sve2, gen_gvec_ool_zzzp, gen_helper_sve_splice,
+ a->rd, a->rn, (a->rn + 1) % 32, a->pg, a->esz)
+
+/*
+ *** SVE Integer Compare - Vectors Group
+ */
+
+static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a,
+ gen_helper_gvec_flags_4 *gen_fn)
+{
+ TCGv_ptr pd, zn, zm, pg;
+ unsigned vsz;
+ TCGv_i32 t;
+
+ if (gen_fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vsz = vec_full_reg_size(s);
+ t = tcg_temp_new_i32();
+ pd = tcg_temp_new_ptr();
+ zn = tcg_temp_new_ptr();
+ zm = tcg_temp_new_ptr();
+ pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm));
+ tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg));
+
+ gen_fn(t, pd, zn, zm, pg, tcg_constant_i32(simd_desc(vsz, vsz, 0)));
+
+ tcg_temp_free_ptr(pd);
+ tcg_temp_free_ptr(zn);
+ tcg_temp_free_ptr(zm);
+ tcg_temp_free_ptr(pg);
+
+ do_pred_flags(t);
+
+ tcg_temp_free_i32(t);
+ return true;
+}
+
+#define DO_PPZZ(NAME, name) \
+ static gen_helper_gvec_flags_4 * const name##_ppzz_fns[4] = { \
+ gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \
+ gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \
+ }; \
+ TRANS_FEAT(NAME##_ppzz, aa64_sve, do_ppzz_flags, \
+ a, name##_ppzz_fns[a->esz])
+
+DO_PPZZ(CMPEQ, cmpeq)
+DO_PPZZ(CMPNE, cmpne)
+DO_PPZZ(CMPGT, cmpgt)
+DO_PPZZ(CMPGE, cmpge)
+DO_PPZZ(CMPHI, cmphi)
+DO_PPZZ(CMPHS, cmphs)
+
+#undef DO_PPZZ
+
+#define DO_PPZW(NAME, name) \
+ static gen_helper_gvec_flags_4 * const name##_ppzw_fns[4] = { \
+ gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \
+ gen_helper_sve_##name##_ppzw_s, NULL \
+ }; \
+ TRANS_FEAT(NAME##_ppzw, aa64_sve, do_ppzz_flags, \
+ a, name##_ppzw_fns[a->esz])
+
+DO_PPZW(CMPEQ, cmpeq)
+DO_PPZW(CMPNE, cmpne)
+DO_PPZW(CMPGT, cmpgt)
+DO_PPZW(CMPGE, cmpge)
+DO_PPZW(CMPHI, cmphi)
+DO_PPZW(CMPHS, cmphs)
+DO_PPZW(CMPLT, cmplt)
+DO_PPZW(CMPLE, cmple)
+DO_PPZW(CMPLO, cmplo)
+DO_PPZW(CMPLS, cmpls)
+
+#undef DO_PPZW
+
+/*
+ *** SVE Integer Compare - Immediate Groups
+ */
+
+static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a,
+ gen_helper_gvec_flags_3 *gen_fn)
+{
+ TCGv_ptr pd, zn, pg;
+ unsigned vsz;
+ TCGv_i32 t;
+
+ if (gen_fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vsz = vec_full_reg_size(s);
+ t = tcg_temp_new_i32();
+ pd = tcg_temp_new_ptr();
+ zn = tcg_temp_new_ptr();
+ pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg));
+
+ gen_fn(t, pd, zn, pg, tcg_constant_i32(simd_desc(vsz, vsz, a->imm)));
+
+ tcg_temp_free_ptr(pd);
+ tcg_temp_free_ptr(zn);
+ tcg_temp_free_ptr(pg);
+
+ do_pred_flags(t);
+
+ tcg_temp_free_i32(t);
+ return true;
+}
+
+#define DO_PPZI(NAME, name) \
+ static gen_helper_gvec_flags_3 * const name##_ppzi_fns[4] = { \
+ gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \
+ gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \
+ }; \
+ TRANS_FEAT(NAME##_ppzi, aa64_sve, do_ppzi_flags, a, \
+ name##_ppzi_fns[a->esz])
+
+DO_PPZI(CMPEQ, cmpeq)
+DO_PPZI(CMPNE, cmpne)
+DO_PPZI(CMPGT, cmpgt)
+DO_PPZI(CMPGE, cmpge)
+DO_PPZI(CMPHI, cmphi)
+DO_PPZI(CMPHS, cmphs)
+DO_PPZI(CMPLT, cmplt)
+DO_PPZI(CMPLE, cmple)
+DO_PPZI(CMPLO, cmplo)
+DO_PPZI(CMPLS, cmpls)
+
+#undef DO_PPZI
+
+/*
+ *** SVE Partition Break Group
+ */
+
+static bool do_brk3(DisasContext *s, arg_rprr_s *a,
+ gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = pred_full_reg_size(s);
+
+ /* Predicate sizes may be smaller and cannot use simd_desc. */
+ TCGv_ptr d = tcg_temp_new_ptr();
+ TCGv_ptr n = tcg_temp_new_ptr();
+ TCGv_ptr m = tcg_temp_new_ptr();
+ TCGv_ptr g = tcg_temp_new_ptr();
+ TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz));
+
+ tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm));
+ tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg));
+
+ if (a->s) {
+ TCGv_i32 t = tcg_temp_new_i32();
+ fn_s(t, d, n, m, g, desc);
+ do_pred_flags(t);
+ tcg_temp_free_i32(t);
+ } else {
+ fn(d, n, m, g, desc);
+ }
+ tcg_temp_free_ptr(d);
+ tcg_temp_free_ptr(n);
+ tcg_temp_free_ptr(m);
+ tcg_temp_free_ptr(g);
+ return true;
+}
+
+static bool do_brk2(DisasContext *s, arg_rpr_s *a,
+ gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = pred_full_reg_size(s);
+
+ /* Predicate sizes may be smaller and cannot use simd_desc. */
+ TCGv_ptr d = tcg_temp_new_ptr();
+ TCGv_ptr n = tcg_temp_new_ptr();
+ TCGv_ptr g = tcg_temp_new_ptr();
+ TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz));
+
+ tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd));
+ tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg));
+
+ if (a->s) {
+ TCGv_i32 t = tcg_temp_new_i32();
+ fn_s(t, d, n, g, desc);
+ do_pred_flags(t);
+ tcg_temp_free_i32(t);
+ } else {
+ fn(d, n, g, desc);
+ }
+ tcg_temp_free_ptr(d);
+ tcg_temp_free_ptr(n);
+ tcg_temp_free_ptr(g);
+ return true;
+}
+
+TRANS_FEAT(BRKPA, aa64_sve, do_brk3, a,
+ gen_helper_sve_brkpa, gen_helper_sve_brkpas)
+TRANS_FEAT(BRKPB, aa64_sve, do_brk3, a,
+ gen_helper_sve_brkpb, gen_helper_sve_brkpbs)
+
+TRANS_FEAT(BRKA_m, aa64_sve, do_brk2, a,
+ gen_helper_sve_brka_m, gen_helper_sve_brkas_m)
+TRANS_FEAT(BRKB_m, aa64_sve, do_brk2, a,
+ gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m)
+
+TRANS_FEAT(BRKA_z, aa64_sve, do_brk2, a,
+ gen_helper_sve_brka_z, gen_helper_sve_brkas_z)
+TRANS_FEAT(BRKB_z, aa64_sve, do_brk2, a,
+ gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z)
+
+TRANS_FEAT(BRKN, aa64_sve, do_brk2, a,
+ gen_helper_sve_brkn, gen_helper_sve_brkns)
+
+/*
+ *** SVE Predicate Count Group
+ */
+
+static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg)
+{
+ unsigned psz = pred_full_reg_size(s);
+
+ if (psz <= 8) {
+ uint64_t psz_mask;
+
+ tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn));
+ if (pn != pg) {
+ TCGv_i64 g = tcg_temp_new_i64();
+ tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg));
+ tcg_gen_and_i64(val, val, g);
+ tcg_temp_free_i64(g);
+ }
+
+ /* Reduce the pred_esz_masks value simply to reduce the
+ * size of the code generated here.
+ */
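+        /*
+         * pred_esz_masks[esz] keeps one predicate bit per element
+         * (0xffff..., 0x5555..., 0x1111..., 0x0101... for B/H/S/D), so
+         * after masking, ctpop counts the active elements directly.
+         */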
+ psz_mask = MAKE_64BIT_MASK(0, psz * 8);
+ tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask);
+
+ tcg_gen_ctpop_i64(val, val);
+ } else {
+ TCGv_ptr t_pn = tcg_temp_new_ptr();
+ TCGv_ptr t_pg = tcg_temp_new_ptr();
+ unsigned desc = 0;
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, psz);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, esz);
+
+ tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+
+ gen_helper_sve_cntp(val, t_pn, t_pg, tcg_constant_i32(desc));
+ tcg_temp_free_ptr(t_pn);
+ tcg_temp_free_ptr(t_pg);
+ }
+}
+
+static bool trans_CNTP(DisasContext *s, arg_CNTP *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg);
+ }
+ return true;
+}
+
+static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+ TCGv_i64 val = tcg_temp_new_i64();
+
+ do_cntp(s, val, a->esz, a->pg, a->pg);
+ if (a->d) {
+ tcg_gen_sub_i64(reg, reg, val);
+ } else {
+ tcg_gen_add_i64(reg, reg, val);
+ }
+ tcg_temp_free_i64(val);
+ }
+ return true;
+}
+
+static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_i64 val = tcg_temp_new_i64();
+ GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds;
+
+ do_cntp(s, val, a->esz, a->pg, a->pg);
+ gvec_fn(a->esz, vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn), val, vsz, vsz);
+ }
+ return true;
+}
+
+static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+ TCGv_i64 val = tcg_temp_new_i64();
+
+ do_cntp(s, val, a->esz, a->pg, a->pg);
+ do_sat_addsub_32(reg, val, a->u, a->d);
+ }
+ return true;
+}
+
+static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 reg = cpu_reg(s, a->rd);
+ TCGv_i64 val = tcg_temp_new_i64();
+
+ do_cntp(s, val, a->esz, a->pg, a->pg);
+ do_sat_addsub_64(reg, val, a->u, a->d);
+ }
+ return true;
+}
+
+static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 val = tcg_temp_new_i64();
+ do_cntp(s, val, a->esz, a->pg, a->pg);
+ do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d);
+ }
+ return true;
+}
+
+/*
+ *** SVE Integer Compare Scalars Group
+ */
+
+static bool trans_CTERM(DisasContext *s, arg_CTERM *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ);
+ TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf);
+ TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf);
+ TCGv_i64 cmp = tcg_temp_new_i64();
+
+ tcg_gen_setcond_i64(cond, cmp, rn, rm);
+ tcg_gen_extrl_i64_i32(cpu_NF, cmp);
+ tcg_temp_free_i64(cmp);
+
+ /* VF = !NF & !CF. */
+ tcg_gen_xori_i32(cpu_VF, cpu_NF, 1);
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF);
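+    /*
+     * Only N and V are updated: N is the compare result and V = !N && !C;
+     * Z and C keep their previous values.
+     */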
+
+ /* Both NF and VF actually look at bit 31. */
+ tcg_gen_neg_i32(cpu_NF, cpu_NF);
+ tcg_gen_neg_i32(cpu_VF, cpu_VF);
+ return true;
+}
+
+static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
+{
+ TCGv_i64 op0, op1, t0, t1, tmax;
+ TCGv_i32 t2;
+ TCGv_ptr ptr;
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned desc = 0;
+ TCGCond cond;
+ uint64_t maxval;
+ /* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */
+ bool eq = a->eq == a->lt;
+
+ /* The greater-than conditions are all SVE2. */
+ if (a->lt
+ ? !dc_isar_feature(aa64_sve, s)
+ : !dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ op0 = read_cpu_reg(s, a->rn, 1);
+ op1 = read_cpu_reg(s, a->rm, 1);
+
+ if (!a->sf) {
+ if (a->u) {
+ tcg_gen_ext32u_i64(op0, op0);
+ tcg_gen_ext32u_i64(op1, op1);
+ } else {
+ tcg_gen_ext32s_i64(op0, op0);
+ tcg_gen_ext32s_i64(op1, op1);
+ }
+ }
+
+ /* For the helper, compress the different conditions into a computation
+     * of the number of iterations for which the condition holds.
+ */
+ t0 = tcg_temp_new_i64();
+ t1 = tcg_temp_new_i64();
+
+ if (a->lt) {
+ tcg_gen_sub_i64(t0, op1, op0);
+ if (a->u) {
+ maxval = a->sf ? UINT64_MAX : UINT32_MAX;
+ cond = eq ? TCG_COND_LEU : TCG_COND_LTU;
+ } else {
+ maxval = a->sf ? INT64_MAX : INT32_MAX;
+ cond = eq ? TCG_COND_LE : TCG_COND_LT;
+ }
+ } else {
+ tcg_gen_sub_i64(t0, op0, op1);
+ if (a->u) {
+ maxval = 0;
+ cond = eq ? TCG_COND_GEU : TCG_COND_GTU;
+ } else {
+ maxval = a->sf ? INT64_MIN : INT32_MIN;
+ cond = eq ? TCG_COND_GE : TCG_COND_GT;
+ }
+ }
+
+ tmax = tcg_constant_i64(vsz >> a->esz);
+ if (eq) {
+ /* Equality means one more iteration. */
+ tcg_gen_addi_i64(t0, t0, 1);
+
+ /*
+ * For the less-than while, if op1 is maxval (and the only time
+ * the addition above could overflow), then we produce an all-true
+ * predicate by setting the count to the vector length. This is
+ * because the pseudocode is described as an increment + compare
+ * loop, and the maximum integer would always compare true.
+ * Similarly, the greater-than while has the same issue with the
+ * minimum integer due to the decrement + compare loop.
+ */
+ tcg_gen_movi_i64(t1, maxval);
+ tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0);
+ }
+
+ /* Bound to the maximum. */
+ tcg_gen_umin_i64(t0, t0, tmax);
+
+ /* Set the count to zero if the condition is false. */
+ tcg_gen_movi_i64(t1, 0);
+ tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1);
+ tcg_temp_free_i64(t1);
+
+ /* Since we're bounded, pass as a 32-bit type. */
+ t2 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t2, t0);
+ tcg_temp_free_i64(t0);
+
+ /* Scale elements to bits. */
+ tcg_gen_shli_i32(t2, t2, a->esz);
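+    /*
+     * For example, WHILELT with rn == 0, rm == 3, esz == MO_32 and a
+     * 128-bit vector produces a count of 3 elements (12 predicate bits),
+     * so the first three .S elements of Pd become active.
+     */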
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+
+ ptr = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd));
+
+ if (a->lt) {
+ gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc));
+ } else {
+ gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc));
+ }
+ do_pred_flags(t2);
+
+ tcg_temp_free_ptr(ptr);
+ tcg_temp_free_i32(t2);
+ return true;
+}
+
+static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
+{
+ TCGv_i64 op0, op1, diff, t1, tmax;
+ TCGv_i32 t2;
+ TCGv_ptr ptr;
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned desc = 0;
+
+ if (!dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ op0 = read_cpu_reg(s, a->rn, 1);
+ op1 = read_cpu_reg(s, a->rm, 1);
+
+ tmax = tcg_constant_i64(vsz);
+ diff = tcg_temp_new_i64();
+
+ if (a->rw) {
+ /* WHILERW */
+ /* diff = abs(op1 - op0), noting that op0/1 are unsigned. */
+ t1 = tcg_temp_new_i64();
+ tcg_gen_sub_i64(diff, op0, op1);
+ tcg_gen_sub_i64(t1, op1, op0);
+ tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1);
+ tcg_temp_free_i64(t1);
+ /* Round down to a multiple of ESIZE. */
+ tcg_gen_andi_i64(diff, diff, -1 << a->esz);
+ /* If op1 == op0, diff == 0, and the condition is always true. */
+ tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff);
+ } else {
+ /* WHILEWR */
+ tcg_gen_sub_i64(diff, op1, op0);
+ /* Round down to a multiple of ESIZE. */
+ tcg_gen_andi_i64(diff, diff, -1 << a->esz);
+ /* If op0 >= op1, diff <= 0, the condition is always true. */
+ tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff);
+ }
+
+ /* Bound to the maximum. */
+ tcg_gen_umin_i64(diff, diff, tmax);
+
+ /* Since we're bounded, pass as a 32-bit type. */
+ t2 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t2, diff);
+ tcg_temp_free_i64(diff);
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+
+ ptr = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd));
+
+ gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc));
+ do_pred_flags(t2);
+
+ tcg_temp_free_ptr(ptr);
+ tcg_temp_free_i32(t2);
+ return true;
+}
+
+/*
+ *** SVE Integer Wide Immediate - Unpredicated Group
+ */
+
+static bool trans_FDUP(DisasContext *s, arg_FDUP *a)
+{
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ int dofs = vec_full_reg_offset(s, a->rd);
+ uint64_t imm;
+
+ /* Decode the VFP immediate. */
+ imm = vfp_expand_imm(a->esz, a->imm);
+ tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);
+ }
+ return true;
+}
+
+static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ int dofs = vec_full_reg_offset(s, a->rd);
+ tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm);
+ }
+ return true;
+}
+
+TRANS_FEAT(ADD_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_addi, a)
+
+static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a)
+{
+ a->imm = -a->imm;
+ return trans_ADD_zzi(s, a);
+}
+
+static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_sub_vec, 0 };
+ static const GVecGen2s op[4] = {
+ { .fni8 = tcg_gen_vec_sub8_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_sve_subri_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8,
+ .scalar_first = true },
+ { .fni8 = tcg_gen_vec_sub16_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_sve_subri_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16,
+ .scalar_first = true },
+ { .fni4 = tcg_gen_sub_i32,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_sve_subri_s,
+ .opt_opc = vecop_list,
+ .vece = MO_32,
+ .scalar_first = true },
+ { .fni8 = tcg_gen_sub_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_sve_subri_d,
+ .opt_opc = vecop_list,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64,
+ .scalar_first = true }
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vsz, vsz, tcg_constant_i64(a->imm), &op[a->esz]);
+ }
+ return true;
+}
+
+TRANS_FEAT(MUL_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_muli, a)
+
+static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, bool u, bool d)
+{
+ if (sve_access_check(s)) {
+ do_sat_addsub_vec(s, a->esz, a->rd, a->rn,
+ tcg_constant_i64(a->imm), u, d);
+ }
+ return true;
+}
+
+TRANS_FEAT(SQADD_zzi, aa64_sve, do_zzi_sat, a, false, false)
+TRANS_FEAT(UQADD_zzi, aa64_sve, do_zzi_sat, a, true, false)
+TRANS_FEAT(SQSUB_zzi, aa64_sve, do_zzi_sat, a, false, true)
+TRANS_FEAT(UQSUB_zzi, aa64_sve, do_zzi_sat, a, true, true)
+
+static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn)
+{
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ tcg_constant_i64(a->imm), vsz, vsz, 0, fn);
+ }
+ return true;
+}
+
+#define DO_ZZI(NAME, name) \
+ static gen_helper_gvec_2i * const name##i_fns[4] = { \
+ gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \
+ gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \
+ }; \
+ TRANS_FEAT(NAME##_zzi, aa64_sve, do_zzi_ool, a, name##i_fns[a->esz])
+
+DO_ZZI(SMAX, smax)
+DO_ZZI(UMAX, umax)
+DO_ZZI(SMIN, smin)
+DO_ZZI(UMIN, umin)
+
+#undef DO_ZZI
+
+static gen_helper_gvec_4 * const dot_fns[2][2] = {
+ { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h },
+ { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h }
+};
+TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz,
+ dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0)
+
+/*
+ * SVE Multiply - Indexed
+ */
+
+TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sdot_idx_b, a)
+TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sdot_idx_h, a)
+TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_udot_idx_b, a)
+TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_udot_idx_h, a)
+
+TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sudot_idx_b, a)
+TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_usdot_idx_b, a)
+
+#define DO_SVE2_RRX(NAME, FUNC) \
+ TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \
+ a->rd, a->rn, a->rm, a->index)
+
+DO_SVE2_RRX(MUL_zzx_h, gen_helper_gvec_mul_idx_h)
+DO_SVE2_RRX(MUL_zzx_s, gen_helper_gvec_mul_idx_s)
+DO_SVE2_RRX(MUL_zzx_d, gen_helper_gvec_mul_idx_d)
+
+DO_SVE2_RRX(SQDMULH_zzx_h, gen_helper_sve2_sqdmulh_idx_h)
+DO_SVE2_RRX(SQDMULH_zzx_s, gen_helper_sve2_sqdmulh_idx_s)
+DO_SVE2_RRX(SQDMULH_zzx_d, gen_helper_sve2_sqdmulh_idx_d)
+
+DO_SVE2_RRX(SQRDMULH_zzx_h, gen_helper_sve2_sqrdmulh_idx_h)
+DO_SVE2_RRX(SQRDMULH_zzx_s, gen_helper_sve2_sqrdmulh_idx_s)
+DO_SVE2_RRX(SQRDMULH_zzx_d, gen_helper_sve2_sqrdmulh_idx_d)
+
+#undef DO_SVE2_RRX
+
+#define DO_SVE2_RRX_TB(NAME, FUNC, TOP) \
+ TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \
+ a->rd, a->rn, a->rm, (a->index << 1) | TOP)
+
+DO_SVE2_RRX_TB(SQDMULLB_zzx_s, gen_helper_sve2_sqdmull_idx_s, false)
+DO_SVE2_RRX_TB(SQDMULLB_zzx_d, gen_helper_sve2_sqdmull_idx_d, false)
+DO_SVE2_RRX_TB(SQDMULLT_zzx_s, gen_helper_sve2_sqdmull_idx_s, true)
+DO_SVE2_RRX_TB(SQDMULLT_zzx_d, gen_helper_sve2_sqdmull_idx_d, true)
+
+DO_SVE2_RRX_TB(SMULLB_zzx_s, gen_helper_sve2_smull_idx_s, false)
+DO_SVE2_RRX_TB(SMULLB_zzx_d, gen_helper_sve2_smull_idx_d, false)
+DO_SVE2_RRX_TB(SMULLT_zzx_s, gen_helper_sve2_smull_idx_s, true)
+DO_SVE2_RRX_TB(SMULLT_zzx_d, gen_helper_sve2_smull_idx_d, true)
+
+DO_SVE2_RRX_TB(UMULLB_zzx_s, gen_helper_sve2_umull_idx_s, false)
+DO_SVE2_RRX_TB(UMULLB_zzx_d, gen_helper_sve2_umull_idx_d, false)
+DO_SVE2_RRX_TB(UMULLT_zzx_s, gen_helper_sve2_umull_idx_s, true)
+DO_SVE2_RRX_TB(UMULLT_zzx_d, gen_helper_sve2_umull_idx_d, true)
+
+#undef DO_SVE2_RRX_TB
+
+#define DO_SVE2_RRXR(NAME, FUNC) \
+ TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzxz, FUNC, a)
+
+DO_SVE2_RRXR(MLA_zzxz_h, gen_helper_gvec_mla_idx_h)
+DO_SVE2_RRXR(MLA_zzxz_s, gen_helper_gvec_mla_idx_s)
+DO_SVE2_RRXR(MLA_zzxz_d, gen_helper_gvec_mla_idx_d)
+
+DO_SVE2_RRXR(MLS_zzxz_h, gen_helper_gvec_mls_idx_h)
+DO_SVE2_RRXR(MLS_zzxz_s, gen_helper_gvec_mls_idx_s)
+DO_SVE2_RRXR(MLS_zzxz_d, gen_helper_gvec_mls_idx_d)
+
+DO_SVE2_RRXR(SQRDMLAH_zzxz_h, gen_helper_sve2_sqrdmlah_idx_h)
+DO_SVE2_RRXR(SQRDMLAH_zzxz_s, gen_helper_sve2_sqrdmlah_idx_s)
+DO_SVE2_RRXR(SQRDMLAH_zzxz_d, gen_helper_sve2_sqrdmlah_idx_d)
+
+DO_SVE2_RRXR(SQRDMLSH_zzxz_h, gen_helper_sve2_sqrdmlsh_idx_h)
+DO_SVE2_RRXR(SQRDMLSH_zzxz_s, gen_helper_sve2_sqrdmlsh_idx_s)
+DO_SVE2_RRXR(SQRDMLSH_zzxz_d, gen_helper_sve2_sqrdmlsh_idx_d)
+
+#undef DO_SVE2_RRXR
+
+#define DO_SVE2_RRXR_TB(NAME, FUNC, TOP) \
+ TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \
+ a->rd, a->rn, a->rm, a->ra, (a->index << 1) | TOP)
+
+DO_SVE2_RRXR_TB(SQDMLALB_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, false)
+DO_SVE2_RRXR_TB(SQDMLALB_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, false)
+DO_SVE2_RRXR_TB(SQDMLALT_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, true)
+DO_SVE2_RRXR_TB(SQDMLALT_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, true)
+
+DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, false)
+DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, false)
+DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, true)
+DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, true)
+
+DO_SVE2_RRXR_TB(SMLALB_zzxw_s, gen_helper_sve2_smlal_idx_s, false)
+DO_SVE2_RRXR_TB(SMLALB_zzxw_d, gen_helper_sve2_smlal_idx_d, false)
+DO_SVE2_RRXR_TB(SMLALT_zzxw_s, gen_helper_sve2_smlal_idx_s, true)
+DO_SVE2_RRXR_TB(SMLALT_zzxw_d, gen_helper_sve2_smlal_idx_d, true)
+
+DO_SVE2_RRXR_TB(UMLALB_zzxw_s, gen_helper_sve2_umlal_idx_s, false)
+DO_SVE2_RRXR_TB(UMLALB_zzxw_d, gen_helper_sve2_umlal_idx_d, false)
+DO_SVE2_RRXR_TB(UMLALT_zzxw_s, gen_helper_sve2_umlal_idx_s, true)
+DO_SVE2_RRXR_TB(UMLALT_zzxw_d, gen_helper_sve2_umlal_idx_d, true)
+
+DO_SVE2_RRXR_TB(SMLSLB_zzxw_s, gen_helper_sve2_smlsl_idx_s, false)
+DO_SVE2_RRXR_TB(SMLSLB_zzxw_d, gen_helper_sve2_smlsl_idx_d, false)
+DO_SVE2_RRXR_TB(SMLSLT_zzxw_s, gen_helper_sve2_smlsl_idx_s, true)
+DO_SVE2_RRXR_TB(SMLSLT_zzxw_d, gen_helper_sve2_smlsl_idx_d, true)
+
+DO_SVE2_RRXR_TB(UMLSLB_zzxw_s, gen_helper_sve2_umlsl_idx_s, false)
+DO_SVE2_RRXR_TB(UMLSLB_zzxw_d, gen_helper_sve2_umlsl_idx_d, false)
+DO_SVE2_RRXR_TB(UMLSLT_zzxw_s, gen_helper_sve2_umlsl_idx_s, true)
+DO_SVE2_RRXR_TB(UMLSLT_zzxw_d, gen_helper_sve2_umlsl_idx_d, true)
+
+#undef DO_SVE2_RRXR_TB
+
+#define DO_SVE2_RRXR_ROT(NAME, FUNC) \
+ TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \
+ a->rd, a->rn, a->rm, a->ra, (a->index << 2) | a->rot)
+
+DO_SVE2_RRXR_ROT(CMLA_zzxz_h, gen_helper_sve2_cmla_idx_h)
+DO_SVE2_RRXR_ROT(CMLA_zzxz_s, gen_helper_sve2_cmla_idx_s)
+
+DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_h, gen_helper_sve2_sqrdcmlah_idx_h)
+DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_s, gen_helper_sve2_sqrdcmlah_idx_s)
+
+DO_SVE2_RRXR_ROT(CDOT_zzxw_s, gen_helper_sve2_cdot_idx_s)
+DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d)
+
+#undef DO_SVE2_RRXR_ROT
+
+/*
+ *** SVE Floating Point Multiply-Add Indexed Group
+ */
+
+static bool do_FMLA_zzxz(DisasContext *s, arg_rrxr_esz *a, bool sub)
+{
+ static gen_helper_gvec_4_ptr * const fns[4] = {
+ NULL,
+ gen_helper_gvec_fmla_idx_h,
+ gen_helper_gvec_fmla_idx_s,
+ gen_helper_gvec_fmla_idx_d,
+ };
+ return gen_gvec_fpst_zzzz(s, fns[a->esz], a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sub,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+}
+
+TRANS_FEAT(FMLA_zzxz, aa64_sve, do_FMLA_zzxz, a, false)
+TRANS_FEAT(FMLS_zzxz, aa64_sve, do_FMLA_zzxz, a, true)
+
+/*
+ *** SVE Floating Point Multiply Indexed Group
+ */
+
+static gen_helper_gvec_3_ptr * const fmul_idx_fns[4] = {
+ NULL, gen_helper_gvec_fmul_idx_h,
+ gen_helper_gvec_fmul_idx_s, gen_helper_gvec_fmul_idx_d,
+};
+TRANS_FEAT(FMUL_zzx, aa64_sve, gen_gvec_fpst_zzz,
+ fmul_idx_fns[a->esz], a->rd, a->rn, a->rm, a->index,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+/*
+ *** SVE Floating Point Fast Reduction Group
+ */
+
+typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+
+static bool do_reduce(DisasContext *s, arg_rpr_esz *a,
+ gen_helper_fp_reduce *fn)
+{
+ unsigned vsz, p2vsz;
+ TCGv_i32 t_desc;
+ TCGv_ptr t_zn, t_pg, status;
+ TCGv_i64 temp;
+
+ if (fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vsz = vec_full_reg_size(s);
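+    /*
+     * The reduction helpers reduce over a power-of-two number of
+     * elements, filling any excess with the operation's identity,
+     * so pass pow2ceil(vsz) in the descriptor data.
+     */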
+ p2vsz = pow2ceil(vsz);
+ t_desc = tcg_constant_i32(simd_desc(vsz, vsz, p2vsz));
+ temp = tcg_temp_new_i64();
+ t_zn = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ fn(temp, t_zn, t_pg, status, t_desc);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(status);
+
+ write_fp_dreg(s, a->rd, temp);
+ tcg_temp_free_i64(temp);
+ return true;
+}
+
+#define DO_VPZ(NAME, name) \
+ static gen_helper_fp_reduce * const name##_fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, do_reduce, a, name##_fns[a->esz])
+
+DO_VPZ(FADDV, faddv)
+DO_VPZ(FMINNMV, fminnmv)
+DO_VPZ(FMAXNMV, fmaxnmv)
+DO_VPZ(FMINV, fminv)
+DO_VPZ(FMAXV, fmaxv)
+
+#undef DO_VPZ
+
+/*
+ *** SVE Floating Point Unary Operations - Unpredicated Group
+ */
+
+static gen_helper_gvec_2_ptr * const frecpe_fns[] = {
+ NULL, gen_helper_gvec_frecpe_h,
+ gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d,
+};
+TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_arg_zz, frecpe_fns[a->esz], a, 0)
+
+static gen_helper_gvec_2_ptr * const frsqrte_fns[] = {
+ NULL, gen_helper_gvec_frsqrte_h,
+ gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d,
+};
+TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_arg_zz, frsqrte_fns[a->esz], a, 0)
+
+/*
+ *** SVE Floating Point Compare with Zero Group
+ */
+
+static bool do_ppz_fp(DisasContext *s, arg_rpr_esz *a,
+ gen_helper_gvec_3_ptr *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status =
+ fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+#define DO_PPZ(NAME, name) \
+ static gen_helper_gvec_3_ptr * const name##_fns[] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, do_ppz_fp, a, name##_fns[a->esz])
+
+DO_PPZ(FCMGE_ppz0, fcmge0)
+DO_PPZ(FCMGT_ppz0, fcmgt0)
+DO_PPZ(FCMLE_ppz0, fcmle0)
+DO_PPZ(FCMLT_ppz0, fcmlt0)
+DO_PPZ(FCMEQ_ppz0, fcmeq0)
+DO_PPZ(FCMNE_ppz0, fcmne0)
+
+#undef DO_PPZ
+
+/*
+ *** SVE floating-point trig multiply-add coefficient
+ */
+
+static gen_helper_gvec_3_ptr * const ftmad_fns[4] = {
+ NULL, gen_helper_sve_ftmad_h,
+ gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d,
+};
+TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz,
+ ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+/*
+ *** SVE Floating Point Accumulating Reduction Group
+ */
+
+static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a)
+{
+ typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr,
+ TCGv_ptr, TCGv_ptr, TCGv_i32);
+ static fadda_fn * const fns[3] = {
+ gen_helper_sve_fadda_h,
+ gen_helper_sve_fadda_s,
+ gen_helper_sve_fadda_d,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_rm, t_pg, t_fpst;
+ TCGv_i64 t_val;
+ TCGv_i32 t_desc;
+
+ if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz);
+ t_rm = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ t_fpst = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ t_desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+
+ fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc);
+
+ tcg_temp_free_ptr(t_fpst);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(t_rm);
+
+ write_fp_dreg(s, a->rd, t_val);
+ tcg_temp_free_i64(t_val);
+ return true;
+}
+
+/*
+ *** SVE Floating Point Arithmetic - Unpredicated Group
+ */
+
+#define DO_FP3(NAME, name) \
+ static gen_helper_gvec_3_ptr * const name##_fns[4] = { \
+ NULL, gen_helper_gvec_##name##_h, \
+ gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0)
+
+DO_FP3(FADD_zzz, fadd)
+DO_FP3(FSUB_zzz, fsub)
+DO_FP3(FMUL_zzz, fmul)
+DO_FP3(FRECPS, recps)
+DO_FP3(FRSQRTS, rsqrts)
+
+#undef DO_FP3
+
+static gen_helper_gvec_3_ptr * const ftsmul_fns[4] = {
+ NULL, gen_helper_gvec_ftsmul_h,
+ gen_helper_gvec_ftsmul_s, gen_helper_gvec_ftsmul_d
+};
+TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz,
+ ftsmul_fns[a->esz], a, 0)
+
+/*
+ *** SVE Floating Point Arithmetic - Predicated Group
+ */
+
+#define DO_ZPZZ_FP(NAME, FEAT, name) \
+ static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \
+ NULL, gen_helper_##name##_h, \
+ gen_helper_##name##_s, gen_helper_##name##_d \
+ }; \
+ TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a)
+
+DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd)
+DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub)
+DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul)
+DO_ZPZZ_FP(FMIN_zpzz, aa64_sve, sve_fmin)
+DO_ZPZZ_FP(FMAX_zpzz, aa64_sve, sve_fmax)
+DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum)
+DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum)
+DO_ZPZZ_FP(FABD, aa64_sve, sve_fabd)
+DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn)
+DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv)
+DO_ZPZZ_FP(FMULX, aa64_sve, sve_fmulx)
+
+typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_i64, TCGv_ptr, TCGv_i32);
+
+static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16,
+ TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_zd, t_zn, t_pg, status;
+ TCGv_i32 desc;
+
+ t_zd = tcg_temp_new_ptr();
+ t_zn = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd));
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+
+ status = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR);
+ desc = tcg_constant_i32(simd_desc(vsz, vsz, 0));
+ fn(t_zd, t_zn, t_pg, scalar, status, desc);
+
+ tcg_temp_free_ptr(status);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_zd);
+}
+
+static bool do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm,
+ gen_helper_sve_fp2scalar *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16,
+ tcg_constant_i64(imm), fn);
+ }
+ return true;
+}
+
+#define DO_FP_IMM(NAME, name, const0, const1) \
+ static gen_helper_sve_fp2scalar * const name##_fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, \
+ gen_helper_sve_##name##_d \
+ }; \
+ static uint64_t const name##_const[4][2] = { \
+ { -1, -1 }, \
+ { float16_##const0, float16_##const1 }, \
+ { float32_##const0, float32_##const1 }, \
+ { float64_##const0, float64_##const1 }, \
+ }; \
+ TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \
+ name##_const[a->esz][a->imm], name##_fns[a->esz])
+
+DO_FP_IMM(FADD, fadds, half, one)
+DO_FP_IMM(FSUB, fsubs, half, one)
+DO_FP_IMM(FMUL, fmuls, half, two)
+DO_FP_IMM(FSUBR, fsubrs, half, one)
+DO_FP_IMM(FMAXNM, fmaxnms, zero, one)
+DO_FP_IMM(FMINNM, fminnms, zero, one)
+DO_FP_IMM(FMAX, fmaxs, zero, one)
+DO_FP_IMM(FMIN, fmins, zero, one)
+
+#undef DO_FP_IMM
+
+static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a,
+ gen_helper_gvec_4_ptr *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+#define DO_FPCMP(NAME, name) \
+ static gen_helper_gvec_4_ptr * const name##_fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \
+ }; \
+ TRANS_FEAT(NAME##_ppzz, aa64_sve, do_fp_cmp, a, name##_fns[a->esz])
+
+DO_FPCMP(FCMGE, fcmge)
+DO_FPCMP(FCMGT, fcmgt)
+DO_FPCMP(FCMEQ, fcmeq)
+DO_FPCMP(FCMNE, fcmne)
+DO_FPCMP(FCMUO, fcmuo)
+DO_FPCMP(FACGE, facge)
+DO_FPCMP(FACGT, facgt)
+
+#undef DO_FPCMP
+
+static gen_helper_gvec_4_ptr * const fcadd_fns[] = {
+ NULL, gen_helper_sve_fcadd_h,
+ gen_helper_sve_fcadd_s, gen_helper_sve_fcadd_d,
+};
+TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz],
+ a->rd, a->rn, a->rm, a->pg, a->rot,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+#define DO_FMLA(NAME, name) \
+ static gen_helper_gvec_5_ptr * const name##_fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, name##_fns[a->esz], \
+ a->rd, a->rn, a->rm, a->ra, a->pg, 0, \
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+DO_FMLA(FMLA_zpzzz, fmla_zpzzz)
+DO_FMLA(FMLS_zpzzz, fmls_zpzzz)
+DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz)
+DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz)
+
+#undef DO_FMLA
+
+static gen_helper_gvec_5_ptr * const fcmla_fns[4] = {
+ NULL, gen_helper_sve_fcmla_zpzzz_h,
+ gen_helper_sve_fcmla_zpzzz_s, gen_helper_sve_fcmla_zpzzz_d,
+};
+TRANS_FEAT(FCMLA_zpzzz, aa64_sve, gen_gvec_fpst_zzzzp, fcmla_fns[a->esz],
+ a->rd, a->rn, a->rm, a->ra, a->pg, a->rot,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+static gen_helper_gvec_4_ptr * const fcmla_idx_fns[4] = {
+ NULL, gen_helper_gvec_fcmlah_idx, gen_helper_gvec_fcmlas_idx, NULL
+};
+TRANS_FEAT(FCMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, fcmla_idx_fns[a->esz],
+ a->rd, a->rn, a->rm, a->ra, a->index * 4 + a->rot,
+ a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+/*
+ *** SVE Floating Point Unary Operations Predicated Group
+ */
+
+TRANS_FEAT(FCVT_sh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_sh, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVT_hs, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_hs, a, 0, FPST_FPCR)
+
+TRANS_FEAT(BFCVT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_bfcvt, a, 0, FPST_FPCR)
+
+TRANS_FEAT(FCVT_dh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_dh, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVT_hd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_hd, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVT_ds, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_ds, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVT_sd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvt_sd, a, 0, FPST_FPCR)
+
+TRANS_FEAT(FCVTZS_hh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_hh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(FCVTZU_hh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_hh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(FCVTZS_hs, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_hs, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(FCVTZU_hs, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_hs, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(FCVTZS_hd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_hd, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(FCVTZU_hd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_hd, a, 0, FPST_FPCR_F16)
+
+TRANS_FEAT(FCVTZS_ss, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_ss, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZU_ss, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_ss, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZS_sd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_sd, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZU_sd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_sd, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZS_ds, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_ds, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZU_ds, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_ds, a, 0, FPST_FPCR)
+
+TRANS_FEAT(FCVTZS_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzs_dd, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTZU_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_fcvtzu_dd, a, 0, FPST_FPCR)
+
+static gen_helper_gvec_3_ptr * const frint_fns[] = {
+ NULL,
+ gen_helper_sve_frint_h,
+ gen_helper_sve_frint_s,
+ gen_helper_sve_frint_d
+};
+TRANS_FEAT(FRINTI, aa64_sve, gen_gvec_fpst_arg_zpz, frint_fns[a->esz],
+ a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+static gen_helper_gvec_3_ptr * const frintx_fns[] = {
+ NULL,
+ gen_helper_sve_frintx_h,
+ gen_helper_sve_frintx_s,
+ gen_helper_sve_frintx_d
+};
+TRANS_FEAT(FRINTX, aa64_sve, gen_gvec_fpst_arg_zpz, frintx_fns[a->esz],
+           a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a,
+ int mode, gen_helper_gvec_3_ptr *fn)
+{
+ unsigned vsz;
+ TCGv_i32 tmode;
+ TCGv_ptr status;
+
+ if (fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vsz = vec_full_reg_size(s);
+ tmode = tcg_const_i32(mode);
+ status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+
+ gen_helper_set_rmode(tmode, tmode, status);
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+
+ gen_helper_set_rmode(tmode, tmode, status);
+ tcg_temp_free_i32(tmode);
+ tcg_temp_free_ptr(status);
+ return true;
+}
+
+TRANS_FEAT(FRINTN, aa64_sve, do_frint_mode, a,
+ float_round_nearest_even, frint_fns[a->esz])
+TRANS_FEAT(FRINTP, aa64_sve, do_frint_mode, a,
+ float_round_up, frint_fns[a->esz])
+TRANS_FEAT(FRINTM, aa64_sve, do_frint_mode, a,
+ float_round_down, frint_fns[a->esz])
+TRANS_FEAT(FRINTZ, aa64_sve, do_frint_mode, a,
+ float_round_to_zero, frint_fns[a->esz])
+TRANS_FEAT(FRINTA, aa64_sve, do_frint_mode, a,
+ float_round_ties_away, frint_fns[a->esz])
+
+static gen_helper_gvec_3_ptr * const frecpx_fns[] = {
+ NULL, gen_helper_sve_frecpx_h,
+ gen_helper_sve_frecpx_s, gen_helper_sve_frecpx_d,
+};
+TRANS_FEAT(FRECPX, aa64_sve, gen_gvec_fpst_arg_zpz, frecpx_fns[a->esz],
+ a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+static gen_helper_gvec_3_ptr * const fsqrt_fns[] = {
+ NULL, gen_helper_sve_fsqrt_h,
+ gen_helper_sve_fsqrt_s, gen_helper_sve_fsqrt_d,
+};
+TRANS_FEAT(FSQRT, aa64_sve, gen_gvec_fpst_arg_zpz, fsqrt_fns[a->esz],
+ a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+TRANS_FEAT(SCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_hh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(SCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_sh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(SCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_dh, a, 0, FPST_FPCR_F16)
+
+TRANS_FEAT(SCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_ss, a, 0, FPST_FPCR)
+TRANS_FEAT(SCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_ds, a, 0, FPST_FPCR)
+
+TRANS_FEAT(SCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_sd, a, 0, FPST_FPCR)
+TRANS_FEAT(SCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_scvt_dd, a, 0, FPST_FPCR)
+
+TRANS_FEAT(UCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_hh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(UCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_sh, a, 0, FPST_FPCR_F16)
+TRANS_FEAT(UCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_dh, a, 0, FPST_FPCR_F16)
+
+TRANS_FEAT(UCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_ss, a, 0, FPST_FPCR)
+TRANS_FEAT(UCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_ds, a, 0, FPST_FPCR)
+TRANS_FEAT(UCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_sd, a, 0, FPST_FPCR)
+
+TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_ucvt_dd, a, 0, FPST_FPCR)
+
+/*
+ *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
+ */
+
+/*
+ * Subroutine loading a vector or predicate register at VOFS of LEN bytes.
+ * The load begins at the address Rn + IMM.
+ */
+
+void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
+ int len, int rn, int imm)
+{
+ int len_align = QEMU_ALIGN_DOWN(len, 8);
+ int len_remain = len % 8;
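+    /*
+     * len is a multiple of 2; the 0, 2, 4 or 6 byte tail needs zero, one
+     * or two extra partial accesses beyond the full 8-byte parts.
+     */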
+ int nparts = len / 8 + ctpop8(len_remain);
+ int midx = get_mem_index(s);
+ TCGv_i64 dirty_addr, clean_addr, t0, t1;
+
+ dirty_addr = tcg_temp_new_i64();
+ tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
+ clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len);
+ tcg_temp_free_i64(dirty_addr);
+
+ /*
+ * Note that unpredicated load/store of vector/predicate registers
+ * are defined as a stream of bytes, which equates to little-endian
+ * operations on larger quantities.
+ * Attempt to keep code expansion to a minimum by limiting the
+ * amount of unrolling done.
+ */
+ if (nparts <= 4) {
+ int i;
+
+ t0 = tcg_temp_new_i64();
+ for (i = 0; i < len_align; i += 8) {
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
+ tcg_gen_st_i64(t0, base, vofs + i);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ }
+ tcg_temp_free_i64(t0);
+ } else {
+ TCGLabel *loop = gen_new_label();
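+        /* The loop index must be a local temp, live across the loop. */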
+ TCGv_ptr tp, i = tcg_const_local_ptr(0);
+
+ /* Copy the clean address into a local temp, live across the loop. */
+ t0 = clean_addr;
+ clean_addr = new_tmp_a64_local(s);
+ tcg_gen_mov_i64(clean_addr, t0);
+
+ if (base != cpu_env) {
+ TCGv_ptr b = tcg_temp_local_new_ptr();
+ tcg_gen_mov_ptr(b, base);
+ base = b;
+ }
+
+ gen_set_label(loop);
+
+ t0 = tcg_temp_new_i64();
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+
+ tp = tcg_temp_new_ptr();
+ tcg_gen_add_ptr(tp, base, i);
+ tcg_gen_addi_ptr(i, i, 8);
+ tcg_gen_st_i64(t0, tp, vofs);
+ tcg_temp_free_ptr(tp);
+ tcg_temp_free_i64(t0);
+
+ tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
+ tcg_temp_free_ptr(i);
+
+ if (base != cpu_env) {
+ tcg_temp_free_ptr(base);
+ assert(len_remain == 0);
+ }
+ }
+
+ /*
+ * Predicate register loads can be any multiple of 2.
+ * Note that we still store the entire 64-bit unit into cpu_env.
+ */
+ if (len_remain) {
+ t0 = tcg_temp_new_i64();
+ switch (len_remain) {
+ case 2:
+ case 4:
+ case 8:
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
+ MO_LE | ctz32(len_remain));
+ break;
+
+ case 6:
+ t1 = tcg_temp_new_i64();
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 4);
+ tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
+ tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
+ tcg_temp_free_i64(t1);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ tcg_gen_st_i64(t0, base, vofs + len_align);
+ tcg_temp_free_i64(t0);
+ }
+}
+
+/* Similarly for stores. */
+void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
+ int len, int rn, int imm)
+{
+ int len_align = QEMU_ALIGN_DOWN(len, 8);
+ int len_remain = len % 8;
+ int nparts = len / 8 + ctpop8(len_remain);
+ int midx = get_mem_index(s);
+ TCGv_i64 dirty_addr, clean_addr, t0;
+
+ dirty_addr = tcg_temp_new_i64();
+ tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
+ clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len);
+ tcg_temp_free_i64(dirty_addr);
+
+    /*
+     * Note that unpredicated load/store of vector/predicate registers
+ * are defined as a stream of bytes, which equates to little-endian
+ * operations on larger quantities. There is no nice way to force
+ * a little-endian store for aarch64_be-linux-user out of line.
+ *
+ * Attempt to keep code expansion to a minimum by limiting the
+ * amount of unrolling done.
+ */
+ if (nparts <= 4) {
+ int i;
+
+ t0 = tcg_temp_new_i64();
+ for (i = 0; i < len_align; i += 8) {
+ tcg_gen_ld_i64(t0, base, vofs + i);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ }
+ tcg_temp_free_i64(t0);
+ } else {
+ TCGLabel *loop = gen_new_label();
+ TCGv_ptr tp, i = tcg_const_local_ptr(0);
+
+ /* Copy the clean address into a local temp, live across the loop. */
+ t0 = clean_addr;
+ clean_addr = new_tmp_a64_local(s);
+ tcg_gen_mov_i64(clean_addr, t0);
+
+ if (base != cpu_env) {
+ TCGv_ptr b = tcg_temp_local_new_ptr();
+ tcg_gen_mov_ptr(b, base);
+ base = b;
+ }
+
+ gen_set_label(loop);
+
+ t0 = tcg_temp_new_i64();
+ tp = tcg_temp_new_ptr();
+ tcg_gen_add_ptr(tp, base, i);
+ tcg_gen_ld_i64(t0, tp, vofs);
+ tcg_gen_addi_ptr(i, i, 8);
+ tcg_temp_free_ptr(tp);
+
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ tcg_temp_free_i64(t0);
+
+ tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
+ tcg_temp_free_ptr(i);
+
+ if (base != cpu_env) {
+ tcg_temp_free_ptr(base);
+ assert(len_remain == 0);
+ }
+ }
+
+ /* Predicate register stores can be any multiple of 2. */
+ if (len_remain) {
+ t0 = tcg_temp_new_i64();
+ tcg_gen_ld_i64(t0, base, vofs + len_align);
+
+ switch (len_remain) {
+ case 2:
+ case 4:
+ case 8:
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx,
+ MO_LE | ctz32(len_remain));
+ break;
+
+ case 6:
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 4);
+ tcg_gen_shri_i64(t0, t0, 32);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i64(t0);
+ }
+}
+
+static bool trans_LDR_zri(DisasContext *s, arg_rri *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int size = vec_full_reg_size(s);
+ int off = vec_full_reg_offset(s, a->rd);
+ gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+static bool trans_LDR_pri(DisasContext *s, arg_rri *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int size = pred_full_reg_size(s);
+ int off = pred_full_reg_offset(s, a->rd);
+ gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+static bool trans_STR_zri(DisasContext *s, arg_rri *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int size = vec_full_reg_size(s);
+ int off = vec_full_reg_offset(s, a->rd);
+ gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+static bool trans_STR_pri(DisasContext *s, arg_rri *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int size = pred_full_reg_size(s);
+ int off = pred_full_reg_offset(s, a->rd);
+ gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+/*
+ *** SVE Memory - Contiguous Load Group
+ */
+
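+/*
+ * The 4-bit dtype field of the contiguous load encodings combines the
+ * memory access size with the amount of sign- or zero-extension; the
+ * tables below give its MemOp and vector element size.
+ */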
+/* The memory mode of the dtype. */
+static const MemOp dtype_mop[16] = {
+ MO_UB, MO_UB, MO_UB, MO_UB,
+ MO_SL, MO_UW, MO_UW, MO_UW,
+ MO_SW, MO_SW, MO_UL, MO_UL,
+ MO_SB, MO_SB, MO_SB, MO_UQ
+};
+
+#define dtype_msz(x) (dtype_mop[x] & MO_SIZE)
+
+/* The vector element size of dtype. */
+static const uint8_t dtype_esz[16] = {
+ 0, 1, 2, 3,
+ 3, 1, 2, 3,
+ 3, 2, 2, 3,
+ 3, 2, 1, 3
+};
+
+static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
+ int dtype, uint32_t mte_n, bool is_write,
+ gen_helper_gvec_mem *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_pg;
+ int desc = 0;
+
+ /*
+ * For e.g. LD4, there are not enough arguments to pass all 4
+ * registers as pointers, so encode the regno into the data field.
+ * For consistency, do this even for LD1.
+ */
+ if (s->mte_active[0]) {
+ int msz = dtype_msz(dtype);
+
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+ desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write);
+ desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1);
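+        /* Leave the low bits of the data field free for the regno. */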
+ desc <<= SVE_MTEDESC_SHIFT;
+ } else {
+ addr = clean_data_tbi(s, addr);
+ }
+
+ desc = simd_desc(vsz, vsz, zt | desc);
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+ fn(cpu_env, t_pg, addr, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_pg);
+}
+
+/* Indexed by [mte][be][dtype][nreg] */
+static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
+ { /* mte inactive, little-endian */
+ { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r,
+ gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r },
+ { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r,
+ gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r },
+ { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r,
+ gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r },
+ { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1bds_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bss_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r,
+ gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } },
+
+ /* mte inactive, big-endian */
+ { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r,
+ gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r },
+ { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r,
+ gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r },
+ { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r,
+ gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r },
+ { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1bds_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bss_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r,
+ gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } },
+
+ { /* mte active, little-endian */
+ { { gen_helper_sve_ld1bb_r_mte,
+ gen_helper_sve_ld2bb_r_mte,
+ gen_helper_sve_ld3bb_r_mte,
+ gen_helper_sve_ld4bb_r_mte },
+ { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1sds_le_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hh_le_r_mte,
+ gen_helper_sve_ld2hh_le_r_mte,
+ gen_helper_sve_ld3hh_le_r_mte,
+ gen_helper_sve_ld4hh_le_r_mte },
+ { gen_helper_sve_ld1hsu_le_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hdu_le_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1hds_le_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hss_le_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1ss_le_r_mte,
+ gen_helper_sve_ld2ss_le_r_mte,
+ gen_helper_sve_ld3ss_le_r_mte,
+ gen_helper_sve_ld4ss_le_r_mte },
+ { gen_helper_sve_ld1sdu_le_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dd_le_r_mte,
+ gen_helper_sve_ld2dd_le_r_mte,
+ gen_helper_sve_ld3dd_le_r_mte,
+ gen_helper_sve_ld4dd_le_r_mte } },
+
+ /* mte active, big-endian */
+ { { gen_helper_sve_ld1bb_r_mte,
+ gen_helper_sve_ld2bb_r_mte,
+ gen_helper_sve_ld3bb_r_mte,
+ gen_helper_sve_ld4bb_r_mte },
+ { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1sds_be_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hh_be_r_mte,
+ gen_helper_sve_ld2hh_be_r_mte,
+ gen_helper_sve_ld3hh_be_r_mte,
+ gen_helper_sve_ld4hh_be_r_mte },
+ { gen_helper_sve_ld1hsu_be_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hdu_be_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1hds_be_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hss_be_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1ss_be_r_mte,
+ gen_helper_sve_ld2ss_be_r_mte,
+ gen_helper_sve_ld3ss_be_r_mte,
+ gen_helper_sve_ld4ss_be_r_mte },
+ { gen_helper_sve_ld1sdu_be_r_mte, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dd_be_r_mte,
+ gen_helper_sve_ld2dd_be_r_mte,
+ gen_helper_sve_ld3dd_be_r_mte,
+ gen_helper_sve_ld4dd_be_r_mte } } },
+};
+
+static void do_ld_zpa(DisasContext *s, int zt, int pg,
+ TCGv_i64 addr, int dtype, int nreg)
+{
+ gen_helper_gvec_mem *fn
+ = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][nreg];
+
+ /*
+ * While there are holes in the table, they are not
+ * accessible via the instruction encoding.
+ */
+ assert(fn != NULL);
+ do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn);
+}
+
+static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a)
+{
+ if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> dtype_esz[a->dtype];
+ TCGv_i64 addr = new_tmp_a64(s);
+
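+        /*
+         * The immediate is in units of the whole transfer: the number of
+         * elements per register times the number of registers, scaled by
+         * the memory element size.
+         */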
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn),
+ (a->imm * elements * (a->nreg + 1))
+ << dtype_msz(a->dtype));
+ do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a)
+{
+ static gen_helper_gvec_mem * const fns[2][2][16] = {
+ { /* mte inactive, little-endian */
+ { gen_helper_sve_ldff1bb_r,
+ gen_helper_sve_ldff1bhu_r,
+ gen_helper_sve_ldff1bsu_r,
+ gen_helper_sve_ldff1bdu_r,
+
+ gen_helper_sve_ldff1sds_le_r,
+ gen_helper_sve_ldff1hh_le_r,
+ gen_helper_sve_ldff1hsu_le_r,
+ gen_helper_sve_ldff1hdu_le_r,
+
+ gen_helper_sve_ldff1hds_le_r,
+ gen_helper_sve_ldff1hss_le_r,
+ gen_helper_sve_ldff1ss_le_r,
+ gen_helper_sve_ldff1sdu_le_r,
+
+ gen_helper_sve_ldff1bds_r,
+ gen_helper_sve_ldff1bss_r,
+ gen_helper_sve_ldff1bhs_r,
+ gen_helper_sve_ldff1dd_le_r },
+
+ /* mte inactive, big-endian */
+ { gen_helper_sve_ldff1bb_r,
+ gen_helper_sve_ldff1bhu_r,
+ gen_helper_sve_ldff1bsu_r,
+ gen_helper_sve_ldff1bdu_r,
+
+ gen_helper_sve_ldff1sds_be_r,
+ gen_helper_sve_ldff1hh_be_r,
+ gen_helper_sve_ldff1hsu_be_r,
+ gen_helper_sve_ldff1hdu_be_r,
+
+ gen_helper_sve_ldff1hds_be_r,
+ gen_helper_sve_ldff1hss_be_r,
+ gen_helper_sve_ldff1ss_be_r,
+ gen_helper_sve_ldff1sdu_be_r,
+
+ gen_helper_sve_ldff1bds_r,
+ gen_helper_sve_ldff1bss_r,
+ gen_helper_sve_ldff1bhs_r,
+ gen_helper_sve_ldff1dd_be_r } },
+
+ { /* mte active, little-endian */
+ { gen_helper_sve_ldff1bb_r_mte,
+ gen_helper_sve_ldff1bhu_r_mte,
+ gen_helper_sve_ldff1bsu_r_mte,
+ gen_helper_sve_ldff1bdu_r_mte,
+
+ gen_helper_sve_ldff1sds_le_r_mte,
+ gen_helper_sve_ldff1hh_le_r_mte,
+ gen_helper_sve_ldff1hsu_le_r_mte,
+ gen_helper_sve_ldff1hdu_le_r_mte,
+
+ gen_helper_sve_ldff1hds_le_r_mte,
+ gen_helper_sve_ldff1hss_le_r_mte,
+ gen_helper_sve_ldff1ss_le_r_mte,
+ gen_helper_sve_ldff1sdu_le_r_mte,
+
+ gen_helper_sve_ldff1bds_r_mte,
+ gen_helper_sve_ldff1bss_r_mte,
+ gen_helper_sve_ldff1bhs_r_mte,
+ gen_helper_sve_ldff1dd_le_r_mte },
+
+ /* mte active, big-endian */
+ { gen_helper_sve_ldff1bb_r_mte,
+ gen_helper_sve_ldff1bhu_r_mte,
+ gen_helper_sve_ldff1bsu_r_mte,
+ gen_helper_sve_ldff1bdu_r_mte,
+
+ gen_helper_sve_ldff1sds_be_r_mte,
+ gen_helper_sve_ldff1hh_be_r_mte,
+ gen_helper_sve_ldff1hsu_be_r_mte,
+ gen_helper_sve_ldff1hdu_be_r_mte,
+
+ gen_helper_sve_ldff1hds_be_r_mte,
+ gen_helper_sve_ldff1hss_be_r_mte,
+ gen_helper_sve_ldff1ss_be_r_mte,
+ gen_helper_sve_ldff1sdu_be_r_mte,
+
+ gen_helper_sve_ldff1bds_r_mte,
+ gen_helper_sve_ldff1bss_r_mte,
+ gen_helper_sve_ldff1bhs_r_mte,
+ gen_helper_sve_ldff1dd_be_r_mte } },
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false,
+ fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]);
+ }
+ return true;
+}
+
+static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ static gen_helper_gvec_mem * const fns[2][2][16] = {
+ { /* mte inactive, little-endian */
+ { gen_helper_sve_ldnf1bb_r,
+ gen_helper_sve_ldnf1bhu_r,
+ gen_helper_sve_ldnf1bsu_r,
+ gen_helper_sve_ldnf1bdu_r,
+
+ gen_helper_sve_ldnf1sds_le_r,
+ gen_helper_sve_ldnf1hh_le_r,
+ gen_helper_sve_ldnf1hsu_le_r,
+ gen_helper_sve_ldnf1hdu_le_r,
+
+ gen_helper_sve_ldnf1hds_le_r,
+ gen_helper_sve_ldnf1hss_le_r,
+ gen_helper_sve_ldnf1ss_le_r,
+ gen_helper_sve_ldnf1sdu_le_r,
+
+ gen_helper_sve_ldnf1bds_r,
+ gen_helper_sve_ldnf1bss_r,
+ gen_helper_sve_ldnf1bhs_r,
+ gen_helper_sve_ldnf1dd_le_r },
+
+ /* mte inactive, big-endian */
+ { gen_helper_sve_ldnf1bb_r,
+ gen_helper_sve_ldnf1bhu_r,
+ gen_helper_sve_ldnf1bsu_r,
+ gen_helper_sve_ldnf1bdu_r,
+
+ gen_helper_sve_ldnf1sds_be_r,
+ gen_helper_sve_ldnf1hh_be_r,
+ gen_helper_sve_ldnf1hsu_be_r,
+ gen_helper_sve_ldnf1hdu_be_r,
+
+ gen_helper_sve_ldnf1hds_be_r,
+ gen_helper_sve_ldnf1hss_be_r,
+ gen_helper_sve_ldnf1ss_be_r,
+ gen_helper_sve_ldnf1sdu_be_r,
+
+ gen_helper_sve_ldnf1bds_r,
+ gen_helper_sve_ldnf1bss_r,
+ gen_helper_sve_ldnf1bhs_r,
+ gen_helper_sve_ldnf1dd_be_r } },
+
+        { /* mte active, little-endian */
+ { gen_helper_sve_ldnf1bb_r_mte,
+ gen_helper_sve_ldnf1bhu_r_mte,
+ gen_helper_sve_ldnf1bsu_r_mte,
+ gen_helper_sve_ldnf1bdu_r_mte,
+
+ gen_helper_sve_ldnf1sds_le_r_mte,
+ gen_helper_sve_ldnf1hh_le_r_mte,
+ gen_helper_sve_ldnf1hsu_le_r_mte,
+ gen_helper_sve_ldnf1hdu_le_r_mte,
+
+ gen_helper_sve_ldnf1hds_le_r_mte,
+ gen_helper_sve_ldnf1hss_le_r_mte,
+ gen_helper_sve_ldnf1ss_le_r_mte,
+ gen_helper_sve_ldnf1sdu_le_r_mte,
+
+ gen_helper_sve_ldnf1bds_r_mte,
+ gen_helper_sve_ldnf1bss_r_mte,
+ gen_helper_sve_ldnf1bhs_r_mte,
+ gen_helper_sve_ldnf1dd_le_r_mte },
+
+          /* mte active, big-endian */
+ { gen_helper_sve_ldnf1bb_r_mte,
+ gen_helper_sve_ldnf1bhu_r_mte,
+ gen_helper_sve_ldnf1bsu_r_mte,
+ gen_helper_sve_ldnf1bdu_r_mte,
+
+ gen_helper_sve_ldnf1sds_be_r_mte,
+ gen_helper_sve_ldnf1hh_be_r_mte,
+ gen_helper_sve_ldnf1hsu_be_r_mte,
+ gen_helper_sve_ldnf1hdu_be_r_mte,
+
+ gen_helper_sve_ldnf1hds_be_r_mte,
+ gen_helper_sve_ldnf1hss_be_r_mte,
+ gen_helper_sve_ldnf1ss_be_r_mte,
+ gen_helper_sve_ldnf1sdu_be_r_mte,
+
+ gen_helper_sve_ldnf1bds_r_mte,
+ gen_helper_sve_ldnf1bss_r_mte,
+ gen_helper_sve_ldnf1bhs_r_mte,
+ gen_helper_sve_ldnf1dd_be_r_mte } },
+ };
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> dtype_esz[a->dtype];
+ int off = (a->imm * elements) << dtype_msz(a->dtype);
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off);
+ do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false,
+ fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]);
+ }
+ return true;
+}
+
+static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_pg;
+ int poff;
+
+ /* Load the first quadword using the normal predicated load helpers. */
+ poff = pred_full_reg_offset(s, pg);
+ if (vsz > 16) {
+ /*
+ * Zero-extend the first 16 bits of the predicate into a temporary.
+         * This avoids triggering the assert that checks that no bits
+         * are set in a predicate beyond VQ, since we have effectively
+         * lowered VQ to 1 for this load operation.
+ */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+#if HOST_BIG_ENDIAN
+ poff += 6;
+#endif
+ tcg_gen_ld16u_i64(tmp, cpu_env, poff);
+
+ poff = offsetof(CPUARMState, vfp.preg_tmp);
+ tcg_gen_st_i64(tmp, cpu_env, poff);
+ tcg_temp_free_i64(tmp);
+ }
+
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_pg, cpu_env, poff);
+
+ gen_helper_gvec_mem *fn
+ = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0];
+ fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(16, 16, zt)));
+
+ tcg_temp_free_ptr(t_pg);
+
+ /* Replicate that first quadword. */
+ if (vsz > 16) {
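+        /* An element size of 2^4 = 16 bytes replicates whole quadwords. */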
+ int doff = vec_full_reg_offset(s, zt);
+ tcg_gen_gvec_dup_mem(4, doff + 16, doff, vsz - 16, vsz - 16);
+ }
+}
+
+static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a)
+{
+ if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int msz = dtype_msz(a->dtype);
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ldrq(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
+static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16);
+ do_ldrq(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
+static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned vsz_r32;
+ TCGv_ptr t_pg;
+ int poff, doff;
+
+ if (vsz < 32) {
+ /*
+ * Note that this UNDEFINED check comes after CheckSVEEnabled()
+ * in the ARM pseudocode, which is the sve_access_check() done
+         * in our caller, so the caller must not now return false.
+ */
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Load the first octaword using the normal predicated load helpers. */
+
+ poff = pred_full_reg_offset(s, pg);
+ if (vsz > 32) {
+ /*
+ * Zero-extend the first 32 bits of the predicate into a temporary.
+         * This avoids triggering the assert that checks that no bits
+         * are set in a predicate beyond VQ, since we have effectively
+         * lowered VQ to 2 for this load operation.
+ */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+#if HOST_BIG_ENDIAN
+ poff += 4;
+#endif
+ tcg_gen_ld32u_i64(tmp, cpu_env, poff);
+
+ poff = offsetof(CPUARMState, vfp.preg_tmp);
+ tcg_gen_st_i64(tmp, cpu_env, poff);
+ tcg_temp_free_i64(tmp);
+ }
+
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_pg, cpu_env, poff);
+
+ gen_helper_gvec_mem *fn
+ = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0];
+ fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(32, 32, zt)));
+
+ tcg_temp_free_ptr(t_pg);
+
+ /*
+ * Replicate that first octaword.
+ * The replication happens in units of 32; if the full vector size
+ * is not a multiple of 32, the final bits are zeroed.
+ */
+ doff = vec_full_reg_offset(s, zt);
+ vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32);
+ if (vsz >= 64) {
+ tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz_r32 - 32);
+ }
+ vsz -= vsz_r32;
+ if (vsz) {
+ tcg_gen_gvec_dup_imm(MO_64, doff + vsz_r32, vsz, vsz, 0);
+ }
+}
+
+static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a)
+{
+ if (!dc_isar_feature(aa64_sve_f64mm, s)) {
+ return false;
+ }
+ if (a->rm == 31) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ldro(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
+static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ if (!dc_isar_feature(aa64_sve_f64mm, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32);
+ do_ldro(s, a->rd, a->pg, addr, a->dtype);
+ }
+ return true;
+}
+
+/* Load and broadcast element. */
+static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned psz = pred_full_reg_size(s);
+ unsigned esz = dtype_esz[a->dtype];
+ unsigned msz = dtype_msz(a->dtype);
+ TCGLabel *over;
+ TCGv_i64 temp, clean_addr;
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ over = gen_new_label();
+
+ /* If the guarding predicate has no bits set, no load occurs. */
+ if (psz <= 8) {
+ /* Reduce the pred_esz_masks value simply to reduce the
+ * size of the code generated here.
+ */
+ uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8);
+ temp = tcg_temp_new_i64();
+ tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg));
+ tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask);
+ tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over);
+ tcg_temp_free_i64(temp);
+ } else {
+ TCGv_i32 t32 = tcg_temp_new_i32();
+ find_last_active(s, t32, esz, a->pg);
+ tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over);
+ tcg_temp_free_i32(t32);
+ }
+
+ /* Load the data. */
+ temp = tcg_temp_new_i64();
+ tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz);
+ clean_addr = gen_mte_check1(s, temp, false, true, msz);
+
+ tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s),
+ finalize_memop(s, dtype_mop[a->dtype]));
+
+ /* Broadcast to *all* elements. */
+ tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd),
+ vsz, vsz, temp);
+ tcg_temp_free_i64(temp);
+
+ /* Zero the inactive elements. */
+ gen_set_label(over);
+ return do_movz_zpz(s, a->rd, a->rd, a->pg, esz, false);
+}
+
+static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
+ int msz, int esz, int nreg)
+{
+ static gen_helper_gvec_mem * const fn_single[2][2][4][4] = {
+ { { { gen_helper_sve_st1bb_r,
+ gen_helper_sve_st1bh_r,
+ gen_helper_sve_st1bs_r,
+ gen_helper_sve_st1bd_r },
+ { NULL,
+ gen_helper_sve_st1hh_le_r,
+ gen_helper_sve_st1hs_le_r,
+ gen_helper_sve_st1hd_le_r },
+ { NULL, NULL,
+ gen_helper_sve_st1ss_le_r,
+ gen_helper_sve_st1sd_le_r },
+ { NULL, NULL, NULL,
+ gen_helper_sve_st1dd_le_r } },
+ { { gen_helper_sve_st1bb_r,
+ gen_helper_sve_st1bh_r,
+ gen_helper_sve_st1bs_r,
+ gen_helper_sve_st1bd_r },
+ { NULL,
+ gen_helper_sve_st1hh_be_r,
+ gen_helper_sve_st1hs_be_r,
+ gen_helper_sve_st1hd_be_r },
+ { NULL, NULL,
+ gen_helper_sve_st1ss_be_r,
+ gen_helper_sve_st1sd_be_r },
+ { NULL, NULL, NULL,
+ gen_helper_sve_st1dd_be_r } } },
+
+ { { { gen_helper_sve_st1bb_r_mte,
+ gen_helper_sve_st1bh_r_mte,
+ gen_helper_sve_st1bs_r_mte,
+ gen_helper_sve_st1bd_r_mte },
+ { NULL,
+ gen_helper_sve_st1hh_le_r_mte,
+ gen_helper_sve_st1hs_le_r_mte,
+ gen_helper_sve_st1hd_le_r_mte },
+ { NULL, NULL,
+ gen_helper_sve_st1ss_le_r_mte,
+ gen_helper_sve_st1sd_le_r_mte },
+ { NULL, NULL, NULL,
+ gen_helper_sve_st1dd_le_r_mte } },
+ { { gen_helper_sve_st1bb_r_mte,
+ gen_helper_sve_st1bh_r_mte,
+ gen_helper_sve_st1bs_r_mte,
+ gen_helper_sve_st1bd_r_mte },
+ { NULL,
+ gen_helper_sve_st1hh_be_r_mte,
+ gen_helper_sve_st1hs_be_r_mte,
+ gen_helper_sve_st1hd_be_r_mte },
+ { NULL, NULL,
+ gen_helper_sve_st1ss_be_r_mte,
+ gen_helper_sve_st1sd_be_r_mte },
+ { NULL, NULL, NULL,
+ gen_helper_sve_st1dd_be_r_mte } } },
+ };
+ static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = {
+ { { { gen_helper_sve_st2bb_r,
+ gen_helper_sve_st2hh_le_r,
+ gen_helper_sve_st2ss_le_r,
+ gen_helper_sve_st2dd_le_r },
+ { gen_helper_sve_st3bb_r,
+ gen_helper_sve_st3hh_le_r,
+ gen_helper_sve_st3ss_le_r,
+ gen_helper_sve_st3dd_le_r },
+ { gen_helper_sve_st4bb_r,
+ gen_helper_sve_st4hh_le_r,
+ gen_helper_sve_st4ss_le_r,
+ gen_helper_sve_st4dd_le_r } },
+ { { gen_helper_sve_st2bb_r,
+ gen_helper_sve_st2hh_be_r,
+ gen_helper_sve_st2ss_be_r,
+ gen_helper_sve_st2dd_be_r },
+ { gen_helper_sve_st3bb_r,
+ gen_helper_sve_st3hh_be_r,
+ gen_helper_sve_st3ss_be_r,
+ gen_helper_sve_st3dd_be_r },
+ { gen_helper_sve_st4bb_r,
+ gen_helper_sve_st4hh_be_r,
+ gen_helper_sve_st4ss_be_r,
+ gen_helper_sve_st4dd_be_r } } },
+ { { { gen_helper_sve_st2bb_r_mte,
+ gen_helper_sve_st2hh_le_r_mte,
+ gen_helper_sve_st2ss_le_r_mte,
+ gen_helper_sve_st2dd_le_r_mte },
+ { gen_helper_sve_st3bb_r_mte,
+ gen_helper_sve_st3hh_le_r_mte,
+ gen_helper_sve_st3ss_le_r_mte,
+ gen_helper_sve_st3dd_le_r_mte },
+ { gen_helper_sve_st4bb_r_mte,
+ gen_helper_sve_st4hh_le_r_mte,
+ gen_helper_sve_st4ss_le_r_mte,
+ gen_helper_sve_st4dd_le_r_mte } },
+ { { gen_helper_sve_st2bb_r_mte,
+ gen_helper_sve_st2hh_be_r_mte,
+ gen_helper_sve_st2ss_be_r_mte,
+ gen_helper_sve_st2dd_be_r_mte },
+ { gen_helper_sve_st3bb_r_mte,
+ gen_helper_sve_st3hh_be_r_mte,
+ gen_helper_sve_st3ss_be_r_mte,
+ gen_helper_sve_st3dd_be_r_mte },
+ { gen_helper_sve_st4bb_r_mte,
+ gen_helper_sve_st4hh_be_r_mte,
+ gen_helper_sve_st4ss_be_r_mte,
+ gen_helper_sve_st4dd_be_r_mte } } },
+ };
+ gen_helper_gvec_mem *fn;
+ int be = s->be_data == MO_BE;
+
+ if (nreg == 0) {
+ /* ST1 */
+ fn = fn_single[s->mte_active[0]][be][msz][esz];
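+        /* do_mem_zpa needs the true register count for the MTE size. */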
+ nreg = 1;
+ } else {
+ /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */
+ assert(msz == esz);
+ fn = fn_multiple[s->mte_active[0]][be][nreg - 1][msz];
+ }
+ assert(fn != NULL);
+ do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn);
+}
+
+static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (a->rm == 31 || a->msz > a->esz) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ if (a->msz > a->esz) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> a->esz;
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn),
+ (a->imm * elements * (a->nreg + 1)) << a->msz);
+ do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg);
+ }
+ return true;
+}
+
+/*
+ *** SVE gather loads / scatter stores
+ */
+
+static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm,
+ int scale, TCGv_i64 scalar, int msz, bool is_write,
+ gen_helper_gvec_mem_scatter *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_zm = tcg_temp_new_ptr();
+ TCGv_ptr t_pg = tcg_temp_new_ptr();
+ TCGv_ptr t_zt = tcg_temp_new_ptr();
+ int desc = 0;
+
+ if (s->mte_active[0]) {
+ desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
+ desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
+ desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
+ desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write);
+ desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << msz) - 1);
+ desc <<= SVE_MTEDESC_SHIFT;
+ }
+ desc = simd_desc(vsz, vsz, desc | scale);
+
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+ tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm));
+ tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt));
+ fn(cpu_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i32(desc));
+
+ tcg_temp_free_ptr(t_zt);
+ tcg_temp_free_ptr(t_zm);
+ tcg_temp_free_ptr(t_pg);
+}
+
+/* Indexed by [mte][be][ff][xs][u][msz]. */
+static gen_helper_gvec_mem_scatter * const
+gather_load_fn32[2][2][2][2][2][3] = {
+ { /* MTE Inactive */
+ { /* Little-endian */
+ { { { gen_helper_sve_ldbss_zsu,
+ gen_helper_sve_ldhss_le_zsu,
+ NULL, },
+ { gen_helper_sve_ldbsu_zsu,
+ gen_helper_sve_ldhsu_le_zsu,
+ gen_helper_sve_ldss_le_zsu, } },
+ { { gen_helper_sve_ldbss_zss,
+ gen_helper_sve_ldhss_le_zss,
+ NULL, },
+ { gen_helper_sve_ldbsu_zss,
+ gen_helper_sve_ldhsu_le_zss,
+ gen_helper_sve_ldss_le_zss, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbss_zsu,
+ gen_helper_sve_ldffhss_le_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zsu,
+ gen_helper_sve_ldffhsu_le_zsu,
+ gen_helper_sve_ldffss_le_zsu, } },
+ { { gen_helper_sve_ldffbss_zss,
+ gen_helper_sve_ldffhss_le_zss,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zss,
+ gen_helper_sve_ldffhsu_le_zss,
+ gen_helper_sve_ldffss_le_zss, } } } },
+
+ { /* Big-endian */
+ { { { gen_helper_sve_ldbss_zsu,
+ gen_helper_sve_ldhss_be_zsu,
+ NULL, },
+ { gen_helper_sve_ldbsu_zsu,
+ gen_helper_sve_ldhsu_be_zsu,
+ gen_helper_sve_ldss_be_zsu, } },
+ { { gen_helper_sve_ldbss_zss,
+ gen_helper_sve_ldhss_be_zss,
+ NULL, },
+ { gen_helper_sve_ldbsu_zss,
+ gen_helper_sve_ldhsu_be_zss,
+ gen_helper_sve_ldss_be_zss, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbss_zsu,
+ gen_helper_sve_ldffhss_be_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zsu,
+ gen_helper_sve_ldffhsu_be_zsu,
+ gen_helper_sve_ldffss_be_zsu, } },
+ { { gen_helper_sve_ldffbss_zss,
+ gen_helper_sve_ldffhss_be_zss,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zss,
+ gen_helper_sve_ldffhsu_be_zss,
+ gen_helper_sve_ldffss_be_zss, } } } } },
+ { /* MTE Active */
+ { /* Little-endian */
+ { { { gen_helper_sve_ldbss_zsu_mte,
+ gen_helper_sve_ldhss_le_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldbsu_zsu_mte,
+ gen_helper_sve_ldhsu_le_zsu_mte,
+ gen_helper_sve_ldss_le_zsu_mte, } },
+ { { gen_helper_sve_ldbss_zss_mte,
+ gen_helper_sve_ldhss_le_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldbsu_zss_mte,
+ gen_helper_sve_ldhsu_le_zss_mte,
+ gen_helper_sve_ldss_le_zss_mte, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbss_zsu_mte,
+ gen_helper_sve_ldffhss_le_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zsu_mte,
+ gen_helper_sve_ldffhsu_le_zsu_mte,
+ gen_helper_sve_ldffss_le_zsu_mte, } },
+ { { gen_helper_sve_ldffbss_zss_mte,
+ gen_helper_sve_ldffhss_le_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zss_mte,
+ gen_helper_sve_ldffhsu_le_zss_mte,
+ gen_helper_sve_ldffss_le_zss_mte, } } } },
+
+ { /* Big-endian */
+ { { { gen_helper_sve_ldbss_zsu_mte,
+ gen_helper_sve_ldhss_be_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldbsu_zsu_mte,
+ gen_helper_sve_ldhsu_be_zsu_mte,
+ gen_helper_sve_ldss_be_zsu_mte, } },
+ { { gen_helper_sve_ldbss_zss_mte,
+ gen_helper_sve_ldhss_be_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldbsu_zss_mte,
+ gen_helper_sve_ldhsu_be_zss_mte,
+ gen_helper_sve_ldss_be_zss_mte, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbss_zsu_mte,
+ gen_helper_sve_ldffhss_be_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zsu_mte,
+ gen_helper_sve_ldffhsu_be_zsu_mte,
+ gen_helper_sve_ldffss_be_zsu_mte, } },
+ { { gen_helper_sve_ldffbss_zss_mte,
+ gen_helper_sve_ldffhss_be_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zss_mte,
+ gen_helper_sve_ldffhsu_be_zss_mte,
+ gen_helper_sve_ldffss_be_zss_mte, } } } } },
+};
+
+/* Note that we overload xs=2 to indicate 64-bit offset. */
+static gen_helper_gvec_mem_scatter * const
+gather_load_fn64[2][2][2][3][2][4] = {
+ { /* MTE Inactive */
+ { /* Little-endian */
+ { { { gen_helper_sve_ldbds_zsu,
+ gen_helper_sve_ldhds_le_zsu,
+ gen_helper_sve_ldsds_le_zsu,
+ NULL, },
+ { gen_helper_sve_ldbdu_zsu,
+ gen_helper_sve_ldhdu_le_zsu,
+ gen_helper_sve_ldsdu_le_zsu,
+ gen_helper_sve_lddd_le_zsu, } },
+ { { gen_helper_sve_ldbds_zss,
+ gen_helper_sve_ldhds_le_zss,
+ gen_helper_sve_ldsds_le_zss,
+ NULL, },
+ { gen_helper_sve_ldbdu_zss,
+ gen_helper_sve_ldhdu_le_zss,
+ gen_helper_sve_ldsdu_le_zss,
+ gen_helper_sve_lddd_le_zss, } },
+ { { gen_helper_sve_ldbds_zd,
+ gen_helper_sve_ldhds_le_zd,
+ gen_helper_sve_ldsds_le_zd,
+ NULL, },
+ { gen_helper_sve_ldbdu_zd,
+ gen_helper_sve_ldhdu_le_zd,
+ gen_helper_sve_ldsdu_le_zd,
+ gen_helper_sve_lddd_le_zd, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbds_zsu,
+ gen_helper_sve_ldffhds_le_zsu,
+ gen_helper_sve_ldffsds_le_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zsu,
+ gen_helper_sve_ldffhdu_le_zsu,
+ gen_helper_sve_ldffsdu_le_zsu,
+ gen_helper_sve_ldffdd_le_zsu, } },
+ { { gen_helper_sve_ldffbds_zss,
+ gen_helper_sve_ldffhds_le_zss,
+ gen_helper_sve_ldffsds_le_zss,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zss,
+ gen_helper_sve_ldffhdu_le_zss,
+ gen_helper_sve_ldffsdu_le_zss,
+ gen_helper_sve_ldffdd_le_zss, } },
+ { { gen_helper_sve_ldffbds_zd,
+ gen_helper_sve_ldffhds_le_zd,
+ gen_helper_sve_ldffsds_le_zd,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zd,
+ gen_helper_sve_ldffhdu_le_zd,
+ gen_helper_sve_ldffsdu_le_zd,
+ gen_helper_sve_ldffdd_le_zd, } } } },
+ { /* Big-endian */
+ { { { gen_helper_sve_ldbds_zsu,
+ gen_helper_sve_ldhds_be_zsu,
+ gen_helper_sve_ldsds_be_zsu,
+ NULL, },
+ { gen_helper_sve_ldbdu_zsu,
+ gen_helper_sve_ldhdu_be_zsu,
+ gen_helper_sve_ldsdu_be_zsu,
+ gen_helper_sve_lddd_be_zsu, } },
+ { { gen_helper_sve_ldbds_zss,
+ gen_helper_sve_ldhds_be_zss,
+ gen_helper_sve_ldsds_be_zss,
+ NULL, },
+ { gen_helper_sve_ldbdu_zss,
+ gen_helper_sve_ldhdu_be_zss,
+ gen_helper_sve_ldsdu_be_zss,
+ gen_helper_sve_lddd_be_zss, } },
+ { { gen_helper_sve_ldbds_zd,
+ gen_helper_sve_ldhds_be_zd,
+ gen_helper_sve_ldsds_be_zd,
+ NULL, },
+ { gen_helper_sve_ldbdu_zd,
+ gen_helper_sve_ldhdu_be_zd,
+ gen_helper_sve_ldsdu_be_zd,
+ gen_helper_sve_lddd_be_zd, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbds_zsu,
+ gen_helper_sve_ldffhds_be_zsu,
+ gen_helper_sve_ldffsds_be_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zsu,
+ gen_helper_sve_ldffhdu_be_zsu,
+ gen_helper_sve_ldffsdu_be_zsu,
+ gen_helper_sve_ldffdd_be_zsu, } },
+ { { gen_helper_sve_ldffbds_zss,
+ gen_helper_sve_ldffhds_be_zss,
+ gen_helper_sve_ldffsds_be_zss,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zss,
+ gen_helper_sve_ldffhdu_be_zss,
+ gen_helper_sve_ldffsdu_be_zss,
+ gen_helper_sve_ldffdd_be_zss, } },
+ { { gen_helper_sve_ldffbds_zd,
+ gen_helper_sve_ldffhds_be_zd,
+ gen_helper_sve_ldffsds_be_zd,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zd,
+ gen_helper_sve_ldffhdu_be_zd,
+ gen_helper_sve_ldffsdu_be_zd,
+ gen_helper_sve_ldffdd_be_zd, } } } } },
+ { /* MTE Active */
+ { /* Little-endian */
+ { { { gen_helper_sve_ldbds_zsu_mte,
+ gen_helper_sve_ldhds_le_zsu_mte,
+ gen_helper_sve_ldsds_le_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zsu_mte,
+ gen_helper_sve_ldhdu_le_zsu_mte,
+ gen_helper_sve_ldsdu_le_zsu_mte,
+ gen_helper_sve_lddd_le_zsu_mte, } },
+ { { gen_helper_sve_ldbds_zss_mte,
+ gen_helper_sve_ldhds_le_zss_mte,
+ gen_helper_sve_ldsds_le_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zss_mte,
+ gen_helper_sve_ldhdu_le_zss_mte,
+ gen_helper_sve_ldsdu_le_zss_mte,
+ gen_helper_sve_lddd_le_zss_mte, } },
+ { { gen_helper_sve_ldbds_zd_mte,
+ gen_helper_sve_ldhds_le_zd_mte,
+ gen_helper_sve_ldsds_le_zd_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zd_mte,
+ gen_helper_sve_ldhdu_le_zd_mte,
+ gen_helper_sve_ldsdu_le_zd_mte,
+ gen_helper_sve_lddd_le_zd_mte, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbds_zsu_mte,
+ gen_helper_sve_ldffhds_le_zsu_mte,
+ gen_helper_sve_ldffsds_le_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zsu_mte,
+ gen_helper_sve_ldffhdu_le_zsu_mte,
+ gen_helper_sve_ldffsdu_le_zsu_mte,
+ gen_helper_sve_ldffdd_le_zsu_mte, } },
+ { { gen_helper_sve_ldffbds_zss_mte,
+ gen_helper_sve_ldffhds_le_zss_mte,
+ gen_helper_sve_ldffsds_le_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zss_mte,
+ gen_helper_sve_ldffhdu_le_zss_mte,
+ gen_helper_sve_ldffsdu_le_zss_mte,
+ gen_helper_sve_ldffdd_le_zss_mte, } },
+ { { gen_helper_sve_ldffbds_zd_mte,
+ gen_helper_sve_ldffhds_le_zd_mte,
+ gen_helper_sve_ldffsds_le_zd_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zd_mte,
+ gen_helper_sve_ldffhdu_le_zd_mte,
+ gen_helper_sve_ldffsdu_le_zd_mte,
+ gen_helper_sve_ldffdd_le_zd_mte, } } } },
+ { /* Big-endian */
+ { { { gen_helper_sve_ldbds_zsu_mte,
+ gen_helper_sve_ldhds_be_zsu_mte,
+ gen_helper_sve_ldsds_be_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zsu_mte,
+ gen_helper_sve_ldhdu_be_zsu_mte,
+ gen_helper_sve_ldsdu_be_zsu_mte,
+ gen_helper_sve_lddd_be_zsu_mte, } },
+ { { gen_helper_sve_ldbds_zss_mte,
+ gen_helper_sve_ldhds_be_zss_mte,
+ gen_helper_sve_ldsds_be_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zss_mte,
+ gen_helper_sve_ldhdu_be_zss_mte,
+ gen_helper_sve_ldsdu_be_zss_mte,
+ gen_helper_sve_lddd_be_zss_mte, } },
+ { { gen_helper_sve_ldbds_zd_mte,
+ gen_helper_sve_ldhds_be_zd_mte,
+ gen_helper_sve_ldsds_be_zd_mte,
+ NULL, },
+ { gen_helper_sve_ldbdu_zd_mte,
+ gen_helper_sve_ldhdu_be_zd_mte,
+ gen_helper_sve_ldsdu_be_zd_mte,
+ gen_helper_sve_lddd_be_zd_mte, } } },
+
+ /* First-fault */
+ { { { gen_helper_sve_ldffbds_zsu_mte,
+ gen_helper_sve_ldffhds_be_zsu_mte,
+ gen_helper_sve_ldffsds_be_zsu_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zsu_mte,
+ gen_helper_sve_ldffhdu_be_zsu_mte,
+ gen_helper_sve_ldffsdu_be_zsu_mte,
+ gen_helper_sve_ldffdd_be_zsu_mte, } },
+ { { gen_helper_sve_ldffbds_zss_mte,
+ gen_helper_sve_ldffhds_be_zss_mte,
+ gen_helper_sve_ldffsds_be_zss_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zss_mte,
+ gen_helper_sve_ldffhdu_be_zss_mte,
+ gen_helper_sve_ldffsdu_be_zss_mte,
+ gen_helper_sve_ldffdd_be_zss_mte, } },
+ { { gen_helper_sve_ldffbds_zd_mte,
+ gen_helper_sve_ldffhds_be_zd_mte,
+ gen_helper_sve_ldffsds_be_zd_mte,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zd_mte,
+ gen_helper_sve_ldffhdu_be_zd_mte,
+ gen_helper_sve_ldffsdu_be_zd_mte,
+ gen_helper_sve_ldffdd_be_zd_mte, } } } } },
+};
+
+static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = gather_load_fn32[mte][be][a->ff][a->xs][a->u][a->msz];
+ break;
+ case MO_64:
+ fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
+ cpu_reg_sp(s, a->rn), a->msz, false, fn);
+ return true;
+}
+
+static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (a->esz < a->msz || (a->esz == a->msz && !a->u)) {
+ return false;
+ }
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = gather_load_fn32[mte][be][a->ff][0][a->u][a->msz];
+ break;
+ case MO_64:
+ fn = gather_load_fn64[mte][be][a->ff][2][a->u][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x])
+ * by loading the immediate into the scalar parameter.
+ */
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0,
+ tcg_constant_i64(a->imm << a->msz), a->msz, false, fn);
+ return true;
+}
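Editorial aside (not part of the patch): the vector-plus-immediate form above reuses the gather path by passing the scaled immediate as the scalar operand and, for 64-bit elements, selecting the xs = 2 column that the helper tables reserve for 64-bit vector offsets. A minimal sketch of the per-element address this resolves to, with ref_ld1_zpiz_addr a hypothetical name:

    #include <stdint.h>

    /* Sketch only: address of element i for LD1 (vector plus immediate). */
    static uint64_t ref_ld1_zpiz_addr(uint64_t zn_i, uint64_t imm, int msz)
    {
        /* The translator passes tcg_constant_i64(imm << msz) as the "base". */
        return zn_i + (imm << msz);
    }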
+
+static bool trans_LDNT1_zprz(DisasContext *s, arg_LD1_zprz *a)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (a->esz < a->msz + !a->u) {
+ return false;
+ }
+ if (!dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = gather_load_fn32[mte][be][0][0][a->u][a->msz];
+ break;
+ case MO_64:
+ fn = gather_load_fn64[mte][be][0][2][a->u][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0,
+ cpu_reg(s, a->rm), a->msz, false, fn);
+ return true;
+}
+
+/* Indexed by [mte][be][xs][msz]. */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][2][3] = {
+ { /* MTE Inactive */
+ { /* Little-endian */
+ { gen_helper_sve_stbs_zsu,
+ gen_helper_sve_sths_le_zsu,
+ gen_helper_sve_stss_le_zsu, },
+ { gen_helper_sve_stbs_zss,
+ gen_helper_sve_sths_le_zss,
+ gen_helper_sve_stss_le_zss, } },
+ { /* Big-endian */
+ { gen_helper_sve_stbs_zsu,
+ gen_helper_sve_sths_be_zsu,
+ gen_helper_sve_stss_be_zsu, },
+ { gen_helper_sve_stbs_zss,
+ gen_helper_sve_sths_be_zss,
+ gen_helper_sve_stss_be_zss, } } },
+ { /* MTE Active */
+ { /* Little-endian */
+ { gen_helper_sve_stbs_zsu_mte,
+ gen_helper_sve_sths_le_zsu_mte,
+ gen_helper_sve_stss_le_zsu_mte, },
+ { gen_helper_sve_stbs_zss_mte,
+ gen_helper_sve_sths_le_zss_mte,
+ gen_helper_sve_stss_le_zss_mte, } },
+ { /* Big-endian */
+ { gen_helper_sve_stbs_zsu_mte,
+ gen_helper_sve_sths_be_zsu_mte,
+ gen_helper_sve_stss_be_zsu_mte, },
+ { gen_helper_sve_stbs_zss_mte,
+ gen_helper_sve_sths_be_zss_mte,
+ gen_helper_sve_stss_be_zss_mte, } } },
+};
+
+/* Note that we overload xs=2 to indicate 64-bit offset. */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = {
+ { /* MTE Inactive */
+ { /* Little-endian */
+ { gen_helper_sve_stbd_zsu,
+ gen_helper_sve_sthd_le_zsu,
+ gen_helper_sve_stsd_le_zsu,
+ gen_helper_sve_stdd_le_zsu, },
+ { gen_helper_sve_stbd_zss,
+ gen_helper_sve_sthd_le_zss,
+ gen_helper_sve_stsd_le_zss,
+ gen_helper_sve_stdd_le_zss, },
+ { gen_helper_sve_stbd_zd,
+ gen_helper_sve_sthd_le_zd,
+ gen_helper_sve_stsd_le_zd,
+ gen_helper_sve_stdd_le_zd, } },
+ { /* Big-endian */
+ { gen_helper_sve_stbd_zsu,
+ gen_helper_sve_sthd_be_zsu,
+ gen_helper_sve_stsd_be_zsu,
+ gen_helper_sve_stdd_be_zsu, },
+ { gen_helper_sve_stbd_zss,
+ gen_helper_sve_sthd_be_zss,
+ gen_helper_sve_stsd_be_zss,
+ gen_helper_sve_stdd_be_zss, },
+ { gen_helper_sve_stbd_zd,
+ gen_helper_sve_sthd_be_zd,
+ gen_helper_sve_stsd_be_zd,
+ gen_helper_sve_stdd_be_zd, } } },
+    { /* MTE Active */
+ { /* Little-endian */
+ { gen_helper_sve_stbd_zsu_mte,
+ gen_helper_sve_sthd_le_zsu_mte,
+ gen_helper_sve_stsd_le_zsu_mte,
+ gen_helper_sve_stdd_le_zsu_mte, },
+ { gen_helper_sve_stbd_zss_mte,
+ gen_helper_sve_sthd_le_zss_mte,
+ gen_helper_sve_stsd_le_zss_mte,
+ gen_helper_sve_stdd_le_zss_mte, },
+ { gen_helper_sve_stbd_zd_mte,
+ gen_helper_sve_sthd_le_zd_mte,
+ gen_helper_sve_stsd_le_zd_mte,
+ gen_helper_sve_stdd_le_zd_mte, } },
+ { /* Big-endian */
+ { gen_helper_sve_stbd_zsu_mte,
+ gen_helper_sve_sthd_be_zsu_mte,
+ gen_helper_sve_stsd_be_zsu_mte,
+ gen_helper_sve_stdd_be_zsu_mte, },
+ { gen_helper_sve_stbd_zss_mte,
+ gen_helper_sve_sthd_be_zss_mte,
+ gen_helper_sve_stsd_be_zss_mte,
+ gen_helper_sve_stdd_be_zss_mte, },
+ { gen_helper_sve_stbd_zd_mte,
+ gen_helper_sve_sthd_be_zd_mte,
+ gen_helper_sve_stsd_be_zd_mte,
+ gen_helper_sve_stdd_be_zd_mte, } } },
+};
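Editorial aside: the table above is indexed as [mte][be][xs][msz], with xs = 2 overloaded for 64-bit offsets as noted. For instance, a little-endian, non-MTE doubleword scatter with a 64-bit vector base resolves to scatter_store_fn64[0][0][2][MO_64], i.e. gen_helper_sve_stdd_le_zd. A hedged sketch of the lookup, assuming the table above is in scope (pick_st1_fn64 is a hypothetical name):

    /* Sketch only: mirrors the indexing done by trans_ST1_zprz/zpiz below. */
    static gen_helper_gvec_mem_scatter *
    pick_st1_fn64(bool mte, bool be, int xs, int msz)
    {
        /* xs: 0 = 32-bit unsigned offsets, 1 = 32-bit signed, 2 = 64-bit. */
        return scatter_store_fn64[mte][be][xs][msz];
    }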
+
+static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a)
+{
+ gen_helper_gvec_mem_scatter *fn;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (a->esz < a->msz || (a->msz == 0 && a->scale)) {
+ return false;
+ }
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+ switch (a->esz) {
+ case MO_32:
+ fn = scatter_store_fn32[mte][be][a->xs][a->msz];
+ break;
+ case MO_64:
+ fn = scatter_store_fn64[mte][be][a->xs][a->msz];
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
+ cpu_reg_sp(s, a->rn), a->msz, true, fn);
+ return true;
+}
+
+static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (a->esz < a->msz) {
+ return false;
+ }
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = scatter_store_fn32[mte][be][0][a->msz];
+ break;
+ case MO_64:
+ fn = scatter_store_fn64[mte][be][2][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x])
+ * by loading the immediate into the scalar parameter.
+ */
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0,
+ tcg_constant_i64(a->imm << a->msz), a->msz, true, fn);
+ return true;
+}
+
+static bool trans_STNT1_zprz(DisasContext *s, arg_ST1_zprz *a)
+{
+ gen_helper_gvec_mem_scatter *fn;
+ bool be = s->be_data == MO_BE;
+ bool mte = s->mte_active[0];
+
+ if (a->esz < a->msz) {
+ return false;
+ }
+ if (!dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = scatter_store_fn32[mte][be][0][a->msz];
+ break;
+ case MO_64:
+ fn = scatter_store_fn64[mte][be][2][a->msz];
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0,
+ cpu_reg(s, a->rm), a->msz, true, fn);
+ return true;
+}
+
+/*
+ * Prefetches
+ */
+
+static bool trans_PRF(DisasContext *s, arg_PRF *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ /* Prefetch is a nop within QEMU. */
+ (void)sve_access_check(s);
+ return true;
+}
+
+static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a)
+{
+ if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ /* Prefetch is a nop within QEMU. */
+ (void)sve_access_check(s);
+ return true;
+}
+
+static bool trans_PRF_ns(DisasContext *s, arg_PRF_ns *a)
+{
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ /* Prefetch is a nop within QEMU. */
+ s->is_nonstreaming = true;
+ (void)sve_access_check(s);
+ return true;
+}
+
+/*
+ * Move Prefix
+ *
+ * TODO: The implementation so far could handle predicated merging movprfx.
+ * The helper functions as written take an extra source register to
+ * use in the operation, but the result is only written when predication
+ * succeeds. For unpredicated movprfx, we need to rearrange the helpers
+ * to allow the final write back to the destination to be unconditional.
+ * For predicated zeroing movprfx, we need to rearrange the helpers to
+ * allow the final write back to zero inactives.
+ *
+ * In the meantime, just emit the moves.
+ */
+
+TRANS_FEAT(MOVPRFX, aa64_sve, do_mov_z, a->rd, a->rn)
+TRANS_FEAT(MOVPRFX_m, aa64_sve, do_sel_z, a->rd, a->rn, a->rd, a->pg, a->esz)
+TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false)
+
+/*
+ * SVE2 Integer Multiply - Unpredicated
+ */
+
+TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a)
+
+static gen_helper_gvec_3 * const smulh_zzz_fns[4] = {
+ gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h,
+ gen_helper_gvec_smulh_s, gen_helper_gvec_smulh_d,
+};
+TRANS_FEAT(SMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ smulh_zzz_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const umulh_zzz_fns[4] = {
+ gen_helper_gvec_umulh_b, gen_helper_gvec_umulh_h,
+ gen_helper_gvec_umulh_s, gen_helper_gvec_umulh_d,
+};
+TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ umulh_zzz_fns[a->esz], a, 0)
+
+TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ gen_helper_gvec_pmul_b, a, 0)
+
+static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = {
+ gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h,
+ gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d,
+};
+TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqdmulh_zzz_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = {
+ gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h,
+ gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d,
+};
+TRANS_FEAT(SQRDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqrdmulh_zzz_fns[a->esz], a, 0)
+
+/*
+ * SVE2 Integer - Predicated
+ */
+
+static gen_helper_gvec_4 * const sadlp_fns[4] = {
+ NULL, gen_helper_sve2_sadalp_zpzz_h,
+ gen_helper_sve2_sadalp_zpzz_s, gen_helper_sve2_sadalp_zpzz_d,
+};
+TRANS_FEAT(SADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz,
+ sadlp_fns[a->esz], a, 0)
+
+static gen_helper_gvec_4 * const uadlp_fns[4] = {
+ NULL, gen_helper_sve2_uadalp_zpzz_h,
+ gen_helper_sve2_uadalp_zpzz_s, gen_helper_sve2_uadalp_zpzz_d,
+};
+TRANS_FEAT(UADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz,
+ uadlp_fns[a->esz], a, 0)
+
+/*
+ * SVE2 integer unary operations (predicated)
+ */
+
+TRANS_FEAT(URECPE, aa64_sve2, gen_gvec_ool_arg_zpz,
+ a->esz == 2 ? gen_helper_sve2_urecpe_s : NULL, a, 0)
+
+TRANS_FEAT(URSQRTE, aa64_sve2, gen_gvec_ool_arg_zpz,
+ a->esz == 2 ? gen_helper_sve2_ursqrte_s : NULL, a, 0)
+
+static gen_helper_gvec_3 * const sqabs_fns[4] = {
+ gen_helper_sve2_sqabs_b, gen_helper_sve2_sqabs_h,
+ gen_helper_sve2_sqabs_s, gen_helper_sve2_sqabs_d,
+};
+TRANS_FEAT(SQABS, aa64_sve2, gen_gvec_ool_arg_zpz, sqabs_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const sqneg_fns[4] = {
+ gen_helper_sve2_sqneg_b, gen_helper_sve2_sqneg_h,
+ gen_helper_sve2_sqneg_s, gen_helper_sve2_sqneg_d,
+};
+TRANS_FEAT(SQNEG, aa64_sve2, gen_gvec_ool_arg_zpz, sqneg_fns[a->esz], a, 0)
+
+DO_ZPZZ(SQSHL, aa64_sve2, sve2_sqshl)
+DO_ZPZZ(SQRSHL, aa64_sve2, sve2_sqrshl)
+DO_ZPZZ(SRSHL, aa64_sve2, sve2_srshl)
+
+DO_ZPZZ(UQSHL, aa64_sve2, sve2_uqshl)
+DO_ZPZZ(UQRSHL, aa64_sve2, sve2_uqrshl)
+DO_ZPZZ(URSHL, aa64_sve2, sve2_urshl)
+
+DO_ZPZZ(SHADD, aa64_sve2, sve2_shadd)
+DO_ZPZZ(SRHADD, aa64_sve2, sve2_srhadd)
+DO_ZPZZ(SHSUB, aa64_sve2, sve2_shsub)
+
+DO_ZPZZ(UHADD, aa64_sve2, sve2_uhadd)
+DO_ZPZZ(URHADD, aa64_sve2, sve2_urhadd)
+DO_ZPZZ(UHSUB, aa64_sve2, sve2_uhsub)
+
+DO_ZPZZ(ADDP, aa64_sve2, sve2_addp)
+DO_ZPZZ(SMAXP, aa64_sve2, sve2_smaxp)
+DO_ZPZZ(UMAXP, aa64_sve2, sve2_umaxp)
+DO_ZPZZ(SMINP, aa64_sve2, sve2_sminp)
+DO_ZPZZ(UMINP, aa64_sve2, sve2_uminp)
+
+DO_ZPZZ(SQADD_zpzz, aa64_sve2, sve2_sqadd)
+DO_ZPZZ(UQADD_zpzz, aa64_sve2, sve2_uqadd)
+DO_ZPZZ(SQSUB_zpzz, aa64_sve2, sve2_sqsub)
+DO_ZPZZ(UQSUB_zpzz, aa64_sve2, sve2_uqsub)
+DO_ZPZZ(SUQADD, aa64_sve2, sve2_suqadd)
+DO_ZPZZ(USQADD, aa64_sve2, sve2_usqadd)
+
+/*
+ * SVE2 Widening Integer Arithmetic
+ */
+
+static gen_helper_gvec_3 * const saddl_fns[4] = {
+ NULL, gen_helper_sve2_saddl_h,
+ gen_helper_sve2_saddl_s, gen_helper_sve2_saddl_d,
+};
+TRANS_FEAT(SADDLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ saddl_fns[a->esz], a, 0)
+TRANS_FEAT(SADDLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ saddl_fns[a->esz], a, 3)
+TRANS_FEAT(SADDLBT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ saddl_fns[a->esz], a, 2)
+
+static gen_helper_gvec_3 * const ssubl_fns[4] = {
+ NULL, gen_helper_sve2_ssubl_h,
+ gen_helper_sve2_ssubl_s, gen_helper_sve2_ssubl_d,
+};
+TRANS_FEAT(SSUBLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ ssubl_fns[a->esz], a, 0)
+TRANS_FEAT(SSUBLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ ssubl_fns[a->esz], a, 3)
+TRANS_FEAT(SSUBLBT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ ssubl_fns[a->esz], a, 2)
+TRANS_FEAT(SSUBLTB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ ssubl_fns[a->esz], a, 1)
+
+static gen_helper_gvec_3 * const sabdl_fns[4] = {
+ NULL, gen_helper_sve2_sabdl_h,
+ gen_helper_sve2_sabdl_s, gen_helper_sve2_sabdl_d,
+};
+TRANS_FEAT(SABDLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sabdl_fns[a->esz], a, 0)
+TRANS_FEAT(SABDLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sabdl_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const uaddl_fns[4] = {
+ NULL, gen_helper_sve2_uaddl_h,
+ gen_helper_sve2_uaddl_s, gen_helper_sve2_uaddl_d,
+};
+TRANS_FEAT(UADDLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ uaddl_fns[a->esz], a, 0)
+TRANS_FEAT(UADDLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ uaddl_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const usubl_fns[4] = {
+ NULL, gen_helper_sve2_usubl_h,
+ gen_helper_sve2_usubl_s, gen_helper_sve2_usubl_d,
+};
+TRANS_FEAT(USUBLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ usubl_fns[a->esz], a, 0)
+TRANS_FEAT(USUBLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ usubl_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const uabdl_fns[4] = {
+ NULL, gen_helper_sve2_uabdl_h,
+ gen_helper_sve2_uabdl_s, gen_helper_sve2_uabdl_d,
+};
+TRANS_FEAT(UABDLB, aa64_sve2, gen_gvec_ool_arg_zzz,
+ uabdl_fns[a->esz], a, 0)
+TRANS_FEAT(UABDLT, aa64_sve2, gen_gvec_ool_arg_zzz,
+ uabdl_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const sqdmull_fns[4] = {
+ NULL, gen_helper_sve2_sqdmull_zzz_h,
+ gen_helper_sve2_sqdmull_zzz_s, gen_helper_sve2_sqdmull_zzz_d,
+};
+TRANS_FEAT(SQDMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqdmull_fns[a->esz], a, 0)
+TRANS_FEAT(SQDMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqdmull_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const smull_fns[4] = {
+ NULL, gen_helper_sve2_smull_zzz_h,
+ gen_helper_sve2_smull_zzz_s, gen_helper_sve2_smull_zzz_d,
+};
+TRANS_FEAT(SMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ smull_fns[a->esz], a, 0)
+TRANS_FEAT(SMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ smull_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const umull_fns[4] = {
+ NULL, gen_helper_sve2_umull_zzz_h,
+ gen_helper_sve2_umull_zzz_s, gen_helper_sve2_umull_zzz_d,
+};
+TRANS_FEAT(UMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ umull_fns[a->esz], a, 0)
+TRANS_FEAT(UMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
+ umull_fns[a->esz], a, 3)
+
+static gen_helper_gvec_3 * const eoril_fns[4] = {
+ gen_helper_sve2_eoril_b, gen_helper_sve2_eoril_h,
+ gen_helper_sve2_eoril_s, gen_helper_sve2_eoril_d,
+};
+TRANS_FEAT(EORBT, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 2)
+TRANS_FEAT(EORTB, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 1)
+
+static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel)
+{
+ static gen_helper_gvec_3 * const fns[4] = {
+ gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h,
+ NULL, gen_helper_sve2_pmull_d,
+ };
+
+ if (a->esz == 0) {
+ if (!dc_isar_feature(aa64_sve2_pmull128, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ } else if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ return gen_gvec_ool_arg_zzz(s, fns[a->esz], a, sel);
+}
+
+TRANS_FEAT(PMULLB, aa64_sve2, do_trans_pmull, a, false)
+TRANS_FEAT(PMULLT, aa64_sve2, do_trans_pmull, a, true)
+
+static gen_helper_gvec_3 * const saddw_fns[4] = {
+ NULL, gen_helper_sve2_saddw_h,
+ gen_helper_sve2_saddw_s, gen_helper_sve2_saddw_d,
+};
+TRANS_FEAT(SADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 0)
+TRANS_FEAT(SADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_3 * const ssubw_fns[4] = {
+ NULL, gen_helper_sve2_ssubw_h,
+ gen_helper_sve2_ssubw_s, gen_helper_sve2_ssubw_d,
+};
+TRANS_FEAT(SSUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 0)
+TRANS_FEAT(SSUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_3 * const uaddw_fns[4] = {
+ NULL, gen_helper_sve2_uaddw_h,
+ gen_helper_sve2_uaddw_s, gen_helper_sve2_uaddw_d,
+};
+TRANS_FEAT(UADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 0)
+TRANS_FEAT(UADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_3 * const usubw_fns[4] = {
+ NULL, gen_helper_sve2_usubw_h,
+ gen_helper_sve2_usubw_s, gen_helper_sve2_usubw_d,
+};
+TRANS_FEAT(USUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 0)
+TRANS_FEAT(USUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 1)
+
+static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
+{
+ int top = imm & 1;
+ int shl = imm >> 1;
+ int halfbits = 4 << vece;
+
+ if (top) {
+ if (shl == halfbits) {
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
+ tcg_gen_and_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+ } else {
+ tcg_gen_sari_vec(vece, d, n, halfbits);
+ tcg_gen_shli_vec(vece, d, d, shl);
+ }
+ } else {
+ tcg_gen_shli_vec(vece, d, n, halfbits);
+ tcg_gen_sari_vec(vece, d, d, halfbits - shl);
+ }
+}
+
+static void gen_ushll_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int imm)
+{
+ int halfbits = 4 << vece;
+ int top = imm & 1;
+ int shl = (imm >> 1);
+ int shift;
+ uint64_t mask;
+
+ mask = MAKE_64BIT_MASK(0, halfbits);
+ mask <<= shl;
+ mask = dup_const(vece, mask);
+
+ shift = shl - top * halfbits;
+ if (shift < 0) {
+ tcg_gen_shri_i64(d, n, -shift);
+ } else {
+ tcg_gen_shli_i64(d, n, shift);
+ }
+ tcg_gen_andi_i64(d, d, mask);
+}
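Editorial aside: the mask and signed-shift bookkeeping above is compact; the following scalar reference computes the same result for one 64-bit chunk under the assumption of a MO_16 destination element size (ref_ushll_h is a hypothetical name): take the bottom or top byte of each halfword, zero-extend it and shift left by shl.

    #include <stdint.h>

    /* Sketch only: USHLLB (top = 0) / USHLLT (top = 1), dest esz = MO_16. */
    static uint64_t ref_ushll_h(uint64_t n, int top, int shl)
    {
        uint64_t d = 0;
        for (int i = 0; i < 4; i++) {
            uint16_t lane = n >> (16 * i);
            uint16_t src = top ? (lane >> 8) : (lane & 0xff);
            d |= (uint64_t)(uint16_t)(src << shl) << (16 * i);
        }
        return d;
    }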
+
+static void gen_ushll16_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm)
+{
+ gen_ushll_i64(MO_16, d, n, imm);
+}
+
+static void gen_ushll32_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm)
+{
+ gen_ushll_i64(MO_32, d, n, imm);
+}
+
+static void gen_ushll64_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm)
+{
+ gen_ushll_i64(MO_64, d, n, imm);
+}
+
+static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
+{
+ int halfbits = 4 << vece;
+ int top = imm & 1;
+ int shl = imm >> 1;
+
+ if (top) {
+ if (shl == halfbits) {
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
+ tcg_gen_and_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+ } else {
+ tcg_gen_shri_vec(vece, d, n, halfbits);
+ tcg_gen_shli_vec(vece, d, d, shl);
+ }
+ } else {
+ if (shl == 0) {
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_and_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+ } else {
+ tcg_gen_shli_vec(vece, d, n, halfbits);
+ tcg_gen_shri_vec(vece, d, d, halfbits - shl);
+ }
+ }
+}
+
+static bool do_shll_tb(DisasContext *s, arg_rri_esz *a,
+ const GVecGen2i ops[3], bool sel)
+{
+    if (a->esz < 0 || a->esz > 2) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vsz, vsz, (a->imm << 1) | sel,
+ &ops[a->esz]);
+ }
+ return true;
+}
+
+static const TCGOpcode sshll_list[] = {
+ INDEX_op_shli_vec, INDEX_op_sari_vec, 0
+};
+static const GVecGen2i sshll_ops[3] = {
+ { .fniv = gen_sshll_vec,
+ .opt_opc = sshll_list,
+ .fno = gen_helper_sve2_sshll_h,
+ .vece = MO_16 },
+ { .fniv = gen_sshll_vec,
+ .opt_opc = sshll_list,
+ .fno = gen_helper_sve2_sshll_s,
+ .vece = MO_32 },
+ { .fniv = gen_sshll_vec,
+ .opt_opc = sshll_list,
+ .fno = gen_helper_sve2_sshll_d,
+ .vece = MO_64 }
+};
+TRANS_FEAT(SSHLLB, aa64_sve2, do_shll_tb, a, sshll_ops, false)
+TRANS_FEAT(SSHLLT, aa64_sve2, do_shll_tb, a, sshll_ops, true)
+
+static const TCGOpcode ushll_list[] = {
+ INDEX_op_shli_vec, INDEX_op_shri_vec, 0
+};
+static const GVecGen2i ushll_ops[3] = {
+ { .fni8 = gen_ushll16_i64,
+ .fniv = gen_ushll_vec,
+ .opt_opc = ushll_list,
+ .fno = gen_helper_sve2_ushll_h,
+ .vece = MO_16 },
+ { .fni8 = gen_ushll32_i64,
+ .fniv = gen_ushll_vec,
+ .opt_opc = ushll_list,
+ .fno = gen_helper_sve2_ushll_s,
+ .vece = MO_32 },
+ { .fni8 = gen_ushll64_i64,
+ .fniv = gen_ushll_vec,
+ .opt_opc = ushll_list,
+ .fno = gen_helper_sve2_ushll_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(USHLLB, aa64_sve2, do_shll_tb, a, ushll_ops, false)
+TRANS_FEAT(USHLLT, aa64_sve2, do_shll_tb, a, ushll_ops, true)
+
+static gen_helper_gvec_3 * const bext_fns[4] = {
+ gen_helper_sve2_bext_b, gen_helper_sve2_bext_h,
+ gen_helper_sve2_bext_s, gen_helper_sve2_bext_d,
+};
+TRANS_FEAT_NONSTREAMING(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz,
+ bext_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const bdep_fns[4] = {
+ gen_helper_sve2_bdep_b, gen_helper_sve2_bdep_h,
+ gen_helper_sve2_bdep_s, gen_helper_sve2_bdep_d,
+};
+TRANS_FEAT_NONSTREAMING(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz,
+ bdep_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const bgrp_fns[4] = {
+ gen_helper_sve2_bgrp_b, gen_helper_sve2_bgrp_h,
+ gen_helper_sve2_bgrp_s, gen_helper_sve2_bgrp_d,
+};
+TRANS_FEAT_NONSTREAMING(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz,
+ bgrp_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const cadd_fns[4] = {
+ gen_helper_sve2_cadd_b, gen_helper_sve2_cadd_h,
+ gen_helper_sve2_cadd_s, gen_helper_sve2_cadd_d,
+};
+TRANS_FEAT(CADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz,
+ cadd_fns[a->esz], a, 0)
+TRANS_FEAT(CADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz,
+ cadd_fns[a->esz], a, 1)
+
+static gen_helper_gvec_3 * const sqcadd_fns[4] = {
+ gen_helper_sve2_sqcadd_b, gen_helper_sve2_sqcadd_h,
+ gen_helper_sve2_sqcadd_s, gen_helper_sve2_sqcadd_d,
+};
+TRANS_FEAT(SQCADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqcadd_fns[a->esz], a, 0)
+TRANS_FEAT(SQCADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz,
+ sqcadd_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const sabal_fns[4] = {
+ NULL, gen_helper_sve2_sabal_h,
+ gen_helper_sve2_sabal_s, gen_helper_sve2_sabal_d,
+};
+TRANS_FEAT(SABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 0)
+TRANS_FEAT(SABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const uabal_fns[4] = {
+ NULL, gen_helper_sve2_uabal_h,
+ gen_helper_sve2_uabal_s, gen_helper_sve2_uabal_d,
+};
+TRANS_FEAT(UABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 0)
+TRANS_FEAT(UABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 1)
+
+static bool do_adcl(DisasContext *s, arg_rrrr_esz *a, bool sel)
+{
+ static gen_helper_gvec_4 * const fns[2] = {
+ gen_helper_sve2_adcl_s,
+ gen_helper_sve2_adcl_d,
+ };
+ /*
+ * Note that in this case the ESZ field encodes both size and sign.
+ * Split out 'subtract' into bit 1 of the data field for the helper.
+ */
+ return gen_gvec_ool_arg_zzzz(s, fns[a->esz & 1], a, (a->esz & 2) | sel);
+}
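Editorial aside: a hedged summary of the packing that the comment above describes, spelled out for readability.

    /*
     * Derived from the code above, not a new interface:
     *   a->esz & 1 -> helper width: 0 selects adcl_s (32-bit), 1 adcl_d (64-bit)
     *   a->esz & 2 -> 'subtract' flag (the SBCLB/SBCLT forms), becomes data bit 1
     *   sel        -> data bit 0: bottom (ADCLB) vs top (ADCLT) elements
     */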
+
+TRANS_FEAT(ADCLB, aa64_sve2, do_adcl, a, false)
+TRANS_FEAT(ADCLT, aa64_sve2, do_adcl, a, true)
+
+TRANS_FEAT(SSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ssra, a)
+TRANS_FEAT(USRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_usra, a)
+TRANS_FEAT(SRSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_srsra, a)
+TRANS_FEAT(URSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ursra, a)
+TRANS_FEAT(SRI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sri, a)
+TRANS_FEAT(SLI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sli, a)
+
+TRANS_FEAT(SABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_saba, a)
+TRANS_FEAT(UABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_uaba, a)
+
+static bool do_narrow_extract(DisasContext *s, arg_rri_esz *a,
+ const GVecGen2 ops[3])
+{
+ if (a->esz < 0 || a->esz > MO_32 || a->imm != 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vsz, vsz, &ops[a->esz]);
+ }
+ return true;
+}
+
+static const TCGOpcode sqxtn_list[] = {
+ INDEX_op_shli_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+};
+
+static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t mask = (1ull << halfbits) - 1;
+ int64_t min = -1ull << (halfbits - 1);
+ int64_t max = -min - 1;
+
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, d, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, d, d, t);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_and_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
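Editorial aside: the smax/smin/and sequence above is the usual saturate-then-mask idiom; per wide element it clamps to the signed range of the narrow type before keeping the low half. A scalar sketch (ref_sqxtn_sat is a hypothetical name; halfbits is the narrow width in bits):

    #include <stdint.h>

    /* Sketch only: the signed saturation used by SQXTNB/SQXTNT. */
    static int64_t ref_sqxtn_sat(int64_t x, int halfbits)
    {
        int64_t max = (INT64_C(1) << (halfbits - 1)) - 1;
        int64_t min = -max - 1;
        return x < min ? min : x > max ? max : x;
    }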
+
+static const GVecGen2 sqxtnb_ops[3] = {
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops)
+
+static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t mask = (1ull << halfbits) - 1;
+ int64_t min = -1ull << (halfbits - 1);
+ int64_t max = -min - 1;
+
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const GVecGen2 sqxtnt_ops[3] = {
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQXTNT, aa64_sve2, do_narrow_extract, a, sqxtnt_ops)
+
+static const TCGOpcode uqxtn_list[] = {
+ INDEX_op_shli_vec, INDEX_op_umin_vec, 0
+};
+
+static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static const GVecGen2 uqxtnb_ops[3] = {
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops)
+
+static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const GVecGen2 uqxtnt_ops[3] = {
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(UQXTNT, aa64_sve2, do_narrow_extract, a, uqxtnt_ops)
+
+static const TCGOpcode sqxtun_list[] = {
+ INDEX_op_shli_vec, INDEX_op_umin_vec, INDEX_op_smax_vec, 0
+};
+
+static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, d, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+static const GVecGen2 sqxtunb_ops[3] = {
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops)
+
+static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const GVecGen2 sqxtunt_ops[3] = {
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQXTUNT, aa64_sve2, do_narrow_extract, a, sqxtunt_ops)
+
+static bool do_shr_narrow(DisasContext *s, arg_rri_esz *a,
+ const GVecGen2i ops[3])
+{
+ if (a->esz < 0 || a->esz > MO_32) {
+ return false;
+ }
+ assert(a->imm > 0 && a->imm <= (8 << a->esz));
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vsz, vsz, a->imm, &ops[a->esz]);
+ }
+ return true;
+}
+
+static void gen_shrnb_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr)
+{
+ int halfbits = 4 << vece;
+ uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits));
+
+ tcg_gen_shri_i64(d, n, shr);
+ tcg_gen_andi_i64(d, d, mask);
+}
+
+static void gen_shrnb16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ gen_shrnb_i64(MO_16, d, n, shr);
+}
+
+static void gen_shrnb32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ gen_shrnb_i64(MO_32, d, n, shr);
+}
+
+static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ gen_shrnb_i64(MO_64, d, n, shr);
+}
+
+static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
+
+ tcg_gen_shri_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_and_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 };
+static const GVecGen2i shrnb_ops[3] = {
+ { .fni8 = gen_shrnb16_i64,
+ .fniv = gen_shrnb_vec,
+ .opt_opc = shrnb_vec_list,
+ .fno = gen_helper_sve2_shrnb_h,
+ .vece = MO_16 },
+ { .fni8 = gen_shrnb32_i64,
+ .fniv = gen_shrnb_vec,
+ .opt_opc = shrnb_vec_list,
+ .fno = gen_helper_sve2_shrnb_s,
+ .vece = MO_32 },
+ { .fni8 = gen_shrnb64_i64,
+ .fniv = gen_shrnb_vec,
+ .opt_opc = shrnb_vec_list,
+ .fno = gen_helper_sve2_shrnb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SHRNB, aa64_sve2, do_shr_narrow, a, shrnb_ops)
+
+static void gen_shrnt_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr)
+{
+ int halfbits = 4 << vece;
+ uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits));
+
+ tcg_gen_shli_i64(n, n, halfbits - shr);
+ tcg_gen_andi_i64(n, n, ~mask);
+ tcg_gen_andi_i64(d, d, mask);
+ tcg_gen_or_i64(d, d, n);
+}
+
+static void gen_shrnt16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ gen_shrnt_i64(MO_16, d, n, shr);
+}
+
+static void gen_shrnt32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ gen_shrnt_i64(MO_32, d, n, shr);
+}
+
+static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
+{
+ tcg_gen_shri_i64(n, n, shr);
+ tcg_gen_deposit_i64(d, d, n, 32, 32);
+}
+
+static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
+
+ tcg_gen_shli_vec(vece, n, n, halfbits - shr);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 };
+static const GVecGen2i shrnt_ops[3] = {
+ { .fni8 = gen_shrnt16_i64,
+ .fniv = gen_shrnt_vec,
+ .opt_opc = shrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_shrnt_h,
+ .vece = MO_16 },
+ { .fni8 = gen_shrnt32_i64,
+ .fniv = gen_shrnt_vec,
+ .opt_opc = shrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_shrnt_s,
+ .vece = MO_32 },
+ { .fni8 = gen_shrnt64_i64,
+ .fniv = gen_shrnt_vec,
+ .opt_opc = shrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_shrnt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SHRNT, aa64_sve2, do_shr_narrow, a, shrnt_ops)
+
+static const GVecGen2i rshrnb_ops[3] = {
+ { .fno = gen_helper_sve2_rshrnb_h },
+ { .fno = gen_helper_sve2_rshrnb_s },
+ { .fno = gen_helper_sve2_rshrnb_d },
+};
+TRANS_FEAT(RSHRNB, aa64_sve2, do_shr_narrow, a, rshrnb_ops)
+
+static const GVecGen2i rshrnt_ops[3] = {
+ { .fno = gen_helper_sve2_rshrnt_h },
+ { .fno = gen_helper_sve2_rshrnt_s },
+ { .fno = gen_helper_sve2_rshrnt_d },
+};
+TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops)
+
+static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+
+ tcg_gen_sari_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_umin_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode sqshrunb_vec_list[] = {
+ INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_umin_vec, 0
+};
+static const GVecGen2i sqshrunb_ops[3] = {
+ { .fniv = gen_sqshrunb_vec,
+ .opt_opc = sqshrunb_vec_list,
+ .fno = gen_helper_sve2_sqshrunb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqshrunb_vec,
+ .opt_opc = sqshrunb_vec_list,
+ .fno = gen_helper_sve2_sqshrunb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqshrunb_vec,
+ .opt_opc = sqshrunb_vec_list,
+ .fno = gen_helper_sve2_sqshrunb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops)
+
+static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+
+ tcg_gen_sari_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode sqshrunt_vec_list[] = {
+ INDEX_op_shli_vec, INDEX_op_sari_vec,
+ INDEX_op_smax_vec, INDEX_op_umin_vec, 0
+};
+static const GVecGen2i sqshrunt_ops[3] = {
+ { .fniv = gen_sqshrunt_vec,
+ .opt_opc = sqshrunt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrunt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqshrunt_vec,
+ .opt_opc = sqshrunt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrunt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqshrunt_vec,
+ .opt_opc = sqshrunt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrunt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQSHRUNT, aa64_sve2, do_shr_narrow, a, sqshrunt_ops)
+
+static const GVecGen2i sqrshrunb_ops[3] = {
+ { .fno = gen_helper_sve2_sqrshrunb_h },
+ { .fno = gen_helper_sve2_sqrshrunb_s },
+ { .fno = gen_helper_sve2_sqrshrunb_d },
+};
+TRANS_FEAT(SQRSHRUNB, aa64_sve2, do_shr_narrow, a, sqrshrunb_ops)
+
+static const GVecGen2i sqrshrunt_ops[3] = {
+ { .fno = gen_helper_sve2_sqrshrunt_h },
+ { .fno = gen_helper_sve2_sqrshrunt_s },
+ { .fno = gen_helper_sve2_sqrshrunt_d },
+};
+TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops)
+
+static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
+ int64_t min = -max - 1;
+
+ tcg_gen_sari_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_and_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode sqshrnb_vec_list[] = {
+ INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_smin_vec, 0
+};
+static const GVecGen2i sqshrnb_ops[3] = {
+ { .fniv = gen_sqshrnb_vec,
+ .opt_opc = sqshrnb_vec_list,
+ .fno = gen_helper_sve2_sqshrnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqshrnb_vec,
+ .opt_opc = sqshrnb_vec_list,
+ .fno = gen_helper_sve2_sqshrnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqshrnb_vec,
+ .opt_opc = sqshrnb_vec_list,
+ .fno = gen_helper_sve2_sqshrnb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops)
+
+static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
+ int64_t min = -max - 1;
+
+ tcg_gen_sari_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode sqshrnt_vec_list[] = {
+ INDEX_op_shli_vec, INDEX_op_sari_vec,
+ INDEX_op_smax_vec, INDEX_op_smin_vec, 0
+};
+static const GVecGen2i sqshrnt_ops[3] = {
+ { .fniv = gen_sqshrnt_vec,
+ .opt_opc = sqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqshrnt_vec,
+ .opt_opc = sqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqshrnt_vec,
+ .opt_opc = sqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqshrnt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(SQSHRNT, aa64_sve2, do_shr_narrow, a, sqshrnt_ops)
+
+static const GVecGen2i sqrshrnb_ops[3] = {
+ { .fno = gen_helper_sve2_sqrshrnb_h },
+ { .fno = gen_helper_sve2_sqrshrnb_s },
+ { .fno = gen_helper_sve2_sqrshrnb_d },
+};
+TRANS_FEAT(SQRSHRNB, aa64_sve2, do_shr_narrow, a, sqrshrnb_ops)
+
+static const GVecGen2i sqrshrnt_ops[3] = {
+ { .fno = gen_helper_sve2_sqrshrnt_h },
+ { .fno = gen_helper_sve2_sqrshrnt_s },
+ { .fno = gen_helper_sve2_sqrshrnt_d },
+};
+TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops)
+
+static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+
+ tcg_gen_shri_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_umin_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode uqshrnb_vec_list[] = {
+ INDEX_op_shri_vec, INDEX_op_umin_vec, 0
+};
+static const GVecGen2i uqshrnb_ops[3] = {
+ { .fniv = gen_uqshrnb_vec,
+ .opt_opc = uqshrnb_vec_list,
+ .fno = gen_helper_sve2_uqshrnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqshrnb_vec,
+ .opt_opc = uqshrnb_vec_list,
+ .fno = gen_helper_sve2_uqshrnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqshrnb_vec,
+ .opt_opc = uqshrnb_vec_list,
+ .fno = gen_helper_sve2_uqshrnb_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops)
+
+static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec n, int64_t shr)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+
+ tcg_gen_shri_vec(vece, n, n, shr);
+ tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static const TCGOpcode uqshrnt_vec_list[] = {
+ INDEX_op_shli_vec, INDEX_op_shri_vec, INDEX_op_umin_vec, 0
+};
+static const GVecGen2i uqshrnt_ops[3] = {
+ { .fniv = gen_uqshrnt_vec,
+ .opt_opc = uqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqshrnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqshrnt_vec,
+ .opt_opc = uqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqshrnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqshrnt_vec,
+ .opt_opc = uqshrnt_vec_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqshrnt_d,
+ .vece = MO_64 },
+};
+TRANS_FEAT(UQSHRNT, aa64_sve2, do_shr_narrow, a, uqshrnt_ops)
+
+static const GVecGen2i uqrshrnb_ops[3] = {
+ { .fno = gen_helper_sve2_uqrshrnb_h },
+ { .fno = gen_helper_sve2_uqrshrnb_s },
+ { .fno = gen_helper_sve2_uqrshrnb_d },
+};
+TRANS_FEAT(UQRSHRNB, aa64_sve2, do_shr_narrow, a, uqrshrnb_ops)
+
+static const GVecGen2i uqrshrnt_ops[3] = {
+ { .fno = gen_helper_sve2_uqrshrnt_h },
+ { .fno = gen_helper_sve2_uqrshrnt_s },
+ { .fno = gen_helper_sve2_uqrshrnt_d },
+};
+TRANS_FEAT(UQRSHRNT, aa64_sve2, do_shr_narrow, a, uqrshrnt_ops)
+
+#define DO_SVE2_ZZZ_NARROW(NAME, name) \
+ static gen_helper_gvec_3 * const name##_fns[4] = { \
+ NULL, gen_helper_sve2_##name##_h, \
+ gen_helper_sve2_##name##_s, gen_helper_sve2_##name##_d, \
+ }; \
+ TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzz, \
+ name##_fns[a->esz], a, 0)
+
+DO_SVE2_ZZZ_NARROW(ADDHNB, addhnb)
+DO_SVE2_ZZZ_NARROW(ADDHNT, addhnt)
+DO_SVE2_ZZZ_NARROW(RADDHNB, raddhnb)
+DO_SVE2_ZZZ_NARROW(RADDHNT, raddhnt)
+
+DO_SVE2_ZZZ_NARROW(SUBHNB, subhnb)
+DO_SVE2_ZZZ_NARROW(SUBHNT, subhnt)
+DO_SVE2_ZZZ_NARROW(RSUBHNB, rsubhnb)
+DO_SVE2_ZZZ_NARROW(RSUBHNT, rsubhnt)
+
+static gen_helper_gvec_flags_4 * const match_fns[4] = {
+ gen_helper_sve2_match_ppzz_b, gen_helper_sve2_match_ppzz_h, NULL, NULL
+};
+TRANS_FEAT_NONSTREAMING(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz])
+
+static gen_helper_gvec_flags_4 * const nmatch_fns[4] = {
+ gen_helper_sve2_nmatch_ppzz_b, gen_helper_sve2_nmatch_ppzz_h, NULL, NULL
+};
+TRANS_FEAT_NONSTREAMING(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz])
+
+static gen_helper_gvec_4 * const histcnt_fns[4] = {
+ NULL, NULL, gen_helper_sve2_histcnt_s, gen_helper_sve2_histcnt_d
+};
+TRANS_FEAT_NONSTREAMING(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz,
+ histcnt_fns[a->esz], a, 0)
+
+TRANS_FEAT_NONSTREAMING(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz,
+ a->esz == 0 ? gen_helper_sve2_histseg : NULL, a, 0)
+
+DO_ZPZZ_FP(FADDP, aa64_sve2, sve2_faddp_zpzz)
+DO_ZPZZ_FP(FMAXNMP, aa64_sve2, sve2_fmaxnmp_zpzz)
+DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz)
+DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz)
+DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz)
+
+/*
+ * SVE Integer Multiply-Add (unpredicated)
+ */
+
+TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz,
+ gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra,
+ 0, FPST_FPCR)
+TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz,
+ gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra,
+ 0, FPST_FPCR)
+
+static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = {
+ NULL, gen_helper_sve2_sqdmlal_zzzw_h,
+ gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d,
+};
+TRANS_FEAT(SQDMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlal_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(SQDMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlal_zzzw_fns[a->esz], a, 3)
+TRANS_FEAT(SQDMLALBT, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlal_zzzw_fns[a->esz], a, 2)
+
+static gen_helper_gvec_4 * const sqdmlsl_zzzw_fns[] = {
+ NULL, gen_helper_sve2_sqdmlsl_zzzw_h,
+ gen_helper_sve2_sqdmlsl_zzzw_s, gen_helper_sve2_sqdmlsl_zzzw_d,
+};
+TRANS_FEAT(SQDMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlsl_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(SQDMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlsl_zzzw_fns[a->esz], a, 3)
+TRANS_FEAT(SQDMLSLBT, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqdmlsl_zzzw_fns[a->esz], a, 2)
+
+static gen_helper_gvec_4 * const sqrdmlah_fns[] = {
+ gen_helper_sve2_sqrdmlah_b, gen_helper_sve2_sqrdmlah_h,
+ gen_helper_sve2_sqrdmlah_s, gen_helper_sve2_sqrdmlah_d,
+};
+TRANS_FEAT(SQRDMLAH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqrdmlah_fns[a->esz], a, 0)
+
+static gen_helper_gvec_4 * const sqrdmlsh_fns[] = {
+ gen_helper_sve2_sqrdmlsh_b, gen_helper_sve2_sqrdmlsh_h,
+ gen_helper_sve2_sqrdmlsh_s, gen_helper_sve2_sqrdmlsh_d,
+};
+TRANS_FEAT(SQRDMLSH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ sqrdmlsh_fns[a->esz], a, 0)
+
+static gen_helper_gvec_4 * const smlal_zzzw_fns[] = {
+ NULL, gen_helper_sve2_smlal_zzzw_h,
+ gen_helper_sve2_smlal_zzzw_s, gen_helper_sve2_smlal_zzzw_d,
+};
+TRANS_FEAT(SMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ smlal_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(SMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ smlal_zzzw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const umlal_zzzw_fns[] = {
+ NULL, gen_helper_sve2_umlal_zzzw_h,
+ gen_helper_sve2_umlal_zzzw_s, gen_helper_sve2_umlal_zzzw_d,
+};
+TRANS_FEAT(UMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ umlal_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(UMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ umlal_zzzw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const smlsl_zzzw_fns[] = {
+ NULL, gen_helper_sve2_smlsl_zzzw_h,
+ gen_helper_sve2_smlsl_zzzw_s, gen_helper_sve2_smlsl_zzzw_d,
+};
+TRANS_FEAT(SMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ smlsl_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(SMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ smlsl_zzzw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const umlsl_zzzw_fns[] = {
+ NULL, gen_helper_sve2_umlsl_zzzw_h,
+ gen_helper_sve2_umlsl_zzzw_s, gen_helper_sve2_umlsl_zzzw_d,
+};
+TRANS_FEAT(UMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ umlsl_zzzw_fns[a->esz], a, 0)
+TRANS_FEAT(UMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz,
+ umlsl_zzzw_fns[a->esz], a, 1)
+
+static gen_helper_gvec_4 * const cmla_fns[] = {
+ gen_helper_sve2_cmla_zzzz_b, gen_helper_sve2_cmla_zzzz_h,
+ gen_helper_sve2_cmla_zzzz_s, gen_helper_sve2_cmla_zzzz_d,
+};
+TRANS_FEAT(CMLA_zzzz, aa64_sve2, gen_gvec_ool_zzzz,
+ cmla_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot)
+
+static gen_helper_gvec_4 * const cdot_fns[] = {
+ NULL, NULL, gen_helper_sve2_cdot_zzzz_s, gen_helper_sve2_cdot_zzzz_d
+};
+TRANS_FEAT(CDOT_zzzz, aa64_sve2, gen_gvec_ool_zzzz,
+ cdot_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot)
+
+static gen_helper_gvec_4 * const sqrdcmlah_fns[] = {
+ gen_helper_sve2_sqrdcmlah_zzzz_b, gen_helper_sve2_sqrdcmlah_zzzz_h,
+ gen_helper_sve2_sqrdcmlah_zzzz_s, gen_helper_sve2_sqrdcmlah_zzzz_d,
+};
+TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz,
+ sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot)
+
+TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
+ a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0)
+
+TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz,
+ gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt)
+
+TRANS_FEAT_NONSTREAMING(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz,
+ gen_helper_crypto_aese, a, false)
+TRANS_FEAT_NONSTREAMING(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz,
+ gen_helper_crypto_aese, a, true)
+
+TRANS_FEAT_NONSTREAMING(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz,
+ gen_helper_crypto_sm4e, a, 0)
+TRANS_FEAT_NONSTREAMING(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz,
+ gen_helper_crypto_sm4ekey, a, 0)
+
+TRANS_FEAT_NONSTREAMING(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz,
+ gen_gvec_rax1, a)
+
+TRANS_FEAT(FCVTNT_sh, aa64_sve2, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve2_fcvtnt_sh, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTNT_ds, aa64_sve2, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve2_fcvtnt_ds, a, 0, FPST_FPCR)
+
+TRANS_FEAT(BFCVTNT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve_bfcvtnt, a, 0, FPST_FPCR)
+
+TRANS_FEAT(FCVTLT_hs, aa64_sve2, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve2_fcvtlt_hs, a, 0, FPST_FPCR)
+TRANS_FEAT(FCVTLT_sd, aa64_sve2, gen_gvec_fpst_arg_zpz,
+ gen_helper_sve2_fcvtlt_sd, a, 0, FPST_FPCR)
+
+TRANS_FEAT(FCVTX_ds, aa64_sve2, do_frint_mode, a,
+ float_round_to_odd, gen_helper_sve_fcvt_ds)
+TRANS_FEAT(FCVTXNT_ds, aa64_sve2, do_frint_mode, a,
+ float_round_to_odd, gen_helper_sve2_fcvtnt_ds)
+
+static gen_helper_gvec_3_ptr * const flogb_fns[] = {
+ NULL, gen_helper_flogb_h,
+ gen_helper_flogb_s, gen_helper_flogb_d
+};
+TRANS_FEAT(FLOGB, aa64_sve2, gen_gvec_fpst_arg_zpz, flogb_fns[a->esz],
+ a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR)
+
+static bool do_FMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sub, bool sel)
+{
+ return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzzw_s,
+ a->rd, a->rn, a->rm, a->ra,
+ (sel << 1) | sub, cpu_env);
+}
+
+TRANS_FEAT(FMLALB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, false)
+TRANS_FEAT(FMLALT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, true)
+TRANS_FEAT(FMLSLB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, false)
+TRANS_FEAT(FMLSLT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, true)
+
+static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel)
+{
+ return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 2) | (sel << 1) | sub, cpu_env);
+}
+
+TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false)
+TRANS_FEAT(FMLALT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, true)
+TRANS_FEAT(FMLSLB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, false)
+TRANS_FEAT(FMLSLT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, true)
+
+TRANS_FEAT_NONSTREAMING(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_smmla_b, a, 0)
+TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_usmmla_b, a, 0)
+TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_ummla_b, a, 0)
+
+TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_bfdot, a, 0)
+TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_bfdot_idx, a)
+
+TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_bfmmla, a, 0)
+
+static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel)
+{
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal,
+ a->rd, a->rn, a->rm, a->ra, sel, FPST_FPCR);
+}
+
+TRANS_FEAT(BFMLALB_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, false)
+TRANS_FEAT(BFMLALT_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, true)
+
+static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
+{
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal_idx,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sel, FPST_FPCR);
+}
+
+TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false)
+TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true)
+
+static bool trans_PSEL(DisasContext *s, arg_psel *a)
+{
+ int vl = vec_full_reg_size(s);
+ int pl = pred_gvec_reg_size(s);
+ int elements = vl >> a->esz;
+ TCGv_i64 tmp, didx, dbit;
+ TCGv_ptr ptr;
+
+ if (!dc_isar_feature(aa64_sme, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i64();
+ dbit = tcg_temp_new_i64();
+ didx = tcg_temp_new_i64();
+ ptr = tcg_temp_new_ptr();
+
+ /* Compute the predicate element. */
+ tcg_gen_addi_i64(tmp, cpu_reg(s, a->rv), a->imm);
+ if (is_power_of_2(elements)) {
+ tcg_gen_andi_i64(tmp, tmp, elements - 1);
+ } else {
+ tcg_gen_remu_i64(tmp, tmp, tcg_constant_i64(elements));
+ }
+
+ /* Extract the predicate byte and bit indices. */
+ tcg_gen_shli_i64(tmp, tmp, a->esz);
+ tcg_gen_andi_i64(dbit, tmp, 7);
+ tcg_gen_shri_i64(didx, tmp, 3);
+ if (HOST_BIG_ENDIAN) {
+ tcg_gen_xori_i64(didx, didx, 7);
+ }
+
+ /* Load the predicate word. */
+ tcg_gen_trunc_i64_ptr(ptr, didx);
+ tcg_gen_add_ptr(ptr, ptr, cpu_env);
+ tcg_gen_ld8u_i64(tmp, ptr, pred_full_reg_offset(s, a->pm));
+
+ /* Extract the predicate bit and replicate to MO_64. */
+ tcg_gen_shr_i64(tmp, tmp, dbit);
+ tcg_gen_andi_i64(tmp, tmp, 1);
+ tcg_gen_neg_i64(tmp, tmp);
+
+ /* Apply to either copy the source, or write zeros. */
+ tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd),
+ pred_full_reg_offset(s, a->pn), tmp, pl, pl);
+
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i64(dbit);
+ tcg_temp_free_i64(didx);
+ tcg_temp_free_ptr(ptr);
+ return true;
+}
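Editorial aside: the TCG sequence above is easier to follow against a scalar model of PSEL: the selected predicate element index is (Xv + imm) mod elements, and the corresponding bit of Pm decides whether Pd receives a copy of Pn or all zeros. A hedged sketch that ignores the host byte-order fixup (ref_psel_active is a hypothetical name):

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch only: true means Pd gets Pn, false means Pd is zeroed. */
    static bool ref_psel_active(const uint8_t *pm, uint64_t xv, uint64_t imm,
                                unsigned elements, unsigned esz)
    {
        uint64_t idx = (xv + imm) % elements;  /* predicate element index */
        uint64_t bit = idx << esz;             /* bit position within Pm   */
        return (pm[bit >> 3] >> (bit & 7)) & 1;
    }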
+
+static void gen_sclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a)
+{
+ tcg_gen_smax_i32(d, a, n);
+ tcg_gen_smin_i32(d, d, m);
+}
+
+static void gen_sclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a)
+{
+ tcg_gen_smax_i64(d, a, n);
+ tcg_gen_smin_i64(d, d, m);
+}
+
+static void gen_sclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec a)
+{
+ tcg_gen_smax_vec(vece, d, a, n);
+ tcg_gen_smin_vec(vece, d, d, m);
+}
+
+static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop[] = {
+ INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_sclamp_vec,
+ .fno = gen_helper_gvec_sclamp_b,
+ .opt_opc = vecop,
+ .vece = MO_8 },
+ { .fniv = gen_sclamp_vec,
+ .fno = gen_helper_gvec_sclamp_h,
+ .opt_opc = vecop,
+ .vece = MO_16 },
+ { .fni4 = gen_sclamp_i32,
+ .fniv = gen_sclamp_vec,
+ .fno = gen_helper_gvec_sclamp_s,
+ .opt_opc = vecop,
+ .vece = MO_32 },
+ { .fni8 = gen_sclamp_i64,
+ .fniv = gen_sclamp_vec,
+ .fno = gen_helper_gvec_sclamp_d,
+ .opt_opc = vecop,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64 }
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]);
+}
+
+TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a)
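Editorial aside: the i32, i64 and vector expanders above all implement the same per-element operation, clamping the accumulator input a between n and m via smin(smax(a, n), m). A one-function sketch (ref_sclamp_s is a hypothetical name):

    #include <stdint.h>

    /* Sketch only: per 32-bit element, SCLAMP computes smin(smax(acc, lo), hi). */
    static int32_t ref_sclamp_s(int32_t acc, int32_t lo, int32_t hi)
    {
        int32_t t = acc > lo ? acc : lo;   /* smax(acc, lo) */
        return t < hi ? t : hi;            /* smin(t, hi)   */
    }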
+
+static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a)
+{
+ tcg_gen_umax_i32(d, a, n);
+ tcg_gen_umin_i32(d, d, m);
+}
+
+static void gen_uclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a)
+{
+ tcg_gen_umax_i64(d, a, n);
+ tcg_gen_umin_i64(d, d, m);
+}
+
+static void gen_uclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n,
+ TCGv_vec m, TCGv_vec a)
+{
+ tcg_gen_umax_vec(vece, d, a, n);
+ tcg_gen_umin_vec(vece, d, d, m);
+}
+
+static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
+ uint32_t a, uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop[] = {
+ INDEX_op_umin_vec, INDEX_op_umax_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_uclamp_vec,
+ .fno = gen_helper_gvec_uclamp_b,
+ .opt_opc = vecop,
+ .vece = MO_8 },
+ { .fniv = gen_uclamp_vec,
+ .fno = gen_helper_gvec_uclamp_h,
+ .opt_opc = vecop,
+ .vece = MO_16 },
+ { .fni4 = gen_uclamp_i32,
+ .fniv = gen_uclamp_vec,
+ .fno = gen_helper_gvec_uclamp_s,
+ .opt_opc = vecop,
+ .vece = MO_32 },
+ { .fni8 = gen_uclamp_i64,
+ .fniv = gen_uclamp_vec,
+ .fno = gen_helper_gvec_uclamp_d,
+ .opt_opc = vecop,
+ .vece = MO_64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64 }
+ };
+ tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]);
+}
+
+TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a)
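
Per element, both clamp helpers above compute min(max(a, n), m); the GVecGen4 tables only select the vector, scalar or out-of-line helper form of that one operation. A scalar reference in plain C (illustrative sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar reference for the signed clamp computed per element above. */
    static int32_t sclamp_s32(int32_t n, int32_t m, int32_t a)
    {
        int32_t t = a > n ? a : n;   /* smax(a, n) */
        return t < m ? t : m;        /* smin(t, m) */
    }

    /* Scalar reference for the unsigned clamp. */
    static uint32_t uclamp_u32(uint32_t n, uint32_t m, uint32_t a)
    {
        uint32_t t = a > n ? a : n;  /* umax(a, n) */
        return t < m ? t : m;        /* umin(t, m) */
    }

    int main(void)
    {
        assert(sclamp_s32(-10, 10, 42) == 10);    /* clamped to the upper bound */
        assert(sclamp_s32(-10, 10, -99) == -10);  /* clamped to the lower bound */
        assert(uclamp_u32(5, 20, 7) == 7);        /* already in range */
        return 0;
    }
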
diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
new file mode 100644
index 0000000..5c5d58d
--- /dev/null
+++ b/target/arm/tcg/translate-vfp.c
@@ -0,0 +1,3619 @@
+/*
+ * ARM translation: AArch32 VFP instructions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2005-2007 CodeSourcery
+ * Copyright (c) 2007 OpenedHand, Ltd.
+ * Copyright (c) 2019 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+/* Include the generated VFP decoder */
+#include "decode-vfp.c.inc"
+#include "decode-vfp-uncond.c.inc"
+
+static inline void vfp_load_reg64(TCGv_i64 var, int reg)
+{
+ tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg));
+}
+
+static inline void vfp_store_reg64(TCGv_i64 var, int reg)
+{
+ tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg));
+}
+
+static inline void vfp_load_reg32(TCGv_i32 var, int reg)
+{
+ tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg));
+}
+
+static inline void vfp_store_reg32(TCGv_i32 var, int reg)
+{
+ tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg));
+}
+
+/*
+ * The imm8 encodes the sign bit, enough bits to represent an exponent in
+ * the range 01....1xx to 10....0xx, and the most significant 4 bits of
+ * the mantissa; see VFPExpandImm() in the v8 ARM ARM.
+ */
+uint64_t vfp_expand_imm(int size, uint8_t imm8)
+{
+ uint64_t imm;
+
+ switch (size) {
+ case MO_64:
+ imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) |
+ (extract32(imm8, 6, 1) ? 0x3fc0 : 0x4000) |
+ extract32(imm8, 0, 6);
+ imm <<= 48;
+ break;
+ case MO_32:
+ imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) |
+ (extract32(imm8, 6, 1) ? 0x3e00 : 0x4000) |
+ (extract32(imm8, 0, 6) << 3);
+ imm <<= 16;
+ break;
+ case MO_16:
+ imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) |
+ (extract32(imm8, 6, 1) ? 0x3000 : 0x4000) |
+ (extract32(imm8, 0, 6) << 6);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return imm;
+}
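
As a quick sanity check of the MO_32 arm of this expansion, imm8 = 0x70 (sign 0, imm8<6> set, low bits 0x30) must produce 0x3f800000, the single-precision encoding of 1.0. A minimal standalone sketch; expand_imm_sp is an illustrative name, not QEMU API:

    #include <assert.h>
    #include <stdint.h>

    /* Mirror of the MO_32 case of vfp_expand_imm() for a host-side check. */
    static uint32_t expand_imm_sp(uint8_t imm8)
    {
        uint32_t imm = ((imm8 & 0x80) ? 0x8000 : 0) |
                       ((imm8 & 0x40) ? 0x3e00 : 0x4000) |
                       ((imm8 & 0x3f) << 3);
        return imm << 16;
    }

    int main(void)
    {
        assert(expand_imm_sp(0x70) == 0x3f800000u);  /* 1.0f  */
        assert(expand_imm_sp(0xf0) == 0xbf800000u);  /* -1.0f */
        return 0;
    }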
+
+/*
+ * Return the offset of a 16-bit half of the specified VFP single-precision
+ * register. If top is true, returns the top 16 bits; otherwise the bottom
+ * 16 bits.
+ */
+static inline long vfp_f16_offset(unsigned reg, bool top)
+{
+ long offs = vfp_reg_offset(false, reg);
+#if HOST_BIG_ENDIAN
+ if (!top) {
+ offs += 2;
+ }
+#else
+ if (top) {
+ offs += 2;
+ }
+#endif
+ return offs;
+}
+
+/*
+ * Generate code for M-profile lazy FP state preservation if needed;
+ * this corresponds to the pseudocode PreserveFPState() function.
+ */
+static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update)
+{
+ if (s->v7m_lspact) {
+ /*
+ * Lazy state saving affects external memory and also the NVIC,
+ * so we must mark it as an IO operation for icount (and cause
+ * this to be the last insn in the TB).
+ */
+ if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ gen_io_start();
+ }
+ gen_helper_v7m_preserve_fp_state(cpu_env);
+ /*
+ * If the preserve_fp_state helper doesn't throw an exception
+ * then it will clear LSPACT; we don't need to repeat this for
+ * any further FP insns in this TB.
+ */
+ s->v7m_lspact = false;
+ /*
+ * The helper might have zeroed VPR, so we do not know the
+ * correct value for the MVE_NO_PRED TB flag any more.
+ * If we're about to create a new fp context then that
+ * will precisely determine the MVE_NO_PRED value (see
+ * gen_update_fp_context()). Otherwise, we must:
+ * - set s->mve_no_pred to false, so this instruction
+ * is generated to use helper functions
+ * - end the TB now, without chaining to the next TB
+ */
+ if (skip_context_update || !s->v7m_new_fp_ctxt_needed) {
+ s->mve_no_pred = false;
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ }
+ }
+}
+
+/*
+ * Generate code for M-profile FP context handling: update the
+ * ownership of the FP context, and create a new context if
+ * necessary. This corresponds to the parts of the pseudocode
+ * ExecuteFPCheck() after the initial PreserveFPState() call.
+ */
+static void gen_update_fp_context(DisasContext *s)
+{
+ /* Update ownership of FP context: set FPCCR.S to match current state */
+ if (s->v8m_fpccr_s_wrong) {
+ TCGv_i32 tmp;
+
+ tmp = load_cpu_field(v7m.fpccr[M_REG_S]);
+ if (s->v8m_secure) {
+ tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK);
+ } else {
+ tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK);
+ }
+ store_cpu_field(tmp, v7m.fpccr[M_REG_S]);
+ /* Don't need to do this for any further FP insns in this TB */
+ s->v8m_fpccr_s_wrong = false;
+ }
+
+ if (s->v7m_new_fp_ctxt_needed) {
+ /*
+ * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
+ * the FPSCR, and VPR.
+ */
+ TCGv_i32 control, fpscr;
+ uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
+
+ fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(fpscr);
+ if (dc_isar_feature(aa32_mve, s)) {
+ store_cpu_field(tcg_constant_i32(0), v7m.vpr);
+ }
+ /*
+ * We just updated the FPSCR and VPR. Some of this state is cached
+ * in the MVE_NO_PRED TB flag. We want to avoid having to end the
+ * TB here, which means we need the new value of the MVE_NO_PRED
+ * flag to be exactly known here and the same for all executions.
+ * Luckily FPDSCR.LTPSIZE is always constant 4 and the VPR is
+ * always set to 0, so the new MVE_NO_PRED flag is always 1
+ * if and only if we have MVE.
+ *
+ * (The other FPSCR state cached in TB flags is VECLEN and VECSTRIDE,
+ * but those do not exist for M-profile, so are not relevant here.)
+ */
+ s->mve_no_pred = dc_isar_feature(aa32_mve, s);
+
+ if (s->v8m_secure) {
+ bits |= R_V7M_CONTROL_SFPA_MASK;
+ }
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_ori_i32(control, control, bits);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ /* Don't need to do this for any further FP insns in this TB */
+ s->v7m_new_fp_ctxt_needed = false;
+ }
+}
+
+/*
+ * Check that VFP access is enabled, A-profile specific version.
+ *
+ * If VFP is enabled, return true. If not, emit code to generate an
+ * appropriate exception and return false.
+ * The ignore_vfp_enabled argument specifies that we should ignore
+ * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX
+ * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns.
+ */
+static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled)
+{
+ if (s->fp_excp_el) {
+ /*
+ * The full syndrome is only used for HSR when HCPTR traps:
+ * For v8, when TA==0, coproc is RES0.
+ * For v7, any use of a Floating-point instruction or access
+ * to a Floating-point Extension register that is trapped to
+ * Hyp mode because of a trap configured in the HCPTR sets
+ * this field to 0xA.
+ */
+ int coproc = arm_dc_feature(s, ARM_FEATURE_V8) ? 0 : 0xa;
+ uint32_t syn = syn_fp_access_trap(1, 0xe, false, coproc);
+
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syn, s->fp_excp_el);
+ return false;
+ }
+
+ /*
+ * Note that rebuild_hflags_a32 has already accounted for being in EL0
+ * and the higher EL in A64 mode, etc. Unlike A64 mode, there do not
+ * appear to be any insns which touch VFP which are allowed.
+ */
+ if (s->sme_trap_nonstreaming) {
+ gen_exception_insn(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_Streaming,
+ curr_insn_len(s) == 2));
+ return false;
+ }
+
+ if (!s->vfp_enabled && !ignore_vfp_enabled) {
+ assert(!arm_dc_feature(s, ARM_FEATURE_M));
+ unallocated_encoding(s);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Check that VFP access is enabled, M-profile specific version.
+ *
+ * If VFP is enabled, do the necessary M-profile lazy-FP handling and then
+ * return true. If not, emit code to generate an appropriate exception and
+ * return false.
+ * skip_context_update is true to skip the "update FP context" part of this.
+ */
+bool vfp_access_check_m(DisasContext *s, bool skip_context_update)
+{
+ if (s->fp_excp_el) {
+ /*
+ * M-profile mostly catches the "FPU disabled" case early, in
+ * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP)
+ * which do coprocessor-checks are outside the large ranges of
+ * the encoding space handled by the patterns in m-nocp.decode,
+ * and for them we may need to raise NOCP here.
+ */
+ gen_exception_insn_el(s, 0, EXCP_NOCP,
+ syn_uncategorized(), s->fp_excp_el);
+ return false;
+ }
+
+ /* Handle M-profile lazy FP state mechanics */
+
+ /* Trigger lazy-state preservation if necessary */
+ gen_preserve_fp_state(s, skip_context_update);
+
+ if (!skip_context_update) {
+ /* Update ownership of FP context and create new FP context if needed */
+ gen_update_fp_context(s);
+ }
+
+ return true;
+}
+
+/*
+ * The most usual kind of VFP access check, for everything except
+ * FMXR/FMRX to the always-available special registers.
+ */
+bool vfp_access_check(DisasContext *s)
+{
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return vfp_access_check_m(s, false);
+ } else {
+ return vfp_access_check_a(s, false);
+ }
+}
+
+static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
+{
+ uint32_t rd, rn, rm;
+ int sz = a->sz;
+
+ if (!dc_isar_feature(aa32_vsel, s)) {
+ return false;
+ }
+
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vm | a->vn | a->vd) & 0x10)) {
+ return false;
+ }
+
+ rd = a->vd;
+ rn = a->vn;
+ rm = a->vm;
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (sz == 3) {
+ TCGv_i64 frn, frm, dest;
+ TCGv_i64 tmp, zero, zf, nf, vf;
+
+ zero = tcg_constant_i64(0);
+
+ frn = tcg_temp_new_i64();
+ frm = tcg_temp_new_i64();
+ dest = tcg_temp_new_i64();
+
+ zf = tcg_temp_new_i64();
+ nf = tcg_temp_new_i64();
+ vf = tcg_temp_new_i64();
+
+ tcg_gen_extu_i32_i64(zf, cpu_ZF);
+ tcg_gen_ext_i32_i64(nf, cpu_NF);
+ tcg_gen_ext_i32_i64(vf, cpu_VF);
+
+ vfp_load_reg64(frn, rn);
+ vfp_load_reg64(frm, rm);
+ switch (a->cc) {
+ case 0: /* eq: Z */
+ tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero, frn, frm);
+ break;
+ case 1: /* vs: V */
+ tcg_gen_movcond_i64(TCG_COND_LT, dest, vf, zero, frn, frm);
+ break;
+ case 2: /* ge: N == V -> N ^ V == 0 */
+ tmp = tcg_temp_new_i64();
+ tcg_gen_xor_i64(tmp, vf, nf);
+ tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, frn, frm);
+ tcg_temp_free_i64(tmp);
+ break;
+ case 3: /* gt: !Z && N == V */
+ tcg_gen_movcond_i64(TCG_COND_NE, dest, zf, zero, frn, frm);
+ tmp = tcg_temp_new_i64();
+ tcg_gen_xor_i64(tmp, vf, nf);
+ tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, dest, frm);
+ tcg_temp_free_i64(tmp);
+ break;
+ }
+ vfp_store_reg64(dest, rd);
+ tcg_temp_free_i64(frn);
+ tcg_temp_free_i64(frm);
+ tcg_temp_free_i64(dest);
+
+ tcg_temp_free_i64(zf);
+ tcg_temp_free_i64(nf);
+ tcg_temp_free_i64(vf);
+ } else {
+ TCGv_i32 frn, frm, dest;
+ TCGv_i32 tmp, zero;
+
+ zero = tcg_constant_i32(0);
+
+ frn = tcg_temp_new_i32();
+ frm = tcg_temp_new_i32();
+ dest = tcg_temp_new_i32();
+ vfp_load_reg32(frn, rn);
+ vfp_load_reg32(frm, rm);
+ switch (a->cc) {
+ case 0: /* eq: Z */
+ tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero, frn, frm);
+ break;
+ case 1: /* vs: V */
+ tcg_gen_movcond_i32(TCG_COND_LT, dest, cpu_VF, zero, frn, frm);
+ break;
+ case 2: /* ge: N == V -> N ^ V == 0 */
+ tmp = tcg_temp_new_i32();
+ tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
+ tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, frn, frm);
+ tcg_temp_free_i32(tmp);
+ break;
+ case 3: /* gt: !Z && N == V */
+ tcg_gen_movcond_i32(TCG_COND_NE, dest, cpu_ZF, zero, frn, frm);
+ tmp = tcg_temp_new_i32();
+ tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
+ tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, dest, frm);
+ tcg_temp_free_i32(tmp);
+ break;
+ }
+ /* For fp16 the top half is always zeroes */
+ if (sz == 1) {
+ tcg_gen_andi_i32(dest, dest, 0xffff);
+ }
+ vfp_store_reg32(dest, rd);
+ tcg_temp_free_i32(frn);
+ tcg_temp_free_i32(frm);
+ tcg_temp_free_i32(dest);
+ }
+
+ return true;
+}
+
+/*
+ * Table for converting the most common AArch32 encoding of
+ * rounding mode to arm_fprounding order (which matches the
+ * common AArch64 order); see ARM ARM pseudocode FPDecodeRM().
+ */
+static const uint8_t fp_decode_rm[] = {
+ FPROUNDING_TIEAWAY,
+ FPROUNDING_TIEEVEN,
+ FPROUNDING_POSINF,
+ FPROUNDING_NEGINF,
+};
+
+static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
+{
+ uint32_t rd, rm;
+ int sz = a->sz;
+ TCGv_ptr fpst;
+ TCGv_i32 tcg_rmode;
+ int rounding = fp_decode_rm[a->rm];
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vm | a->vd) & 0x10)) {
+ return false;
+ }
+
+ rd = a->vd;
+ rm = a->vm;
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }
+
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+
+ if (sz == 3) {
+ TCGv_i64 tcg_op;
+ TCGv_i64 tcg_res;
+ tcg_op = tcg_temp_new_i64();
+ tcg_res = tcg_temp_new_i64();
+ vfp_load_reg64(tcg_op, rm);
+ gen_helper_rintd(tcg_res, tcg_op, fpst);
+ vfp_store_reg64(tcg_res, rd);
+ tcg_temp_free_i64(tcg_op);
+ tcg_temp_free_i64(tcg_res);
+ } else {
+ TCGv_i32 tcg_op;
+ TCGv_i32 tcg_res;
+ tcg_op = tcg_temp_new_i32();
+ tcg_res = tcg_temp_new_i32();
+ vfp_load_reg32(tcg_op, rm);
+ if (sz == 1) {
+ gen_helper_rinth(tcg_res, tcg_op, fpst);
+ } else {
+ gen_helper_rints(tcg_res, tcg_op, fpst);
+ }
+ vfp_store_reg32(tcg_res, rd);
+ tcg_temp_free_i32(tcg_op);
+ tcg_temp_free_i32(tcg_res);
+ }
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ tcg_temp_free_i32(tcg_rmode);
+
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
+{
+ uint32_t rd, rm;
+ int sz = a->sz;
+ TCGv_ptr fpst;
+ TCGv_i32 tcg_rmode, tcg_shift;
+ int rounding = fp_decode_rm[a->rm];
+ bool is_signed = a->op;
+
+ if (!dc_isar_feature(aa32_vcvt_dr, s)) {
+ return false;
+ }
+
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ rd = a->vd;
+ rm = a->vm;
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }
+
+ tcg_shift = tcg_constant_i32(0);
+
+ tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+
+ if (sz == 3) {
+ TCGv_i64 tcg_double, tcg_res;
+ TCGv_i32 tcg_tmp;
+ tcg_double = tcg_temp_new_i64();
+ tcg_res = tcg_temp_new_i64();
+ tcg_tmp = tcg_temp_new_i32();
+ vfp_load_reg64(tcg_double, rm);
+ if (is_signed) {
+ gen_helper_vfp_tosld(tcg_res, tcg_double, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst);
+ }
+ tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res);
+ vfp_store_reg32(tcg_tmp, rd);
+ tcg_temp_free_i32(tcg_tmp);
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_double);
+ } else {
+ TCGv_i32 tcg_single, tcg_res;
+ tcg_single = tcg_temp_new_i32();
+ tcg_res = tcg_temp_new_i32();
+ vfp_load_reg32(tcg_single, rm);
+ if (sz == 1) {
+ if (is_signed) {
+ gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst);
+ }
+ } else {
+ if (is_signed) {
+ gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
+ }
+ }
+ vfp_store_reg32(tcg_res, rd);
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_single);
+ }
+
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ tcg_temp_free_i32(tcg_rmode);
+
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+bool mve_skip_vmov(DisasContext *s, int vn, int index, int size)
+{
+ /*
+ * In a CPU with MVE, the VMOV (vector lane to general-purpose register)
+ * and VMOV (general-purpose register to vector lane) insns are not
+ * predicated, but they are subject to beatwise execution if they are
+ * not in an IT block.
+ *
+ * Since our implementation always executes all 4 beats in one tick,
+ * this means only that if PSR.ECI says we should not be executing
+ * the beat corresponding to the lane of the vector register being
+ * accessed then we should skip performing the move, and that we need
+ * to do the usual check for bad ECI state and advance of ECI state.
+ *
+ * Note that if PSR.ECI is non-zero then we cannot be in an IT block.
+ *
+ * Return true if this VMOV scalar <-> gpreg should be skipped because
+ * the MVE PSR.ECI state says we skip the beat where the store happens.
+ */
+
+ /* Calculate the byte offset into Qn which we're going to access */
+ int ofs = (index << size) + ((vn & 1) * 8);
+
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ switch (s->eci) {
+ case ECI_NONE:
+ return false;
+ case ECI_A0:
+ return ofs < 4;
+ case ECI_A0A1:
+ return ofs < 8;
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return ofs < 12;
+ default:
+ g_assert_not_reached();
+ }
+}
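
Concretely, each ECI state above encodes how many 4-byte beats of the 16-byte Q register have already executed, and the VMOV is skipped when the accessed lane falls inside that already-executed range. A host-side restatement of the test (illustrative only; beat_already_done is not a QEMU helper):

    #include <assert.h>

    /* Each beat covers 4 bytes of the 16-byte Q register. */
    static int beat_already_done(int ofs, int beats_done)
    {
        return ofs < beats_done * 4;
    }

    int main(void)
    {
        /* D2 lane 1, size MO_32: ofs = (1 << 2) + (2 & 1) * 8 = 4. */
        int ofs = (1 << 2) + (2 & 1) * 8;

        assert(ofs == 4);
        assert(!beat_already_done(ofs, 1));  /* ECI_A0: beat A1 not yet done   */
        assert(beat_already_done(ofs, 2));   /* ECI_A0A1: beat A1 already done */
        return 0;
    }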
+
+static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
+{
+ /* VMOV scalar to general purpose register */
+ TCGv_i32 tmp;
+
+ /*
+ * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has
+ * all sizes, whether the CPU has fp or not.
+ */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ if (a->size == MO_32
+ ? !dc_isar_feature(aa32_fpsp_v2, s)
+ : !arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
+ return false;
+ }
+
+ if (dc_isar_feature(aa32_mve, s)) {
+ if (!mve_eci_check(s)) {
+ return true;
+ }
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vn, a->index,
+ a->size | (a->u ? 0 : MO_SIGN));
+ store_reg(s, a->rt, tmp);
+ }
+
+ if (dc_isar_feature(aa32_mve, s)) {
+ mve_update_and_store_eci(s);
+ }
+ return true;
+}
+
+static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a)
+{
+ /* VMOV general purpose register to scalar */
+ TCGv_i32 tmp;
+
+ /*
+ * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has
+ * all sizes, whether the CPU has fp or not.
+ */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ if (a->size == MO_32
+ ? !dc_isar_feature(aa32_fpsp_v2, s)
+ : !arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
+ return false;
+ }
+
+ if (dc_isar_feature(aa32_mve, s)) {
+ if (!mve_eci_check(s)) {
+ return true;
+ }
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
+ tmp = load_reg(s, a->rt);
+ write_neon_element32(tmp, a->vn, a->index, a->size);
+ tcg_temp_free_i32(tmp);
+ }
+
+ if (dc_isar_feature(aa32_mve, s)) {
+ mve_update_and_store_eci(s);
+ }
+ return true;
+}
+
+static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
+{
+ /* VDUP (general purpose register) */
+ TCGv_i32 tmp;
+ int size, vec_size;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
+ return false;
+ }
+
+ if (a->b && a->e) {
+ return false;
+ }
+
+ if (a->q && (a->vn & 1)) {
+ return false;
+ }
+
+ vec_size = a->q ? 16 : 8;
+ if (a->b) {
+ size = 0;
+ } else if (a->e) {
+ size = 1;
+ } else {
+ size = 2;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = load_reg(s, a->rt);
+ tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(a->vn),
+ vec_size, vec_size, tmp);
+ tcg_temp_free_i32(tmp);
+
+ return true;
+}
+
+static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
+{
+ TCGv_i32 tmp;
+ bool ignore_vfp_enabled = false;
+
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ /* M profile version was already handled in m-nocp.decode */
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ switch (a->reg) {
+ case ARM_VFP_FPSID:
+ /*
+ * VFPv2 allows access to FPSID from userspace; VFPv3 restricts
+ * all ID registers to privileged access only.
+ */
+ if (IS_USER(s) && dc_isar_feature(aa32_fpsp_v3, s)) {
+ return false;
+ }
+ ignore_vfp_enabled = true;
+ break;
+ case ARM_VFP_MVFR0:
+ case ARM_VFP_MVFR1:
+ if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_MVFR)) {
+ return false;
+ }
+ ignore_vfp_enabled = true;
+ break;
+ case ARM_VFP_MVFR2:
+ if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+ ignore_vfp_enabled = true;
+ break;
+ case ARM_VFP_FPSCR:
+ break;
+ case ARM_VFP_FPEXC:
+ if (IS_USER(s)) {
+ return false;
+ }
+ ignore_vfp_enabled = true;
+ break;
+ case ARM_VFP_FPINST:
+ case ARM_VFP_FPINST2:
+ /* Not present in VFPv3 */
+ if (IS_USER(s) || dc_isar_feature(aa32_fpsp_v3, s)) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * Call vfp_access_check_a() directly, because we need to tell
+ * it to ignore FPEXC.EN for some register accesses.
+ */
+ if (!vfp_access_check_a(s, ignore_vfp_enabled)) {
+ return true;
+ }
+
+ if (a->l) {
+ /* VMRS, move VFP special register to gp register */
+ switch (a->reg) {
+ case ARM_VFP_MVFR0:
+ case ARM_VFP_MVFR1:
+ case ARM_VFP_MVFR2:
+ case ARM_VFP_FPSID:
+ if (s->current_el == 1) {
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ gen_helper_check_hcr_el2_trap(cpu_env,
+ tcg_constant_i32(a->rt),
+ tcg_constant_i32(a->reg));
+ }
+ /* fall through */
+ case ARM_VFP_FPEXC:
+ case ARM_VFP_FPINST:
+ case ARM_VFP_FPINST2:
+ tmp = load_cpu_field(vfp.xregs[a->reg]);
+ break;
+ case ARM_VFP_FPSCR:
+ if (a->rt == 15) {
+ tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+ } else {
+ tmp = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (a->rt == 15) {
+ /* Set the 4 flag bits in the CPSR. */
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp);
+ } else {
+ store_reg(s, a->rt, tmp);
+ }
+ } else {
+ /* VMSR, move gp register to VFP special register */
+ switch (a->reg) {
+ case ARM_VFP_FPSID:
+ case ARM_VFP_MVFR0:
+ case ARM_VFP_MVFR1:
+ case ARM_VFP_MVFR2:
+ /* Writes are ignored. */
+ break;
+ case ARM_VFP_FPSCR:
+ tmp = load_reg(s, a->rt);
+ gen_helper_vfp_set_fpscr(cpu_env, tmp);
+ tcg_temp_free_i32(tmp);
+ gen_lookup_tb(s);
+ break;
+ case ARM_VFP_FPEXC:
+ /*
+ * TODO: VFP subarchitecture support.
+ * For now, keep the EN bit only
+ */
+ tmp = load_reg(s, a->rt);
+ tcg_gen_andi_i32(tmp, tmp, 1 << 30);
+ store_cpu_field(tmp, vfp.xregs[a->reg]);
+ gen_lookup_tb(s);
+ break;
+ case ARM_VFP_FPINST:
+ case ARM_VFP_FPINST2:
+ tmp = load_reg(s, a->rt);
+ store_cpu_field(tmp, vfp.xregs[a->reg]);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ return true;
+}
+
+static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a)
+{
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (a->rt == 15) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (a->l) {
+ /* VFP to general purpose register */
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vn);
+ tcg_gen_andi_i32(tmp, tmp, 0xffff);
+ store_reg(s, a->rt, tmp);
+ } else {
+ /* general purpose register to VFP */
+ tmp = load_reg(s, a->rt);
+ tcg_gen_andi_i32(tmp, tmp, 0xffff);
+ vfp_store_reg32(tmp, a->vn);
+ tcg_temp_free_i32(tmp);
+ }
+
+ return true;
+}
+
+static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a)
+{
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (a->l) {
+ /* VFP to general purpose register */
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vn);
+ if (a->rt == 15) {
+ /* Set the 4 flag bits in the CPSR. */
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp);
+ } else {
+ store_reg(s, a->rt, tmp);
+ }
+ } else {
+ /* general purpose register to VFP */
+ tmp = load_reg(s, a->rt);
+ vfp_store_reg32(tmp, a->vn);
+ tcg_temp_free_i32(tmp);
+ }
+
+ return true;
+}
+
+static bool trans_VMOV_64_sp(DisasContext *s, arg_VMOV_64_sp *a)
+{
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ /*
+ * VMOV between two general-purpose registers and two single precision
+ * floating point registers
+ */
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (a->op) {
+ /* fpreg to gpreg */
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ store_reg(s, a->rt, tmp);
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm + 1);
+ store_reg(s, a->rt2, tmp);
+ } else {
+ /* gpreg to fpreg */
+ tmp = load_reg(s, a->rt);
+ vfp_store_reg32(tmp, a->vm);
+ tcg_temp_free_i32(tmp);
+ tmp = load_reg(s, a->rt2);
+ vfp_store_reg32(tmp, a->vm + 1);
+ tcg_temp_free_i32(tmp);
+ }
+
+ return true;
+}
+
+static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a)
+{
+ TCGv_i32 tmp;
+
+ /*
+ * VMOV between two general-purpose registers and one double precision
+ * floating point register. Note that this does not require support
+ * for double precision arithmetic.
+ */
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (a->op) {
+ /* fpreg to gpreg */
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm * 2);
+ store_reg(s, a->rt, tmp);
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm * 2 + 1);
+ store_reg(s, a->rt2, tmp);
+ } else {
+ /* gpreg to fpreg */
+ tmp = load_reg(s, a->rt);
+ vfp_store_reg32(tmp, a->vm * 2);
+ tcg_temp_free_i32(tmp);
+ tmp = load_reg(s, a->rt2);
+ vfp_store_reg32(tmp, a->vm * 2 + 1);
+ tcg_temp_free_i32(tmp);
+ }
+
+ return true;
+}
+
+static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr, tmp;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */
+ offset = a->imm << 1;
+ if (!a->u) {
+ offset = -offset;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, offset);
+ tmp = tcg_temp_new_i32();
+ if (a->l) {
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN);
+ vfp_store_reg32(tmp, a->vd);
+ } else {
+ vfp_load_reg32(tmp, a->vd);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN);
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(addr);
+
+ return true;
+}
+
+static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr, tmp;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ offset = a->imm << 2;
+ if (!a->u) {
+ offset = -offset;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, offset);
+ tmp = tcg_temp_new_i32();
+ if (a->l) {
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ vfp_store_reg32(tmp, a->vd);
+ } else {
+ vfp_load_reg32(tmp, a->vd);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(addr);
+
+ return true;
+}
+
+static bool trans_VLDR_VSTR_dp(DisasContext *s, arg_VLDR_VSTR_dp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr;
+ TCGv_i64 tmp;
+
+ /* Note that this does not require support for double arithmetic. */
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ offset = a->imm << 2;
+ if (!a->u) {
+ offset = -offset;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, offset);
+ tmp = tcg_temp_new_i64();
+ if (a->l) {
+ gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4);
+ vfp_store_reg64(tmp, a->vd);
+ } else {
+ vfp_load_reg64(tmp, a->vd);
+ gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4);
+ }
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i32(addr);
+
+ return true;
+}
+
+static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr, tmp;
+ int i, n;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ n = a->imm;
+
+ if (n == 0 || (a->vd + n) > 32) {
+ /*
+ * UNPREDICTABLE cases for bad immediates: we choose to
+ * UNDEF to avoid generating huge numbers of TCG ops
+ */
+ return false;
+ }
+ if (a->rn == 15 && a->w) {
+ /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */
+ return false;
+ }
+
+ s->eci_handled = true;
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, 0);
+ if (a->p) {
+ /* pre-decrement */
+ tcg_gen_addi_i32(addr, addr, -(a->imm << 2));
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ /*
+ * Here 'addr' is the lowest address we will store to,
+ * and is either the old SP (if post-increment) or
+ * the new SP (if pre-decrement). For post-increment
+ * where the old value is below the limit and the new
+ * value is above, it is UNKNOWN whether the limit check
+ * triggers; we choose to trigger.
+ */
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ offset = 4;
+ tmp = tcg_temp_new_i32();
+ for (i = 0; i < n; i++) {
+ if (a->l) {
+ /* load */
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ vfp_store_reg32(tmp, a->vd + i);
+ } else {
+ /* store */
+ vfp_load_reg32(tmp, a->vd + i);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ }
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ tcg_temp_free_i32(tmp);
+ if (a->w) {
+ /* writeback */
+ if (a->p) {
+ offset = -offset * n;
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+
+ clear_eci_state(s);
+ return true;
+}
+
+static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr;
+ TCGv_i64 tmp;
+ int i, n;
+
+ /* Note that this does not require support for double arithmetic. */
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ n = a->imm >> 1;
+
+ if (n == 0 || (a->vd + n) > 32 || n > 16) {
+ /*
+ * UNPREDICTABLE cases for bad immediates: we choose to
+ * UNDEF to avoid generating huge numbers of TCG ops
+ */
+ return false;
+ }
+ if (a->rn == 15 && a->w) {
+ /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd + n) > 16) {
+ return false;
+ }
+
+ s->eci_handled = true;
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, 0);
+ if (a->p) {
+ /* pre-decrement */
+ tcg_gen_addi_i32(addr, addr, -(a->imm << 2));
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ /*
+ * Here 'addr' is the lowest address we will store to,
+ * and is either the old SP (if post-increment) or
+ * the new SP (if pre-decrement). For post-increment
+ * where the old value is below the limit and the new
+ * value is above, it is UNKNOWN whether the limit check
+ * triggers; we choose to trigger.
+ */
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ offset = 8;
+ tmp = tcg_temp_new_i64();
+ for (i = 0; i < n; i++) {
+ if (a->l) {
+ /* load */
+ gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4);
+ vfp_store_reg64(tmp, a->vd + i);
+ } else {
+ /* store */
+ vfp_load_reg64(tmp, a->vd + i);
+ gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4);
+ }
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ tcg_temp_free_i64(tmp);
+ if (a->w) {
+ /* writeback */
+ if (a->p) {
+ offset = -offset * n;
+ } else if (a->imm & 1) {
+ offset = 4;
+ } else {
+ offset = 0;
+ }
+
+ if (offset != 0) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+
+ clear_eci_state(s);
+ return true;
+}
+
+/*
+ * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp().
+ * The callback should emit code to write a value to vd. If
+ * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd
+ * will contain the old value of the relevant VFP register;
+ * otherwise it must be written to only.
+ */
+typedef void VFPGen3OpSPFn(TCGv_i32 vd,
+ TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst);
+typedef void VFPGen3OpDPFn(TCGv_i64 vd,
+ TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst);
+
+/*
+ * Types for callbacks for do_vfp_2op_sp() and do_vfp_2op_dp().
+ * The callback should emit code to write a value to vd (which
+ * should be written to only).
+ */
+typedef void VFPGen2OpSPFn(TCGv_i32 vd, TCGv_i32 vm);
+typedef void VFPGen2OpDPFn(TCGv_i64 vd, TCGv_i64 vm);
+
+/*
+ * Return true if the specified S reg is in a scalar bank
+ * (ie if it is s0..s7)
+ */
+static inline bool vfp_sreg_is_scalar(int reg)
+{
+ return (reg & 0x18) == 0;
+}
+
+/*
+ * Return true if the specified D reg is in a scalar bank
+ * (ie if it is d0..d3 or d16..d19)
+ */
+static inline bool vfp_dreg_is_scalar(int reg)
+{
+ return (reg & 0xc) == 0;
+}
+
+/*
+ * Advance the S reg number forwards by delta within its bank
+ * (ie increment the low 3 bits but leave the rest the same)
+ */
+static inline int vfp_advance_sreg(int reg, int delta)
+{
+ return ((reg + delta) & 0x7) | (reg & ~0x7);
+}
+
+/*
+ * Advance the D reg number forwards by delta within its bank
+ * (ie increment the low 2 bits but leave the rest the same)
+ */
+static inline int vfp_advance_dreg(int reg, int delta)
+{
+ return ((reg + delta) & 0x3) | (reg & ~0x3);
+}
+
+/*
+ * Perform a 3-operand VFP data processing instruction. fn is the
+ * callback to do the actual operation; this function deals with the
+ * code to handle looping around for VFP vector processing.
+ */
+static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i32 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_sreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = s->vec_stride + 1;
+
+ if (vfp_sreg_is_scalar(vm)) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i32();
+ f1 = tcg_temp_new_i32();
+ fd = tcg_temp_new_i32();
+ fpst = fpstatus_ptr(FPST_FPCR);
+
+ vfp_load_reg32(f0, vn);
+ vfp_load_reg32(f1, vm);
+
+ for (;;) {
+ if (reads_vd) {
+ vfp_load_reg32(fd, vd);
+ }
+ fn(fd, f0, f1, fpst);
+ vfp_store_reg32(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_sreg(vd, delta_d);
+ vn = vfp_advance_sreg(vn, delta_d);
+ vfp_load_reg32(f0, vn);
+ if (delta_m) {
+ vm = vfp_advance_sreg(vm, delta_m);
+ vfp_load_reg32(f1, vm);
+ }
+ }
+
+ tcg_temp_free_i32(f0);
+ tcg_temp_free_i32(f1);
+ tcg_temp_free_i32(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
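
To make the short-vector looping concrete: with veclen = 3, vec_stride = 0 (so delta_d = 1), vd = s8, vn = s16 and a scalar vm = s0, the loop performs four operations, writing s8..s11 = op(s16..s19, s0) while vm stays fixed. A standalone sketch of just the index walk, reusing a plain-C mirror of vfp_advance_sreg() (illustrative, not part of the patch):

    #include <assert.h>

    /* Plain-C mirror of vfp_advance_sreg(): advance within the 8-register bank. */
    static int advance_sreg(int reg, int delta)
    {
        return ((reg + delta) & 0x7) | (reg & ~0x7);
    }

    int main(void)
    {
        int vd = 8, vn = 16, delta_d = 1, veclen = 3;

        /* Banks wrap: s6 advanced by 2 lands on s0, not s8. */
        assert(advance_sreg(6, 2) == 0);

        /* The remaining three iterations of the loop above. */
        while (veclen--) {
            vd = advance_sreg(vd, delta_d);
            vn = advance_sreg(vn, delta_d);
        }
        assert(vd == 11 && vn == 19);   /* last op is s11 = op(s19, s0) */
        return 0;
    }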
+
+static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ /*
+ * Do a half-precision operation. Functionally this is
+ * the same as do_vfp_3op_sp(), except:
+ * - it uses the FPST_FPCR_F16
+ * - it doesn't need the VFP vector handling (fp16 is a
+ * v8 feature, and in v8 VFP vectors don't exist)
+ * - it does the aa32_fp16_arith feature test
+ */
+ TCGv_i32 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ f0 = tcg_temp_new_i32();
+ f1 = tcg_temp_new_i32();
+ fd = tcg_temp_new_i32();
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ vfp_load_reg32(f0, vn);
+ vfp_load_reg32(f1, vm);
+
+ if (reads_vd) {
+ vfp_load_reg32(fd, vd);
+ }
+ fn(fd, f0, f1, fpst);
+ vfp_store_reg32(fd, vd);
+
+ tcg_temp_free_i32(f0);
+ tcg_temp_free_i32(f1);
+ tcg_temp_free_i32(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i64 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vn | vm) & 0x10)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_dreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = (s->vec_stride >> 1) + 1;
+
+ if (vfp_dreg_is_scalar(vm)) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i64();
+ f1 = tcg_temp_new_i64();
+ fd = tcg_temp_new_i64();
+ fpst = fpstatus_ptr(FPST_FPCR);
+
+ vfp_load_reg64(f0, vn);
+ vfp_load_reg64(f1, vm);
+
+ for (;;) {
+ if (reads_vd) {
+ vfp_load_reg64(fd, vd);
+ }
+ fn(fd, f0, f1, fpst);
+ vfp_store_reg64(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_dreg(vd, delta_d);
+ vn = vfp_advance_dreg(vn, delta_d);
+ vfp_load_reg64(f0, vn);
+ if (delta_m) {
+ vm = vfp_advance_dreg(vm, delta_m);
+ vfp_load_reg64(f1, vm);
+ }
+ }
+
+ tcg_temp_free_i64(f0);
+ tcg_temp_free_i64(f1);
+ tcg_temp_free_i64(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i32 f0, fd;
+
+ /* Note that the caller must check the aa32_fpsp_v2 feature. */
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_sreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = s->vec_stride + 1;
+
+ if (vfp_sreg_is_scalar(vm)) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i32();
+ fd = tcg_temp_new_i32();
+
+ vfp_load_reg32(f0, vm);
+
+ for (;;) {
+ fn(fd, f0);
+ vfp_store_reg32(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+
+ if (delta_m == 0) {
+ /* single source one-many */
+ while (veclen--) {
+ vd = vfp_advance_sreg(vd, delta_d);
+ vfp_store_reg32(fd, vd);
+ }
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_sreg(vd, delta_d);
+ vm = vfp_advance_sreg(vm, delta_m);
+ vfp_load_reg32(f0, vm);
+ }
+
+ tcg_temp_free_i32(f0);
+ tcg_temp_free_i32(fd);
+
+ return true;
+}
+
+static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
+{
+ /*
+ * Do a half-precision operation. Functionally this is
+ * the same as do_vfp_2op_sp(), except:
+ * - it doesn't need the VFP vector handling (fp16 is a
+ * v8 feature, and in v8 VFP vectors don't exist)
+ * - it does the aa32_fp16_arith feature test
+ */
+ TCGv_i32 f0;
+
+ /* Note that the caller must check the aa32_fp16_arith feature */
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ f0 = tcg_temp_new_i32();
+ vfp_load_reg32(f0, vm);
+ fn(f0, f0);
+ vfp_store_reg32(f0, vd);
+ tcg_temp_free_i32(f0);
+
+ return true;
+}
+
+static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i64 f0, fd;
+
+ /* Note that the caller must check the aa32_fpdp_v2 feature. */
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vm) & 0x10)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_dreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = (s->vec_stride >> 1) + 1;
+
+ if (vfp_dreg_is_scalar(vm)) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i64();
+ fd = tcg_temp_new_i64();
+
+ vfp_load_reg64(f0, vm);
+
+ for (;;) {
+ fn(fd, f0);
+ vfp_store_reg64(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+
+ if (delta_m == 0) {
+ /* single source one-many */
+ while (veclen--) {
+ vd = vfp_advance_dreg(vd, delta_d);
+ vfp_store_reg64(fd, vd);
+ }
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_dreg(vd, delta_d);
+        vm = vfp_advance_dreg(vm, delta_m);
+ vfp_load_reg64(f0, vm);
+ }
+
+ tcg_temp_free_i64(f0);
+ tcg_temp_free_i64(fd);
+
+ return true;
+}
+
+static void gen_VMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_hp(DisasContext *s, arg_VMLA_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_muls(tmp, vn, vm, fpst);
+ gen_helper_vfp_adds(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ gen_helper_vfp_muld(tmp, vn, vm, fpst);
+ gen_helper_vfp_addd(vd, vd, tmp, fpst);
+ tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VMLS: vd = vd + -(vn * vm)
+ * Note that order of inputs to the add matters for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(tmp, tmp);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLS_hp(DisasContext *s, arg_VMLS_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VMLS_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VMLS: vd = vd + -(vn * vm)
+ * Note that order of inputs to the add matters for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_muls(tmp, vn, vm, fpst);
+ gen_helper_vfp_negs(tmp, tmp);
+ gen_helper_vfp_adds(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLS_sp(DisasContext *s, arg_VMLS_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VMLS_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+ /*
+ * VMLS: vd = vd + -(vn * vm)
+ * Note that order of inputs to the add matters for NaNs.
+ */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ gen_helper_vfp_muld(tmp, vn, vm, fpst);
+ gen_helper_vfp_negd(tmp, tmp);
+ gen_helper_vfp_addd(vd, vd, tmp, fpst);
+ tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VMLS_dp(DisasContext *s, arg_VMLS_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VMLS_dp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VNMLS: -fd + (fn * fm)
+ * Note that it isn't valid to replace (-A + B) with (B - A) or similar
+ * plausible looking simplifications because this will give wrong results
+ * for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(vd, vd);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLS_hp(DisasContext *s, arg_VNMLS_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMLS_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VNMLS: -fd + (fn * fm)
+ * Note that it isn't valid to replace (-A + B) with (B - A) or similar
+ * plausible looking simplifications because this will give wrong results
+ * for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_muls(tmp, vn, vm, fpst);
+ gen_helper_vfp_negs(vd, vd);
+ gen_helper_vfp_adds(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLS_sp(DisasContext *s, arg_VNMLS_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VNMLS_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+ /*
+ * VNMLS: -fd + (fn * fm)
+ * Note that it isn't valid to replace (-A + B) with (B - A) or similar
+ * plausible looking simplifications because this will give wrong results
+ * for NaNs.
+ */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ gen_helper_vfp_muld(tmp, vn, vm, fpst);
+ gen_helper_vfp_negd(vd, vd);
+ gen_helper_vfp_addd(vd, vd, tmp, fpst);
+ tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VNMLS_dp(DisasContext *s, arg_VNMLS_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VNMLS_dp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMLA: -fd + -(fn * fm) */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(tmp, tmp);
+ gen_helper_vfp_negh(vd, vd);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLA_hp(DisasContext *s, arg_VNMLA_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMLA: -fd + -(fn * fm) */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_muls(tmp, vn, vm, fpst);
+ gen_helper_vfp_negs(tmp, tmp);
+ gen_helper_vfp_negs(vd, vd);
+ gen_helper_vfp_adds(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLA_sp(DisasContext *s, arg_VNMLA_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VNMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /* VNMLA: -fd + -(fn * fm) */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ gen_helper_vfp_muld(tmp, vn, vm, fpst);
+ gen_helper_vfp_negd(tmp, tmp);
+ gen_helper_vfp_negd(vd, vd);
+ gen_helper_vfp_addd(vd, vd, tmp, fpst);
+ tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VNMLA_dp(DisasContext *s, arg_VNMLA_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VNMLA_dp, a->vd, a->vn, a->vm, true);
+}
+
+static bool trans_VMUL_hp(DisasContext *s, arg_VMUL_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_mulh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMUL_dp(DisasContext *s, arg_VMUL_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_helper_vfp_muld, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMUL: -(fn * fm) */
+ gen_helper_vfp_mulh(vd, vn, vm, fpst);
+ gen_helper_vfp_negh(vd, vd);
+}
+
+static bool trans_VNMUL_hp(DisasContext *s, arg_VNMUL_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMUL_hp, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMUL: -(fn * fm) */
+ gen_helper_vfp_muls(vd, vn, vm, fpst);
+ gen_helper_vfp_negs(vd, vd);
+}
+
+static bool trans_VNMUL_sp(DisasContext *s, arg_VNMUL_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VNMUL_sp, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+ /* VNMUL: -(fn * fm) */
+ gen_helper_vfp_muld(vd, vn, vm, fpst);
+ gen_helper_vfp_negd(vd, vd);
+}
+
+static bool trans_VNMUL_dp(DisasContext *s, arg_VNMUL_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VNMUL_dp, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_hp(DisasContext *s, arg_VADD_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_addh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_sp(DisasContext *s, arg_VADD_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_helper_vfp_adds, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_dp(DisasContext *s, arg_VADD_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_helper_vfp_addd, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VSUB_hp(DisasContext *s, arg_VSUB_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_subh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VSUB_sp(DisasContext *s, arg_VSUB_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_helper_vfp_subs, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VSUB_dp(DisasContext *s, arg_VSUB_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_helper_vfp_subd, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VDIV_hp(DisasContext *s, arg_VDIV_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_divh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VDIV_sp(DisasContext *s, arg_VDIV_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_helper_vfp_divs, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMINNM_hp(DisasContext *s, arg_VMINNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_hp(s, gen_helper_vfp_minnumh,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMAXNM_hp(DisasContext *s, arg_VMAXNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_hp(s, gen_helper_vfp_maxnumh,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMINNM_sp(DisasContext *s, arg_VMINNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_sp(s, gen_helper_vfp_minnums,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMAXNM_sp(DisasContext *s, arg_VMAXNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_sp(s, gen_helper_vfp_maxnums,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMINNM_dp(DisasContext *s, arg_VMINNM_dp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_dp(s, gen_helper_vfp_minnumd,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_dp(s, gen_helper_vfp_maxnumd,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
+{
+ /*
+ * VFNMA : fd = muladd(-fd, fn, fm)
+ * VFNMS : fd = muladd(-fd, -fn, fm)
+ * VFMA : fd = muladd( fd, fn, fm)
+ * VFMS : fd = muladd( fd, -fn, fm)
+ *
+ * These are fused multiply-add, and must be done as one floating
+ * point operation with no rounding between the multiplication and
+ * addition steps. NB that doing the negations here as separate
+ * steps is correct: an input NaN should come out with its sign
+ * bit flipped if it is a negated-input.
+ */
+ TCGv_ptr fpst;
+ TCGv_i32 vn, vm, vd;
+
+ /*
+ * Present in VFPv4 only, and only with the FP16 extension.
+ * Note that we can't rely on the SIMDFMAC check alone, because
+ * in a Neon-no-VFP core that ID register field will be non-zero.
+ */
+ if (!dc_isar_feature(aa32_fp16_arith, s) ||
+ !dc_isar_feature(aa32_simdfmac, s) ||
+ !dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vn = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i32();
+
+ vfp_load_reg32(vn, a->vn);
+ vfp_load_reg32(vm, a->vm);
+ if (neg_n) {
+ /* VFNMS, VFMS */
+ gen_helper_vfp_negh(vn, vn);
+ }
+ vfp_load_reg32(vd, a->vd);
+ if (neg_d) {
+ /* VFNMA, VFNMS */
+ gen_helper_vfp_negh(vd, vd);
+ }
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst);
+ vfp_store_reg32(vd, a->vd);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(vn);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i32(vd);
+
+ return true;
+}
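
The neg_n/neg_d combinations map onto the four insns as in the comment above; in scalar terms (single precision shown, the hp/dp cases are analogous) they are plain fused multiply-adds with one or both of fn and the fd addend negated first. A standalone sketch using the C fma family (illustrative, not part of the patch):

    #include <assert.h>
    #include <math.h>

    /* Scalar restatement of the four variants, using fmaf(x, y, addend). */
    static float vfma (float fd, float fn, float fm) { return fmaf( fn, fm,  fd); }
    static float vfms (float fd, float fn, float fm) { return fmaf(-fn, fm,  fd); }
    static float vfnma(float fd, float fn, float fm) { return fmaf( fn, fm, -fd); }
    static float vfnms(float fd, float fn, float fm) { return fmaf(-fn, fm, -fd); }

    int main(void)
    {
        assert(vfma (1.0f, 2.0f, 3.0f) ==  7.0f);
        assert(vfms (1.0f, 2.0f, 3.0f) == -5.0f);
        assert(vfnma(1.0f, 2.0f, 3.0f) ==  5.0f);
        assert(vfnms(1.0f, 2.0f, 3.0f) == -7.0f);
        return 0;
    }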
+
+static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
+{
+ /*
+ * VFNMA : fd = muladd(-fd, fn, fm)
+ * VFNMS : fd = muladd(-fd, -fn, fm)
+ * VFMA : fd = muladd( fd, fn, fm)
+ * VFMS : fd = muladd( fd, -fn, fm)
+ *
+ * These are fused multiply-add, and must be done as one floating
+ * point operation with no rounding between the multiplication and
+ * addition steps. NB that doing the negations here as separate
+ * steps is correct: an input NaN should come out with its sign
+ * bit flipped if it is a negated-input.
+ */
+ TCGv_ptr fpst;
+ TCGv_i32 vn, vm, vd;
+
+ /*
+ * Present in VFPv4 only.
+ * Note that we can't rely on the SIMDFMAC check alone, because
+ * in a Neon-no-VFP core that ID register field will be non-zero.
+ */
+ if (!dc_isar_feature(aa32_simdfmac, s) ||
+ !dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+ /*
+ * In v7A, UNPREDICTABLE with non-zero vector length/stride; from
+ * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A.
+ */
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vn = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i32();
+
+ vfp_load_reg32(vn, a->vn);
+ vfp_load_reg32(vm, a->vm);
+ if (neg_n) {
+ /* VFNMS, VFMS */
+ gen_helper_vfp_negs(vn, vn);
+ }
+ vfp_load_reg32(vd, a->vd);
+ if (neg_d) {
+ /* VFNMA, VFNMS */
+ gen_helper_vfp_negs(vd, vd);
+ }
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_vfp_muladds(vd, vn, vm, vd, fpst);
+ vfp_store_reg32(vd, a->vd);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(vn);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i32(vd);
+
+ return true;
+}
+
+static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
+{
+ /*
+ * VFNMA : fd = muladd(-fd, fn, fm)
+ * VFNMS : fd = muladd(-fd, -fn, fm)
+ * VFMA : fd = muladd( fd, fn, fm)
+ * VFMS : fd = muladd( fd, -fn, fm)
+ *
+ * These are fused multiply-add, and must be done as one floating
+ * point operation with no rounding between the multiplication and
+ * addition steps. NB that doing the negations here as separate
+ * steps is correct: an input NaN should come out with its sign
+ * bit flipped if it is a negated input.
+ */
+ TCGv_ptr fpst;
+ TCGv_i64 vn, vm, vd;
+
+ /*
+ * Present in VFPv4 only.
+ * Note that we can't rely on the SIMDFMAC check alone, because
+ * in a Neon-no-VFP core that ID register field will be non-zero.
+ */
+ if (!dc_isar_feature(aa32_simdfmac, s) ||
+ !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+ /*
+ * In v7A, UNPREDICTABLE with non-zero vector length/stride; from
+ * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A.
+ */
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vn = tcg_temp_new_i64();
+ vm = tcg_temp_new_i64();
+ vd = tcg_temp_new_i64();
+
+ vfp_load_reg64(vn, a->vn);
+ vfp_load_reg64(vm, a->vm);
+ if (neg_n) {
+ /* VFNMS, VFMS */
+ gen_helper_vfp_negd(vn, vn);
+ }
+ vfp_load_reg64(vd, a->vd);
+ if (neg_d) {
+ /* VFNMA, VFNMS */
+ gen_helper_vfp_negd(vd, vd);
+ }
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_vfp_muladdd(vd, vn, vm, vd, fpst);
+ vfp_store_reg64(vd, a->vd);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(vn);
+ tcg_temp_free_i64(vm);
+ tcg_temp_free_i64(vd);
+
+ return true;
+}
+
+#define MAKE_ONE_VFM_TRANS_FN(INSN, PREC, NEGN, NEGD) \
+ static bool trans_##INSN##_##PREC(DisasContext *s, \
+ arg_##INSN##_##PREC *a) \
+ { \
+ return do_vfm_##PREC(s, a, NEGN, NEGD); \
+ }
+
+#define MAKE_VFM_TRANS_FNS(PREC) \
+ MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \
+ MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \
+ MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \
+ MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true)
+
+MAKE_VFM_TRANS_FNS(hp)
+MAKE_VFM_TRANS_FNS(sp)
+MAKE_VFM_TRANS_FNS(dp)
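As a cross-check of the neg_n/neg_d mapping generated above, the four encodings can be restated in terms of the C99 fma() primitive, with the accumulator argument first as in the do_vfm_* comments. This is an illustrative host-side sketch (link with -lm), not part of the patch:

#include <assert.h>
#include <math.h>

/* Host-side restatement of the four fused variants (illustrative only):
 * the accumulator d comes first, matching the do_vfm_* comment above. */
static double vfma_ref (double d, double n, double m) { return fma( n, m,  d); }
static double vfms_ref (double d, double n, double m) { return fma(-n, m,  d); }
static double vfnma_ref(double d, double n, double m) { return fma( n, m, -d); }
static double vfnms_ref(double d, double n, double m) { return fma(-n, m, -d); }

int main(void)
{
    assert(vfma_ref (10.0, 2.0, 3.0) ==  16.0);  /*  2*3 + 10 */
    assert(vfms_ref (10.0, 2.0, 3.0) ==   4.0);  /* -2*3 + 10 */
    assert(vfnma_ref(10.0, 2.0, 3.0) ==  -4.0);  /*  2*3 - 10 */
    assert(vfnms_ref(10.0, 2.0, 3.0) == -16.0);  /* -2*3 - 10 */
    return 0;
}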
+
+static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a)
+{
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vfp_store_reg32(tcg_constant_i32(vfp_expand_imm(MO_16, a->imm)), a->vd);
+ return true;
+}
+
+static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a)
+{
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i32 fd;
+ uint32_t vd;
+
+ vd = a->vd;
+
+ if (!dc_isar_feature(aa32_fpsp_v3, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_sreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = s->vec_stride + 1;
+ }
+ }
+
+ fd = tcg_constant_i32(vfp_expand_imm(MO_32, a->imm));
+
+ for (;;) {
+ vfp_store_reg32(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_sreg(vd, delta_d);
+ }
+
+ return true;
+}
+
+static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a)
+{
+ uint32_t delta_d = 0;
+ int veclen = s->vec_len;
+ TCGv_i64 fd;
+ uint32_t vd;
+
+ vd = a->vd;
+
+ if (!dc_isar_feature(aa32_fpdp_v3, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (vd & 0x10)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ /* Figure out what type of vector operation this is. */
+ if (vfp_dreg_is_scalar(vd)) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = (s->vec_stride >> 1) + 1;
+ }
+ }
+
+ fd = tcg_constant_i64(vfp_expand_imm(MO_64, a->imm));
+
+ for (;;) {
+ vfp_store_reg64(fd, vd);
+
+ if (veclen == 0) {
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = vfp_advance_dreg(vd, delta_d);
+ }
+
+ return true;
+}
+
+#define DO_VFP_2OP(INSN, PREC, FN, CHECK) \
+ static bool trans_##INSN##_##PREC(DisasContext *s, \
+ arg_##INSN##_##PREC *a) \
+ { \
+ if (!dc_isar_feature(CHECK, s)) { \
+ return false; \
+ } \
+ return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \
+ }
+
+#define DO_VFP_VMOV(INSN, PREC, FN) \
+ static bool trans_##INSN##_##PREC(DisasContext *s, \
+ arg_##INSN##_##PREC *a) \
+ { \
+ if (!dc_isar_feature(aa32_fp##PREC##_v2, s) && \
+ !dc_isar_feature(aa32_mve, s)) { \
+ return false; \
+ } \
+ return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \
+ }
+
+DO_VFP_VMOV(VMOV_reg, sp, tcg_gen_mov_i32)
+DO_VFP_VMOV(VMOV_reg, dp, tcg_gen_mov_i64)
+
+DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh, aa32_fp16_arith)
+DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss, aa32_fpsp_v2)
+DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd, aa32_fpdp_v2)
+
+DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh, aa32_fp16_arith)
+DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs, aa32_fpsp_v2)
+DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd, aa32_fpdp_v2)
+
+static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm)
+{
+ gen_helper_vfp_sqrth(vd, vm, cpu_env);
+}
+
+static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm)
+{
+ gen_helper_vfp_sqrts(vd, vm, cpu_env);
+}
+
+static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm)
+{
+ gen_helper_vfp_sqrtd(vd, vm, cpu_env);
+}
+
+DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp, aa32_fp16_arith)
+DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp, aa32_fpsp_v2)
+DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp, aa32_fpdp_v2)
+
+static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a)
+{
+ TCGv_i32 vd, vm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ /* Vm/M bits must be zero for the Z variant */
+ if (a->z && a->vm != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vd = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+
+ vfp_load_reg32(vd, a->vd);
+ if (a->z) {
+ tcg_gen_movi_i32(vm, 0);
+ } else {
+ vfp_load_reg32(vm, a->vm);
+ }
+
+ if (a->e) {
+ gen_helper_vfp_cmpeh(vd, vm, cpu_env);
+ } else {
+ gen_helper_vfp_cmph(vd, vm, cpu_env);
+ }
+
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i32(vm);
+
+ return true;
+}
+
+static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a)
+{
+ TCGv_i32 vd, vm;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ /* Vm/M bits must be zero for the Z variant */
+ if (a->z && a->vm != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vd = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+
+ vfp_load_reg32(vd, a->vd);
+ if (a->z) {
+ tcg_gen_movi_i32(vm, 0);
+ } else {
+ vfp_load_reg32(vm, a->vm);
+ }
+
+ if (a->e) {
+ gen_helper_vfp_cmpes(vd, vm, cpu_env);
+ } else {
+ gen_helper_vfp_cmps(vd, vm, cpu_env);
+ }
+
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i32(vm);
+
+ return true;
+}
+
+static bool trans_VCMP_dp(DisasContext *s, arg_VCMP_dp *a)
+{
+ TCGv_i64 vd, vm;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* Vm/M bits must be zero for the Z variant */
+ if (a->z && a->vm != 0) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vd = tcg_temp_new_i64();
+ vm = tcg_temp_new_i64();
+
+ vfp_load_reg64(vd, a->vd);
+ if (a->z) {
+ tcg_gen_movi_i64(vm, 0);
+ } else {
+ vfp_load_reg64(vm, a->vm);
+ }
+
+ if (a->e) {
+ gen_helper_vfp_cmped(vd, vm, cpu_env);
+ } else {
+ gen_helper_vfp_cmpd(vd, vm, cpu_env);
+ }
+
+ tcg_temp_free_i64(vd);
+ tcg_temp_free_i64(vm);
+
+ return true;
+}
+
+static bool trans_VCVT_f32_f16(DisasContext *s, arg_VCVT_f32_f16 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp_mode;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_spconv, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ ahp_mode = get_ahp_flag();
+ tmp = tcg_temp_new_i32();
+ /* The T bit tells us if we want the low or high 16 bits of Vm */
+ tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t));
+ gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp_mode);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_i32(ahp_mode);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp_mode;
+ TCGv_i32 tmp;
+ TCGv_i64 vd;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fp16_dpconv, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ ahp_mode = get_ahp_flag();
+ tmp = tcg_temp_new_i32();
+ /* The T bit tells us if we want the low or high 16 bits of Vm */
+ tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t));
+ vd = tcg_temp_new_i64();
+ gen_helper_vfp_fcvt_f16_to_f64(vd, tmp, fpst, ahp_mode);
+ vfp_store_reg64(vd, a->vd);
+ tcg_temp_free_i32(ahp_mode);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i64(vd);
+ return true;
+}
+
+static bool trans_VCVT_b16_f32(DisasContext *s, arg_VCVT_b16_f32 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tmp = tcg_temp_new_i32();
+
+ vfp_load_reg32(tmp, a->vm);
+ gen_helper_bfcvt(tmp, tmp, fpst);
+ tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp_mode;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_spconv, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ ahp_mode = get_ahp_flag();
+ tmp = tcg_temp_new_i32();
+
+ vfp_load_reg32(tmp, a->vm);
+ gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp_mode);
+ tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
+ tcg_temp_free_i32(ahp_mode);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 ahp_mode;
+ TCGv_i32 tmp;
+ TCGv_i64 vm;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fp16_dpconv, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ ahp_mode = get_ahp_flag();
+ tmp = tcg_temp_new_i32();
+ vm = tcg_temp_new_i64();
+
+ vfp_load_reg64(vm, a->vm);
+ gen_helper_vfp_fcvt_f64_to_f16(tmp, vm, fpst, ahp_mode);
+ tcg_temp_free_i64(vm);
+ tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
+ tcg_temp_free_i32(ahp_mode);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth(tmp, tmp, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_rints(tmp, tmp, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i64 tmp;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i64();
+ vfp_load_reg64(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_rintd(tmp, tmp, fpst);
+ vfp_store_reg64(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(tmp);
+ return true;
+}
+
+static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+ TCGv_i32 tcg_rmode;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ tcg_rmode = tcg_const_i32(float_round_to_zero);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_rinth(tmp, tmp, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+ TCGv_i32 tcg_rmode;
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tcg_rmode = tcg_const_i32(float_round_to_zero);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_rints(tmp, tmp, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i64 tmp;
+ TCGv_i32 tcg_rmode;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i64();
+ vfp_load_reg64(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tcg_rmode = tcg_const_i32(float_round_to_zero);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_rintd(tmp, tmp, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ vfp_store_reg64(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i32(tcg_rmode);
+ return true;
+}
+
+static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth_exact(tmp, tmp, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ vfp_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_rints_exact(tmp, tmp, fpst);
+ vfp_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
+static bool trans_VRINTX_dp(DisasContext *s, arg_VRINTX_dp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i64 tmp;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_vrint, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i64();
+ vfp_load_reg64(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_rintd_exact(tmp, tmp, fpst);
+ vfp_store_reg64(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i64(tmp);
+ return true;
+}
+
+static bool trans_VCVT_sp(DisasContext *s, arg_VCVT_sp *a)
+{
+ TCGv_i64 vd;
+ TCGv_i32 vm;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i64();
+ vfp_load_reg32(vm, a->vm);
+ gen_helper_vfp_fcvtds(vd, vm, cpu_env);
+ vfp_store_reg64(vd, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i64(vd);
+ return true;
+}
+
+static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a)
+{
+ TCGv_i64 vm;
+ TCGv_i32 vd;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vd = tcg_temp_new_i32();
+ vm = tcg_temp_new_i64();
+ vfp_load_reg64(vm, a->vm);
+ gen_helper_vfp_fcvtsd(vd, vm, cpu_env);
+ vfp_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i64(vm);
+ return true;
+}
+
+static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i32();
+ vfp_load_reg32(vm, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ if (a->s) {
+ /* i32 -> f16 */
+ gen_helper_vfp_sitoh(vm, vm, fpst);
+ } else {
+ /* u32 -> f16 */
+ gen_helper_vfp_uitoh(vm, vm, fpst);
+ }
+ vfp_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i32();
+ vfp_load_reg32(vm, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ if (a->s) {
+ /* i32 -> f32 */
+ gen_helper_vfp_sitos(vm, vm, fpst);
+ } else {
+ /* u32 -> f32 */
+ gen_helper_vfp_uitos(vm, vm, fpst);
+ }
+ vfp_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a)
+{
+ TCGv_i32 vm;
+ TCGv_i64 vd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i64();
+ vfp_load_reg32(vm, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR);
+ if (a->s) {
+ /* i32 -> f64 */
+ gen_helper_vfp_sitod(vd, vm, fpst);
+ } else {
+ /* u32 -> f64 */
+ gen_helper_vfp_uitod(vd, vm, fpst);
+ }
+ vfp_store_reg64(vd, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i64(vd);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a)
+{
+ TCGv_i32 vd;
+ TCGv_i64 vm;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_jscvt, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i64();
+ vd = tcg_temp_new_i32();
+ vfp_load_reg64(vm, a->vm);
+ gen_helper_vjcvt(vd, vm, cpu_env);
+ vfp_store_reg32(vd, a->vd);
+ tcg_temp_free_i64(vm);
+ tcg_temp_free_i32(vd);
+ return true;
+}
+
+static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a)
+{
+ TCGv_i32 vd, shift;
+ TCGv_ptr fpst;
+ int frac_bits;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
+
+ vd = tcg_temp_new_i32();
+ vfp_load_reg32(vd, a->vd);
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ shift = tcg_constant_i32(frac_bits);
+
+ /* Switch on op:U:sx bits */
+ switch (a->opc) {
+ case 0:
+ gen_helper_vfp_shtoh_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 1:
+ gen_helper_vfp_sltoh_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 2:
+ gen_helper_vfp_uhtoh_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 3:
+ gen_helper_vfp_ultoh_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 4:
+ gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 5:
+ gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 6:
+ gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 7:
+ gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ vfp_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a)
+{
+ TCGv_i32 vd, shift;
+ TCGv_ptr fpst;
+ int frac_bits;
+
+ if (!dc_isar_feature(aa32_fpsp_v3, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
+
+ vd = tcg_temp_new_i32();
+ vfp_load_reg32(vd, a->vd);
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ shift = tcg_constant_i32(frac_bits);
+
+ /* Switch on op:U:sx bits */
+ switch (a->opc) {
+ case 0:
+ gen_helper_vfp_shtos_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 1:
+ gen_helper_vfp_sltos_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 2:
+ gen_helper_vfp_uhtos_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 3:
+ gen_helper_vfp_ultos_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 4:
+ gen_helper_vfp_toshs_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 5:
+ gen_helper_vfp_tosls_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 6:
+ gen_helper_vfp_touhs_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 7:
+ gen_helper_vfp_touls_round_to_zero(vd, vd, shift, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ vfp_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a)
+{
+ TCGv_i64 vd;
+ TCGv_i32 shift;
+ TCGv_ptr fpst;
+ int frac_bits;
+
+ if (!dc_isar_feature(aa32_fpdp_v3, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
+
+ vd = tcg_temp_new_i64();
+ vfp_load_reg64(vd, a->vd);
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ shift = tcg_constant_i32(frac_bits);
+
+ /* Switch on op:U:sx bits */
+ switch (a->opc) {
+ case 0:
+ gen_helper_vfp_shtod_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 1:
+ gen_helper_vfp_sltod_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 2:
+ gen_helper_vfp_uhtod_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 3:
+ gen_helper_vfp_ultod_round_to_nearest(vd, vd, shift, fpst);
+ break;
+ case 4:
+ gen_helper_vfp_toshd_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 5:
+ gen_helper_vfp_tosld_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 6:
+ gen_helper_vfp_touhd_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 7:
+ gen_helper_vfp_tould_round_to_zero(vd, vd, shift, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ vfp_store_reg64(vd, a->vd);
+ tcg_temp_free_i64(vd);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
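The frac_bits expression above recovers the fraction-bit count from the encoded immediate (the instruction stores size minus fracbits, where size is 16 or 32 depending on the sx bit). Assuming the usual fixed-point semantics of these helpers, value = fixed / 2^frac_bits, a minimal host-side illustration (not QEMU code):

#include <assert.h>
#include <stdint.h>

/* Fixed-point to float conversion, value = fixed / 2^frac_bits.
 * Illustrative sketch only; the real work is done by the VFP helpers. */
static float s32_fix_to_float(int32_t fixed, int frac_bits)
{
    return (float)fixed / (float)(1u << frac_bits);
}

int main(void)
{
    /* sx == 1 (32-bit) with encoded imm 24 gives 32 - 24 = 8 fraction bits */
    assert(s32_fix_to_float(0x180, 8) == 1.5f);
    assert(s32_fix_to_float(-0x80, 8) == -0.5f);
    return 0;
}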
+
+static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ vm = tcg_temp_new_i32();
+ vfp_load_reg32(vm, a->vm);
+
+ if (a->s) {
+ if (a->rz) {
+ gen_helper_vfp_tosizh(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_tosih(vm, vm, fpst);
+ }
+ } else {
+ if (a->rz) {
+ gen_helper_vfp_touizh(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_touih(vm, vm, fpst);
+ }
+ }
+ vfp_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ vm = tcg_temp_new_i32();
+ vfp_load_reg32(vm, a->vm);
+
+ if (a->s) {
+ if (a->rz) {
+ gen_helper_vfp_tosizs(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_tosis(vm, vm, fpst);
+ }
+ } else {
+ if (a->rz) {
+ gen_helper_vfp_touizs(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_touis(vm, vm, fpst);
+ }
+ }
+ vfp_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a)
+{
+ TCGv_i32 vd;
+ TCGv_i64 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ vm = tcg_temp_new_i64();
+ vd = tcg_temp_new_i32();
+ vfp_load_reg64(vm, a->vm);
+
+ if (a->s) {
+ if (a->rz) {
+ gen_helper_vfp_tosizd(vd, vm, fpst);
+ } else {
+ gen_helper_vfp_tosid(vd, vm, fpst);
+ }
+ } else {
+ if (a->rz) {
+ gen_helper_vfp_touizd(vd, vm, fpst);
+ } else {
+ gen_helper_vfp_touid(vd, vm, fpst);
+ }
+ }
+ vfp_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i64(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
+static bool trans_VINS(DisasContext *s, arg_VINS *a)
+{
+ TCGv_i32 rd, rm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Insert low half of Vm into high half of Vd */
+ rm = tcg_temp_new_i32();
+ rd = tcg_temp_new_i32();
+ vfp_load_reg32(rm, a->vm);
+ vfp_load_reg32(rd, a->vd);
+ tcg_gen_deposit_i32(rd, rd, rm, 16, 16);
+ vfp_store_reg32(rd, a->vd);
+ tcg_temp_free_i32(rm);
+ tcg_temp_free_i32(rd);
+ return true;
+}
+
+static bool trans_VMOVX(DisasContext *s, arg_VINS *a)
+{
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Set Vd to high half of Vm */
+ rm = tcg_temp_new_i32();
+ vfp_load_reg32(rm, a->vm);
+ tcg_gen_shri_i32(rm, rm, 16);
+ vfp_store_reg32(rm, a->vd);
+ tcg_temp_free_i32(rm);
+ return true;
+}
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
new file mode 100644
index 0000000..c23a346
--- /dev/null
+++ b/target/arm/tcg/translate.c
@@ -0,0 +1,9990 @@
+/*
+ * ARM translation
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2005-2007 CodeSourcery
+ * Copyright (c) 2007 OpenedHand, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "internals.h"
+#include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "qemu/log.h"
+#include "qemu/bitops.h"
+#include "arm_ldst.h"
+#include "semihosting/semihost.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+#include "exec/log.h"
+#include "cpregs.h"
+
+
+#define ENABLE_ARCH_4T arm_dc_feature(s, ARM_FEATURE_V4T)
+#define ENABLE_ARCH_5 arm_dc_feature(s, ARM_FEATURE_V5)
+/* currently all emulated v5 cores are also v5TE, so don't bother */
+#define ENABLE_ARCH_5TE arm_dc_feature(s, ARM_FEATURE_V5)
+#define ENABLE_ARCH_5J dc_isar_feature(aa32_jazelle, s)
+#define ENABLE_ARCH_6 arm_dc_feature(s, ARM_FEATURE_V6)
+#define ENABLE_ARCH_6K arm_dc_feature(s, ARM_FEATURE_V6K)
+#define ENABLE_ARCH_6T2 arm_dc_feature(s, ARM_FEATURE_THUMB2)
+#define ENABLE_ARCH_7 arm_dc_feature(s, ARM_FEATURE_V7)
+#define ENABLE_ARCH_8 arm_dc_feature(s, ARM_FEATURE_V8)
+
+#include "translate.h"
+#include "translate-a32.h"
+
+/* These are TCG temporaries used only by the legacy iwMMXt decoder */
+static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
+/* These are TCG globals which alias CPUARMState fields */
+static TCGv_i32 cpu_R[16];
+TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
+TCGv_i64 cpu_exclusive_addr;
+TCGv_i64 cpu_exclusive_val;
+
+#include "exec/gen-icount.h"
+
+static const char * const regnames[] =
+ { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
+
+
+/* initialize TCG globals. */
+void arm_translate_init(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ cpu_R[i] = tcg_global_mem_new_i32(cpu_env,
+ offsetof(CPUARMState, regs[i]),
+ regnames[i]);
+ }
+ cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF");
+ cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF");
+ cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF");
+ cpu_ZF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, ZF), "ZF");
+
+ cpu_exclusive_addr = tcg_global_mem_new_i64(cpu_env,
+ offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
+ cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env,
+ offsetof(CPUARMState, exclusive_val), "exclusive_val");
+
+ a64_translate_init();
+}
+
+uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
+{
+ /* Expand the encoded constant as per AdvSIMDExpandImm pseudocode */
+ switch (cmode) {
+ case 0: case 1:
+ /* no-op */
+ break;
+ case 2: case 3:
+ imm <<= 8;
+ break;
+ case 4: case 5:
+ imm <<= 16;
+ break;
+ case 6: case 7:
+ imm <<= 24;
+ break;
+ case 8: case 9:
+ imm |= imm << 16;
+ break;
+ case 10: case 11:
+ imm = (imm << 8) | (imm << 24);
+ break;
+ case 12:
+ imm = (imm << 8) | 0xff;
+ break;
+ case 13:
+ imm = (imm << 16) | 0xffff;
+ break;
+ case 14:
+ if (op) {
+ /*
+ * This and cmode == 15 op == 1 are the only cases where
+ * the top and bottom 32 bits of the encoded constant differ.
+ */
+ uint64_t imm64 = 0;
+ int n;
+
+ for (n = 0; n < 8; n++) {
+ if (imm & (1 << n)) {
+ imm64 |= (0xffULL << (n * 8));
+ }
+ }
+ return imm64;
+ }
+ imm |= (imm << 8) | (imm << 16) | (imm << 24);
+ break;
+ case 15:
+ if (op) {
+ /* Reserved encoding for AArch32; valid for AArch64 */
+ uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48;
+ if (imm & 0x80) {
+ imm64 |= 0x8000000000000000ULL;
+ }
+ if (imm & 0x40) {
+ imm64 |= 0x3fc0000000000000ULL;
+ } else {
+ imm64 |= 0x4000000000000000ULL;
+ }
+ return imm64;
+ }
+ imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
+ | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
+ break;
+ }
+ if (op) {
+ imm = ~imm;
+ }
+ return dup_const(MO_32, imm);
+}
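The cmode == 14, op == 1 branch above is the per-byte mask expansion from the AdvSIMDExpandImm pseudocode; a self-contained restatement of just that arithmetic (illustrative only, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Re-derivation of the cmode == 14, op == 1 case: each bit n of the
 * 8-bit immediate selects whether byte n of the result is 0xff or 0x00.
 * Illustrative only, not part of the QEMU sources. */
static uint64_t expand_byte_mask(uint32_t imm)
{
    uint64_t imm64 = 0;

    for (int n = 0; n < 8; n++) {
        if (imm & (1 << n)) {
            imm64 |= 0xffULL << (n * 8);
        }
    }
    return imm64;
}

int main(void)
{
    /* 0b10100101 -> bytes 0, 2, 5 and 7 are set to 0xff */
    assert(expand_byte_mask(0xa5) == 0xff00ff0000ff00ffULL);
    printf("0x%016llx\n", (unsigned long long)expand_byte_mask(0xa5));
    return 0;
}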
+
+/* Generate a label used for skipping this instruction */
+void arm_gen_condlabel(DisasContext *s)
+{
+ if (!s->condjmp) {
+ s->condlabel = gen_disas_label(s);
+ s->condjmp = 1;
+ }
+}
+
+/* Flags for the disas_set_da_iss info argument:
+ * lower bits hold the Rt register number, higher bits are flags.
+ */
+typedef enum ISSInfo {
+ ISSNone = 0,
+ ISSRegMask = 0x1f,
+ ISSInvalid = (1 << 5),
+ ISSIsAcqRel = (1 << 6),
+ ISSIsWrite = (1 << 7),
+ ISSIs16Bit = (1 << 8),
+} ISSInfo;
+
+/*
+ * Store var into env + offset to a member with size bytes.
+ * Free var after use.
+ */
+void store_cpu_offset(TCGv_i32 var, int offset, int size)
+{
+ switch (size) {
+ case 1:
+ tcg_gen_st8_i32(var, cpu_env, offset);
+ break;
+ case 4:
+ tcg_gen_st_i32(var, cpu_env, offset);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i32(var);
+}
+
+/* Save the syndrome information for a Data Abort */
+static void disas_set_da_iss(DisasContext *s, MemOp memop, ISSInfo issinfo)
+{
+ uint32_t syn;
+ int sas = memop & MO_SIZE;
+ bool sse = memop & MO_SIGN;
+ bool is_acqrel = issinfo & ISSIsAcqRel;
+ bool is_write = issinfo & ISSIsWrite;
+ bool is_16bit = issinfo & ISSIs16Bit;
+ int srt = issinfo & ISSRegMask;
+
+ if (issinfo & ISSInvalid) {
+ /* Some callsites want to conditionally provide ISS info,
+ * e.g. "only if this was not a writeback"

+ */
+ return;
+ }
+
+ if (srt == 15) {
+ /* For AArch32, insns where the src/dest is R15 never generate
+ * ISS information. Catching that here saves checking at all
+ * the call sites.
+ */
+ return;
+ }
+
+ syn = syn_data_abort_with_iss(0, sas, sse, srt, 0, is_acqrel,
+ 0, 0, 0, is_write, 0, is_16bit);
+ disas_set_insn_syndrome(s, syn);
+}
+
+static inline int get_a32_user_mem_index(DisasContext *s)
+{
+ /* Return the core mmu_idx to use for A32/T32 "unprivileged load/store"
+ * insns:
+ * if PL2, UNPREDICTABLE (we choose to implement as if PL0)
+ * otherwise, access as if at PL0.
+ */
+ switch (s->mmu_idx) {
+ case ARMMMUIdx_E3:
+ case ARMMMUIdx_E2: /* this one is UNPREDICTABLE */
+ case ARMMMUIdx_E10_0:
+ case ARMMMUIdx_E10_1:
+ case ARMMMUIdx_E10_1_PAN:
+ return arm_to_core_mmu_idx(ARMMMUIdx_E10_0);
+ case ARMMMUIdx_MUser:
+ case ARMMMUIdx_MPriv:
+ return arm_to_core_mmu_idx(ARMMMUIdx_MUser);
+ case ARMMMUIdx_MUserNegPri:
+ case ARMMMUIdx_MPrivNegPri:
+ return arm_to_core_mmu_idx(ARMMMUIdx_MUserNegPri);
+ case ARMMMUIdx_MSUser:
+ case ARMMMUIdx_MSPriv:
+ return arm_to_core_mmu_idx(ARMMMUIdx_MSUser);
+ case ARMMMUIdx_MSUserNegPri:
+ case ARMMMUIdx_MSPrivNegPri:
+ return arm_to_core_mmu_idx(ARMMMUIdx_MSUserNegPri);
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* The pc_curr difference for an architectural jump. */
+static target_long jmp_diff(DisasContext *s, target_long diff)
+{
+ return diff + (s->thumb ? 4 : 8);
+}
+
+static void gen_pc_plus_diff(DisasContext *s, TCGv_i32 var, target_long diff)
+{
+ assert(s->pc_save != -1);
+ if (TARGET_TB_PCREL) {
+ tcg_gen_addi_i32(var, cpu_R[15], (s->pc_curr - s->pc_save) + diff);
+ } else {
+ tcg_gen_movi_i32(var, s->pc_curr + diff);
+ }
+}
+
+/* Set a variable to the value of a CPU register. */
+void load_reg_var(DisasContext *s, TCGv_i32 var, int reg)
+{
+ if (reg == 15) {
+ gen_pc_plus_diff(s, var, jmp_diff(s, 0));
+ } else {
+ tcg_gen_mov_i32(var, cpu_R[reg]);
+ }
+}
+
+/*
+ * Create a new temp, REG + OFS, except PC is ALIGN(PC, 4).
+ * This is used for load/store for which use of PC implies (literal),
+ * or ADD that implies ADR.
+ */
+TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ if (reg == 15) {
+ /*
+ * This address is computed from an aligned PC:
+ * subtract off the low bits.
+ */
+ gen_pc_plus_diff(s, tmp, jmp_diff(s, ofs - (s->pc_curr & 3)));
+ } else {
+ tcg_gen_addi_i32(tmp, cpu_R[reg], ofs);
+ }
+ return tmp;
+}
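jmp_diff() and add_reg_for_lit() together encode the architectural rule that a PC read sees the instruction address plus 8 in ARM state and plus 4 in Thumb state, with literal addressing additionally word-aligning that value. A small host-side restatement of the rule (illustrative only, not QEMU code):

#include <assert.h>
#include <stdint.h>

/* Architectural PC value seen by instructions, and the base address used
 * for PC-relative literal loads. Illustrative restatement only. */
static uint32_t read_pc(uint32_t insn_addr, int thumb)
{
    return insn_addr + (thumb ? 4 : 8);
}

static uint32_t literal_base(uint32_t insn_addr, int thumb)
{
    return read_pc(insn_addr, thumb) & ~3u;
}

int main(void)
{
    assert(read_pc(0x8000, 0) == 0x8008);       /* ARM: PC reads as addr + 8 */
    assert(read_pc(0x8002, 1) == 0x8006);       /* Thumb: addr + 4 */
    assert(literal_base(0x8002, 1) == 0x8004);  /* LDR (literal): word-aligned */
    return 0;
}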
+
+/* Set a CPU register. The source must be a temporary and will be
+ marked as dead. */
+void store_reg(DisasContext *s, int reg, TCGv_i32 var)
+{
+ if (reg == 15) {
+ /* In Thumb mode, we must ignore bit 0.
+ * In ARM mode, for ARMv4 and ARMv5, it is UNPREDICTABLE if bits [1:0]
+ * are not 0b00, but for ARMv6 and above, we must ignore bits [1:0].
+ * We choose to ignore [1:0] in ARM mode for all architecture versions.
+ */
+ tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3);
+ s->base.is_jmp = DISAS_JUMP;
+ s->pc_save = -1;
+ } else if (reg == 13 && arm_dc_feature(s, ARM_FEATURE_M)) {
+ /* For M-profile SP bits [1:0] are always zero */
+ tcg_gen_andi_i32(var, var, ~3);
+ }
+ tcg_gen_mov_i32(cpu_R[reg], var);
+ tcg_temp_free_i32(var);
+}
+
+/*
+ * Variant of store_reg which applies v8M stack-limit checks before updating
+ * SP. If the check fails this will result in an exception being taken.
+ * We disable the stack checks for CONFIG_USER_ONLY because we have
+ * no idea what the stack limits should be in that case.
+ * If stack checking is not being done this just acts like store_reg().
+ */
+static void store_sp_checked(DisasContext *s, TCGv_i32 var)
+{
+#ifndef CONFIG_USER_ONLY
+ if (s->v8m_stackcheck) {
+ gen_helper_v8m_stackcheck(cpu_env, var);
+ }
+#endif
+ store_reg(s, 13, var);
+}
+
+/* Value extensions. */
+#define gen_uxtb(var) tcg_gen_ext8u_i32(var, var)
+#define gen_uxth(var) tcg_gen_ext16u_i32(var, var)
+#define gen_sxtb(var) tcg_gen_ext8s_i32(var, var)
+#define gen_sxth(var) tcg_gen_ext16s_i32(var, var)
+
+#define gen_sxtb16(var) gen_helper_sxtb16(var, var)
+#define gen_uxtb16(var) gen_helper_uxtb16(var, var)
+
+void gen_set_cpsr(TCGv_i32 var, uint32_t mask)
+{
+ gen_helper_cpsr_write(cpu_env, var, tcg_constant_i32(mask));
+}
+
+static void gen_rebuild_hflags(DisasContext *s, bool new_el)
+{
+ bool m_profile = arm_dc_feature(s, ARM_FEATURE_M);
+
+ if (new_el) {
+ if (m_profile) {
+ gen_helper_rebuild_hflags_m32_newel(cpu_env);
+ } else {
+ gen_helper_rebuild_hflags_a32_newel(cpu_env);
+ }
+ } else {
+ TCGv_i32 tcg_el = tcg_constant_i32(s->current_el);
+ if (m_profile) {
+ gen_helper_rebuild_hflags_m32(cpu_env, tcg_el);
+ } else {
+ gen_helper_rebuild_hflags_a32(cpu_env, tcg_el);
+ }
+ }
+}
+
+static void gen_exception_internal(int excp)
+{
+ assert(excp_is_internal(excp));
+ gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp));
+}
+
+static void gen_singlestep_exception(DisasContext *s)
+{
+ /* We just completed a step of an insn. Move from Active-not-pending
+ * to Active-pending, and then also take the swstep exception.
+ * This corresponds to making the (IMPDEF) choice to prioritize
+ * swstep exceptions over asynchronous exceptions taken to an exception
+ * level where debug is disabled. This choice has the advantage that
+ * we do not need to maintain internal state corresponding to the
+ * ISV/EX syndrome bits between completion of the step and generation
+ * of the exception, and our syndrome information is always correct.
+ */
+ gen_ss_advance(s);
+ gen_swstep_exception(s, 1, s->is_ldex);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+void clear_eci_state(DisasContext *s)
+{
+ /*
+ * Clear any ECI/ICI state: used when a load multiple/store
+ * multiple insn executes.
+ */
+ if (s->eci) {
+ store_cpu_field_constant(0, condexec_bits);
+ s->eci = 0;
+ }
+}
+
+static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 tmp1 = tcg_temp_new_i32();
+ TCGv_i32 tmp2 = tcg_temp_new_i32();
+ tcg_gen_ext16s_i32(tmp1, a);
+ tcg_gen_ext16s_i32(tmp2, b);
+ tcg_gen_mul_i32(tmp1, tmp1, tmp2);
+ tcg_temp_free_i32(tmp2);
+ tcg_gen_sari_i32(a, a, 16);
+ tcg_gen_sari_i32(b, b, 16);
+ tcg_gen_mul_i32(b, b, a);
+ tcg_gen_mov_i32(a, tmp1);
+ tcg_temp_free_i32(tmp1);
+}
+
+/* Byteswap each halfword. */
+void gen_rev16(TCGv_i32 dest, TCGv_i32 var)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ TCGv_i32 mask = tcg_constant_i32(0x00ff00ff);
+ tcg_gen_shri_i32(tmp, var, 8);
+ tcg_gen_and_i32(tmp, tmp, mask);
+ tcg_gen_and_i32(var, var, mask);
+ tcg_gen_shli_i32(var, var, 8);
+ tcg_gen_or_i32(dest, var, tmp);
+ tcg_temp_free_i32(tmp);
+}
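The same mask/shift/or sequence used by gen_rev16() can be checked on a host integer; a minimal sketch (not QEMU code):

#include <assert.h>
#include <stdint.h>

/* Byteswap each halfword with the mask/shift/or sequence from gen_rev16.
 * Illustrative only. */
static uint32_t rev16(uint32_t x)
{
    uint32_t hi = (x >> 8) & 0x00ff00ff;
    uint32_t lo = (x & 0x00ff00ff) << 8;
    return hi | lo;
}

int main(void)
{
    assert(rev16(0x11223344) == 0x22114433);
    return 0;
}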
+
+/* Byteswap low halfword and sign extend. */
+static void gen_revsh(TCGv_i32 dest, TCGv_i32 var)
+{
+ tcg_gen_bswap16_i32(var, var, TCG_BSWAP_OS);
+}
+
+/* Dual 16-bit add, placing the result in dest. No carry propagates
+ between the two halfwords. The technique:
+ tmp = (t0 ^ t1) & 0x8000;
+ t0 &= ~0x8000;
+ t1 &= ~0x8000;
+ t0 = (t0 + t1) ^ tmp;
+ */
+
+static void gen_add16(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_xor_i32(tmp, t0, t1);
+ tcg_gen_andi_i32(tmp, tmp, 0x8000);
+ tcg_gen_andi_i32(t0, t0, ~0x8000);
+ tcg_gen_andi_i32(t1, t1, ~0x8000);
+ tcg_gen_add_i32(t0, t0, t1);
+ tcg_gen_xor_i32(dest, t0, tmp);
+ tcg_temp_free_i32(tmp);
+}
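The bit-15 masking trick in gen_add16() is exactly the pseudo-code in the comment above; restated on host integers for a quick check (illustrative only):

#include <assert.h>
#include <stdint.h>

/* Dual 16-bit add with no carry across bit 15, as in gen_add16 above.
 * Illustrative only. */
static uint32_t add16(uint32_t t0, uint32_t t1)
{
    uint32_t tmp = (t0 ^ t1) & 0x8000;
    t0 &= ~0x8000u;
    t1 &= ~0x8000u;
    return (t0 + t1) ^ tmp;
}

int main(void)
{
    /* 0x0001ffff + 0x00010001: the low halves wrap (0xffff + 1 -> 0x0000)
     * without carrying into the high halves (0x0001 + 0x0001 = 0x0002). */
    assert(add16(0x0001ffff, 0x00010001) == 0x00020000);
    return 0;
}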
+
+/* Set N and Z flags from var. */
+static inline void gen_logic_CC(TCGv_i32 var)
+{
+ tcg_gen_mov_i32(cpu_NF, var);
+ tcg_gen_mov_i32(cpu_ZF, var);
+}
+
+/* dest = T0 + T1 + CF. */
+static void gen_add_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ tcg_gen_add_i32(dest, t0, t1);
+ tcg_gen_add_i32(dest, dest, cpu_CF);
+}
+
+/* dest = T0 - T1 + CF - 1. */
+static void gen_sub_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ tcg_gen_sub_i32(dest, t0, t1);
+ tcg_gen_add_i32(dest, dest, cpu_CF);
+ tcg_gen_subi_i32(dest, dest, 1);
+}
+
+/* dest = T0 + T1. Compute C, N, V and Z flags */
+static void gen_add_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_movi_i32(tmp, 0);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, t1, tmp);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
+ tcg_gen_xor_i32(tmp, t0, t1);
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_mov_i32(dest, cpu_NF);
+}
+
+/* dest = T0 + T1 + CF. Compute C, N, V and Z flags */
+static void gen_adc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ if (TCG_TARGET_HAS_add2_i32) {
+ tcg_gen_movi_i32(tmp, 0);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp);
+ tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp);
+ } else {
+ TCGv_i64 q0 = tcg_temp_new_i64();
+ TCGv_i64 q1 = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(q0, t0);
+ tcg_gen_extu_i32_i64(q1, t1);
+ tcg_gen_add_i64(q0, q0, q1);
+ tcg_gen_extu_i32_i64(q1, cpu_CF);
+ tcg_gen_add_i64(q0, q0, q1);
+ tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0);
+ tcg_temp_free_i64(q0);
+ tcg_temp_free_i64(q1);
+ }
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
+ tcg_gen_xor_i32(tmp, t0, t1);
+ tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_mov_i32(dest, cpu_NF);
+}
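Both gen_add_CC() and gen_adc_CC() derive the V flag from (result ^ t0) & ~(t0 ^ t1), i.e. signed overflow happened iff the operands had the same sign and the result's sign differs from it. A host-side check of that identity (illustrative only, not QEMU code):

#include <assert.h>
#include <stdint.h>

/* Signed-overflow flag as computed above: V = bit 31 of (res ^ a) & ~(a ^ b).
 * Illustrative restatement of the identity only. */
static int add_overflows(uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    return (int32_t)((res ^ a) & ~(a ^ b)) < 0;
}

int main(void)
{
    assert(add_overflows(0x7fffffffu, 1));          /* INT_MAX + 1 overflows */
    assert(!add_overflows(0xffffffffu, 1));         /* -1 + 1 does not */
    assert(add_overflows(0x80000000u, 0x80000000u)); /* INT_MIN + INT_MIN */
    return 0;
}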
+
+/* dest = T0 - T1. Compute C, N, V and Z flags */
+static void gen_sub_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp;
+ tcg_gen_sub_i32(cpu_NF, t0, t1);
+ tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+ tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1);
+ tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
+ tmp = tcg_temp_new_i32();
+ tcg_gen_xor_i32(tmp, t0, t1);
+ tcg_gen_and_i32(cpu_VF, cpu_VF, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_gen_mov_i32(dest, cpu_NF);
+}
+
+/* dest = T0 + ~T1 + CF. Compute C, N, V and Z flags */
+static void gen_sbc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_not_i32(tmp, t1);
+ gen_adc_CC(dest, t0, tmp);
+ tcg_temp_free_i32(tmp);
+}
+
+#define GEN_SHIFT(name) \
+static void gen_##name(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) \
+{ \
+ TCGv_i32 tmpd = tcg_temp_new_i32(); \
+ TCGv_i32 tmp1 = tcg_temp_new_i32(); \
+ TCGv_i32 zero = tcg_constant_i32(0); \
+ tcg_gen_andi_i32(tmp1, t1, 0x1f); \
+ tcg_gen_##name##_i32(tmpd, t0, tmp1); \
+ tcg_gen_andi_i32(tmp1, t1, 0xe0); \
+ tcg_gen_movcond_i32(TCG_COND_NE, dest, tmp1, zero, zero, tmpd); \
+ tcg_temp_free_i32(tmpd); \
+ tcg_temp_free_i32(tmp1); \
+}
+GEN_SHIFT(shl)
+GEN_SHIFT(shr)
+#undef GEN_SHIFT
+
+static void gen_sar(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
+{
+ TCGv_i32 tmp1 = tcg_temp_new_i32();
+
+ tcg_gen_andi_i32(tmp1, t1, 0xff);
+ tcg_gen_umin_i32(tmp1, tmp1, tcg_constant_i32(31));
+ tcg_gen_sar_i32(dest, t0, tmp1);
+ tcg_temp_free_i32(tmp1);
+}
+
+static void shifter_out_im(TCGv_i32 var, int shift)
+{
+ tcg_gen_extract_i32(cpu_CF, var, shift, 1);
+}
+
+/* Shift by immediate. Includes special handling for shift == 0. */
+static inline void gen_arm_shift_im(TCGv_i32 var, int shiftop,
+ int shift, int flags)
+{
+ switch (shiftop) {
+ case 0: /* LSL */
+ if (shift != 0) {
+ if (flags)
+ shifter_out_im(var, 32 - shift);
+ tcg_gen_shli_i32(var, var, shift);
+ }
+ break;
+ case 1: /* LSR */
+ if (shift == 0) {
+ if (flags) {
+ tcg_gen_shri_i32(cpu_CF, var, 31);
+ }
+ tcg_gen_movi_i32(var, 0);
+ } else {
+ if (flags)
+ shifter_out_im(var, shift - 1);
+ tcg_gen_shri_i32(var, var, shift);
+ }
+ break;
+ case 2: /* ASR */
+ if (shift == 0)
+ shift = 32;
+ if (flags)
+ shifter_out_im(var, shift - 1);
+ if (shift == 32)
+ shift = 31;
+ tcg_gen_sari_i32(var, var, shift);
+ break;
+ case 3: /* ROR/RRX */
+ if (shift != 0) {
+ if (flags)
+ shifter_out_im(var, shift - 1);
+ tcg_gen_rotri_i32(var, var, shift);
+ break;
+ } else {
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ tcg_gen_shli_i32(tmp, cpu_CF, 31);
+ if (flags)
+ shifter_out_im(var, 0);
+ tcg_gen_shri_i32(var, var, 1);
+ tcg_gen_or_i32(var, var, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+ }
+}
+
+static inline void gen_arm_shift_reg(TCGv_i32 var, int shiftop,
+ TCGv_i32 shift, int flags)
+{
+ if (flags) {
+ switch (shiftop) {
+ case 0: gen_helper_shl_cc(var, cpu_env, var, shift); break;
+ case 1: gen_helper_shr_cc(var, cpu_env, var, shift); break;
+ case 2: gen_helper_sar_cc(var, cpu_env, var, shift); break;
+ case 3: gen_helper_ror_cc(var, cpu_env, var, shift); break;
+ }
+ } else {
+ switch (shiftop) {
+ case 0:
+ gen_shl(var, var, shift);
+ break;
+ case 1:
+ gen_shr(var, var, shift);
+ break;
+ case 2:
+ gen_sar(var, var, shift);
+ break;
+ case 3:
+ tcg_gen_andi_i32(shift, shift, 0x1f);
+ tcg_gen_rotr_i32(var, var, shift);
+ break;
+ }
+ }
+ tcg_temp_free_i32(shift);
+}
+
+/*
+ * Generate a conditional based on ARM condition code cc.
+ * This is common between ARM and Aarch64 targets.
+ */
+void arm_test_cc(DisasCompare *cmp, int cc)
+{
+ TCGv_i32 value;
+ TCGCond cond;
+ bool global = true;
+
+ switch (cc) {
+ case 0: /* eq: Z */
+ case 1: /* ne: !Z */
+ cond = TCG_COND_EQ;
+ value = cpu_ZF;
+ break;
+
+ case 2: /* cs: C */
+ case 3: /* cc: !C */
+ cond = TCG_COND_NE;
+ value = cpu_CF;
+ break;
+
+ case 4: /* mi: N */
+ case 5: /* pl: !N */
+ cond = TCG_COND_LT;
+ value = cpu_NF;
+ break;
+
+ case 6: /* vs: V */
+ case 7: /* vc: !V */
+ cond = TCG_COND_LT;
+ value = cpu_VF;
+ break;
+
+ case 8: /* hi: C && !Z */
+ case 9: /* ls: !C || Z -> !(C && !Z) */
+ cond = TCG_COND_NE;
+ value = tcg_temp_new_i32();
+ global = false;
+ /* CF is 1 for C, so -CF is an all-bits-set mask for C;
+ ZF is non-zero for !Z; so AND the two subexpressions. */
+ tcg_gen_neg_i32(value, cpu_CF);
+ tcg_gen_and_i32(value, value, cpu_ZF);
+ break;
+
+ case 10: /* ge: N == V -> N ^ V == 0 */
+ case 11: /* lt: N != V -> N ^ V != 0 */
+ /* Since we're only interested in the sign bit, == 0 is >= 0. */
+ cond = TCG_COND_GE;
+ value = tcg_temp_new_i32();
+ global = false;
+ tcg_gen_xor_i32(value, cpu_VF, cpu_NF);
+ break;
+
+ case 12: /* gt: !Z && N == V */
+ case 13: /* le: Z || N != V */
+ cond = TCG_COND_NE;
+ value = tcg_temp_new_i32();
+ global = false;
+ /* (N == V) is equal to the sign bit of ~(NF ^ VF). Propagate
+ * the sign bit, then AND with ZF to yield the result. */
+ tcg_gen_xor_i32(value, cpu_VF, cpu_NF);
+ tcg_gen_sari_i32(value, value, 31);
+ tcg_gen_andc_i32(value, cpu_ZF, value);
+ break;
+
+ case 14: /* always */
+ case 15: /* always */
+ /* Use the ALWAYS condition, which will fold early.
+ * It doesn't matter what we use for the value. */
+ cond = TCG_COND_ALWAYS;
+ value = cpu_ZF;
+ goto no_invert;
+
+ default:
+ fprintf(stderr, "Bad condition code 0x%x\n", cc);
+ abort();
+ }
+
+ if (cc & 1) {
+ cond = tcg_invert_cond(cond);
+ }
+
+ no_invert:
+ cmp->cond = cond;
+ cmp->value = value;
+ cmp->value_global = global;
+}
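The branch-free hi and gt evaluations above can be restated with plain integer arithmetic using the same flag conventions (CF is 0/1, ZF is non-zero when Z is clear, NF and VF carry their meaning in the sign bit). An illustrative host-side sketch, not QEMU code:

#include <assert.h>
#include <stdint.h>

/* hi: C && !Z, computed as (-CF) & ZF, as in arm_test_cc above. */
static int cond_hi(uint32_t cf, uint32_t zf)
{
    return ((0u - cf) & zf) != 0;
}

/* gt: !Z && N == V, computed as ZF & ~sign_extend(NF ^ VF). */
static int cond_gt(uint32_t zf, uint32_t nf, uint32_t vf)
{
    uint32_t sign = ((nf ^ vf) & 0x80000000u) ? 0xffffffffu : 0;
    return (zf & ~sign) != 0;
}

int main(void)
{
    assert(cond_hi(1, 5));                 /* C set, Z clear  -> hi */
    assert(!cond_hi(1, 0));                /* C set, Z set    -> not hi */
    assert(!cond_hi(0, 5));                /* C clear         -> not hi */
    assert(cond_gt(1, 0, 0));              /* Z clear, N == V -> gt */
    assert(!cond_gt(1, 0x80000000u, 0));   /* N != V          -> not gt */
    assert(!cond_gt(0, 0, 0));             /* Z set           -> not gt */
    return 0;
}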
+
+void arm_free_cc(DisasCompare *cmp)
+{
+ if (!cmp->value_global) {
+ tcg_temp_free_i32(cmp->value);
+ }
+}
+
+void arm_jump_cc(DisasCompare *cmp, TCGLabel *label)
+{
+ tcg_gen_brcondi_i32(cmp->cond, cmp->value, 0, label);
+}
+
+void arm_gen_test_cc(int cc, TCGLabel *label)
+{
+ DisasCompare cmp;
+ arm_test_cc(&cmp, cc);
+ arm_jump_cc(&cmp, label);
+ arm_free_cc(&cmp);
+}
+
+void gen_set_condexec(DisasContext *s)
+{
+ if (s->condexec_mask) {
+ uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1);
+
+ store_cpu_field_constant(val, condexec_bits);
+ }
+}
+
+void gen_update_pc(DisasContext *s, target_long diff)
+{
+ gen_pc_plus_diff(s, cpu_R[15], diff);
+ s->pc_save = s->pc_curr + diff;
+}
+
+/* Set PC and Thumb state from var. var is marked as dead. */
+static inline void gen_bx(DisasContext *s, TCGv_i32 var)
+{
+ s->base.is_jmp = DISAS_JUMP;
+ tcg_gen_andi_i32(cpu_R[15], var, ~1);
+ tcg_gen_andi_i32(var, var, 1);
+ store_cpu_field(var, thumb);
+ s->pc_save = -1;
+}
+
+/*
+ * Set PC and Thumb state from var. var is marked as dead.
+ * For M-profile CPUs, include logic to detect exception-return
+ * branches and handle them. This is needed for Thumb POP/LDM to PC, LDR to PC,
+ * and BX reg, and no others, and happens only for code in Handler mode.
+ * The Security Extension also requires us to check for the FNC_RETURN
+ * which signals a function return from non-secure state; this can happen
+ * in both Handler and Thread mode.
+ * To avoid having to do multiple comparisons in inline generated code,
+ * we make the check we do here loose, so it will match for EXC_RETURN
+ * in Thread mode. For system emulation do_v7m_exception_exit() checks
+ * for these spurious cases and returns without doing anything (giving
+ * the same behaviour as for a branch to a non-magic address).
+ *
+ * In linux-user mode it is unclear what the right behaviour for an
+ * attempted FNC_RETURN should be, because in real hardware this will go
+ * directly to Secure code (ie not the Linux kernel) which will then treat
+ * the error in any way it chooses. For QEMU we opt to make the FNC_RETURN
+ * attempt behave the way it would on a CPU without the security extension,
+ * which is to say "like a normal branch". That means we can simply treat
+ * all branches as normal with no magic address behaviour.
+ */
+static inline void gen_bx_excret(DisasContext *s, TCGv_i32 var)
+{
+ /* Generate the same code here as for a simple bx, but flag via
+ * s->base.is_jmp that we need to do the rest of the work later.
+ */
+ gen_bx(s, var);
+#ifndef CONFIG_USER_ONLY
+ if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY) ||
+ (s->v7m_handler_mode && arm_dc_feature(s, ARM_FEATURE_M))) {
+ s->base.is_jmp = DISAS_BX_EXCRET;
+ }
+#endif
+}
+
+static inline void gen_bx_excret_final_code(DisasContext *s)
+{
+ /* Generate the code to finish possible exception return and end the TB */
+ DisasLabel excret_label = gen_disas_label(s);
+ uint32_t min_magic;
+
+ if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY)) {
+ /* Covers FNC_RETURN and EXC_RETURN magic */
+ min_magic = FNC_RETURN_MIN_MAGIC;
+ } else {
+ /* EXC_RETURN magic only */
+ min_magic = EXC_RETURN_MIN_MAGIC;
+ }
+
+ /* Is the new PC value in the magic range indicating exception return? */
+ tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label.label);
+ /* No: end the TB as we would for a DISAS_JMP */
+ if (s->ss_active) {
+ gen_singlestep_exception(s);
+ } else {
+ tcg_gen_exit_tb(NULL, 0);
+ }
+ set_disas_label(s, excret_label);
+ /* Yes: this is an exception return.
+ * At this point in runtime env->regs[15] and env->thumb will hold
+ * the exception-return magic number, which do_v7m_exception_exit()
+ * will read. Nothing else will be able to see those values because
+ * the cpu-exec main loop guarantees that we will always go straight
+ * from raising the exception to the exception-handling code.
+ *
+ * gen_ss_advance(s) does nothing on M profile currently but
+ * calling it is conceptually the right thing as we have executed
+ * this instruction (compare SWI, HVC, SMC handling).
+ */
+ gen_ss_advance(s);
+ gen_exception_internal(EXCP_EXCEPTION_EXIT);
+}
+
+static inline void gen_bxns(DisasContext *s, int rm)
+{
+ TCGv_i32 var = load_reg(s, rm);
+
+ /* The bxns helper may raise an EXCEPTION_EXIT exception, so in theory
+ * we need to sync state before calling it, but:
+ * - we don't need to do gen_update_pc() because the bxns helper will
+ * always set the PC itself
+ * - we don't need to do gen_set_condexec() because BXNS is UNPREDICTABLE
+ * unless it's outside an IT block or the last insn in an IT block,
+ * so we know that condexec == 0 (already set at the top of the TB)
+ * is correct in the non-UNPREDICTABLE cases, and we can choose
+ * "zeroes the IT bits" as our UNPREDICTABLE behaviour otherwise.
+ */
+ gen_helper_v7m_bxns(cpu_env, var);
+ tcg_temp_free_i32(var);
+ s->base.is_jmp = DISAS_EXIT;
+}
+
+static inline void gen_blxns(DisasContext *s, int rm)
+{
+ TCGv_i32 var = load_reg(s, rm);
+
+ /* We don't need to sync condexec state, for the same reason as bxns.
+ * We do however need to set the PC, because the blxns helper reads it.
+ * The blxns helper may throw an exception.
+ */
+ gen_update_pc(s, curr_insn_len(s));
+ gen_helper_v7m_blxns(cpu_env, var);
+ tcg_temp_free_i32(var);
+ s->base.is_jmp = DISAS_EXIT;
+}
+
+/* Variant of store_reg which uses branch&exchange logic when storing
+ to r15 in ARM architecture v7 and above. The source must be a temporary
+ and will be marked as dead. */
+static inline void store_reg_bx(DisasContext *s, int reg, TCGv_i32 var)
+{
+ if (reg == 15 && ENABLE_ARCH_7) {
+ gen_bx(s, var);
+ } else {
+ store_reg(s, reg, var);
+ }
+}
+
+/* Variant of store_reg which uses branch&exchange logic when storing
+ * to r15 in ARM architecture v5T and above. This is used for storing
+ * the results of a LDR/LDM/POP into r15, and corresponds to the cases
+ * in the ARM ARM which use the LoadWritePC() pseudocode function. */
+static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var)
+{
+ if (reg == 15 && ENABLE_ARCH_5) {
+ gen_bx_excret(s, var);
+ } else {
+ store_reg(s, reg, var);
+ }
+}
+
+#ifdef CONFIG_USER_ONLY
+#define IS_USER_ONLY 1
+#else
+#define IS_USER_ONLY 0
+#endif
+
+MemOp pow2_align(unsigned i)
+{
+ static const MemOp mop_align[] = {
+ 0, MO_ALIGN_2, MO_ALIGN_4, MO_ALIGN_8, MO_ALIGN_16,
+ /*
+ * FIXME: TARGET_PAGE_BITS_MIN affects TLB_FLAGS_MASK such
+ * that 256-bit alignment (MO_ALIGN_32) cannot be supported:
+ * see get_alignment_bits(). Enforce only 128-bit alignment for now.
+ */
+ MO_ALIGN_16
+ };
+ g_assert(i < ARRAY_SIZE(mop_align));
+ return mop_align[i];
+}
+
+/*
+ * Abstractions of "generate code to do a guest load/store for
+ * AArch32", where a vaddr is always 32 bits (and is zero
+ * extended if we're a 64 bit core) and data is also
+ * 32 bits unless specifically doing a 64 bit access.
+ * These functions work like tcg_gen_qemu_{ld,st}* except
+ * that the address argument is TCGv_i32 rather than TCGv.
+ */
+
+static TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, MemOp op)
+{
+ TCGv addr = tcg_temp_new();
+ tcg_gen_extu_i32_tl(addr, a32);
+
+ /* Not needed for user-mode BE32, where we use MO_BE instead. */
+ if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) {
+ tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE)));
+ }
+ return addr;
+}
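The XOR in gen_aa32_addr() is the standard BE32 address adjustment: sub-word accesses within an aligned word have their low address bits flipped by (4 - access_size). Restated on plain integers for illustration (not QEMU code):

#include <assert.h>
#include <stdint.h>

/* BE32 address adjustment as in gen_aa32_addr: for accesses narrower
 * than a word, XOR the address with (4 - access_size). Illustrative only. */
static uint32_t be32_adjust(uint32_t addr, unsigned size_bytes)
{
    if (size_bytes < 4) {
        addr ^= 4 - size_bytes;
    }
    return addr;
}

int main(void)
{
    /* Byte accesses swap within the word: offsets 0,1,2,3 -> 3,2,1,0 */
    assert(be32_adjust(0x1000, 1) == 0x1003);
    assert(be32_adjust(0x1003, 1) == 0x1000);
    /* Halfword accesses swap the two halves of the word */
    assert(be32_adjust(0x1000, 2) == 0x1002);
    /* Word accesses are unchanged */
    assert(be32_adjust(0x1000, 4) == 0x1000);
    return 0;
}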
+
+/*
+ * Internal routines are used for NEON cases where the endianness
+ * and/or alignment has already been taken into account and manipulated.
+ */
+void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val,
+ TCGv_i32 a32, int index, MemOp opc)
+{
+ TCGv addr = gen_aa32_addr(s, a32, opc);
+ tcg_gen_qemu_ld_i32(val, addr, index, opc);
+ tcg_temp_free(addr);
+}
+
+void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val,
+ TCGv_i32 a32, int index, MemOp opc)
+{
+ TCGv addr = gen_aa32_addr(s, a32, opc);
+ tcg_gen_qemu_st_i32(val, addr, index, opc);
+ tcg_temp_free(addr);
+}
+
+void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val,
+ TCGv_i32 a32, int index, MemOp opc)
+{
+ TCGv addr = gen_aa32_addr(s, a32, opc);
+
+ tcg_gen_qemu_ld_i64(val, addr, index, opc);
+
+ /* Not needed for user-mode BE32, where we use MO_BE instead. */
+ if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) {
+ tcg_gen_rotri_i64(val, val, 32);
+ }
+ tcg_temp_free(addr);
+}
+
+void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val,
+ TCGv_i32 a32, int index, MemOp opc)
+{
+ TCGv addr = gen_aa32_addr(s, a32, opc);
+
+ /* Not needed for user-mode BE32, where we use MO_BE instead. */
+ if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) {
+ TCGv_i64 tmp = tcg_temp_new_i64();
+ tcg_gen_rotri_i64(tmp, val, 32);
+ tcg_gen_qemu_st_i64(tmp, addr, index, opc);
+ tcg_temp_free_i64(tmp);
+ } else {
+ tcg_gen_qemu_st_i64(val, addr, index, opc);
+ }
+ tcg_temp_free(addr);
+}
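+
+/*
+ * For the 64-bit BE32 cases above, the rotation swaps the two 32-bit
+ * halves of the value before the store (and after the load), e.g.
+ * 0x1122334455667788 becomes 0x5566778811223344, which yields the
+ * word-invariant ordering that BE32 requires for doubleword accesses.
+ */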
+
+void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+ int index, MemOp opc)
+{
+ gen_aa32_ld_internal_i32(s, val, a32, index, finalize_memop(s, opc));
+}
+
+void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+ int index, MemOp opc)
+{
+ gen_aa32_st_internal_i32(s, val, a32, index, finalize_memop(s, opc));
+}
+
+void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+ int index, MemOp opc)
+{
+ gen_aa32_ld_internal_i64(s, val, a32, index, finalize_memop(s, opc));
+}
+
+void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+ int index, MemOp opc)
+{
+ gen_aa32_st_internal_i64(s, val, a32, index, finalize_memop(s, opc));
+}
+
+#define DO_GEN_LD(SUFF, OPC) \
+ static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \
+ TCGv_i32 a32, int index) \
+ { \
+ gen_aa32_ld_i32(s, val, a32, index, OPC); \
+ }
+
+#define DO_GEN_ST(SUFF, OPC) \
+ static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \
+ TCGv_i32 a32, int index) \
+ { \
+ gen_aa32_st_i32(s, val, a32, index, OPC); \
+ }
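+
+/*
+ * As an illustration of the macros above, DO_GEN_LD(16u, MO_UW)
+ * expands to:
+ *
+ *   static inline void gen_aa32_ld16u(DisasContext *s, TCGv_i32 val,
+ *                                     TCGv_i32 a32, int index)
+ *   {
+ *       gen_aa32_ld_i32(s, val, a32, index, MO_UW);
+ *   }
+ *
+ * which is the form of helper used by the iwMMXt load/store code
+ * further down.
+ */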
+
+static inline void gen_hvc(DisasContext *s, int imm16)
+{
+ /* The pre HVC helper handles cases when HVC gets trapped
+ * as an undefined insn by runtime configuration (ie before
+ * the insn really executes).
+ */
+ gen_update_pc(s, 0);
+ gen_helper_pre_hvc(cpu_env);
+ /* Otherwise we will treat this as a real exception which
+ * happens after execution of the insn. (The distinction matters
+ * for the PC value reported to the exception handler and also
+ * for single stepping.)
+ */
+ s->svc_imm = imm16;
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_HVC;
+}
+
+static inline void gen_smc(DisasContext *s)
+{
+ /* As with HVC, we may take an exception either before or after
+ * the insn executes.
+ */
+ gen_update_pc(s, 0);
+ gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa32_smc()));
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_SMC;
+}
+
+static void gen_exception_internal_insn(DisasContext *s, int excp)
+{
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ gen_exception_internal(excp);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+static void gen_exception_el_v(int excp, uint32_t syndrome, TCGv_i32 tcg_el)
+{
+ gen_helper_exception_with_syndrome_el(cpu_env, tcg_constant_i32(excp),
+ tcg_constant_i32(syndrome), tcg_el);
+}
+
+static void gen_exception_el(int excp, uint32_t syndrome, uint32_t target_el)
+{
+ gen_exception_el_v(excp, syndrome, tcg_constant_i32(target_el));
+}
+
+static void gen_exception(int excp, uint32_t syndrome)
+{
+ gen_helper_exception_with_syndrome(cpu_env, tcg_constant_i32(excp),
+ tcg_constant_i32(syndrome));
+}
+
+static void gen_exception_insn_el_v(DisasContext *s, target_long pc_diff,
+ int excp, uint32_t syn, TCGv_i32 tcg_el)
+{
+ if (s->aarch64) {
+ gen_a64_update_pc(s, pc_diff);
+ } else {
+ gen_set_condexec(s);
+ gen_update_pc(s, pc_diff);
+ }
+ gen_exception_el_v(excp, syn, tcg_el);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp,
+ uint32_t syn, uint32_t target_el)
+{
+ gen_exception_insn_el_v(s, pc_diff, excp, syn,
+ tcg_constant_i32(target_el));
+}
+
+void gen_exception_insn(DisasContext *s, target_long pc_diff,
+ int excp, uint32_t syn)
+{
+ if (s->aarch64) {
+ gen_a64_update_pc(s, pc_diff);
+ } else {
+ gen_set_condexec(s);
+ gen_update_pc(s, pc_diff);
+ }
+ gen_exception(excp, syn);
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn)
+{
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syn));
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+void unallocated_encoding(DisasContext *s)
+{
+ /* Unallocated and reserved encodings are uncategorized */
+ gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized());
+}
+
+/* Force a TB lookup after an instruction that changes the CPU state. */
+void gen_lookup_tb(DisasContext *s)
+{
+ gen_pc_plus_diff(s, cpu_R[15], curr_insn_len(s));
+ s->base.is_jmp = DISAS_EXIT;
+}
+
+static inline void gen_hlt(DisasContext *s, int imm)
+{
+ /* HLT. This has two purposes.
+ * Architecturally, it is an external halting debug instruction.
+ * Since QEMU doesn't implement external debug, we treat this as
+ * the architecture requires when halting debug is disabled: it will UNDEF.
+ * Secondly, "HLT 0x3C" is a T32 semihosting trap instruction,
+ * and "HLT 0xF000" is an A32 semihosting syscall. These traps
+ * must trigger semihosting even for ARMv7 and earlier, where
+ * HLT was an undefined encoding.
+ * In system mode, we don't allow userspace access to
+ * semihosting, to provide some semblance of security
+ * (and for consistency with our 32-bit semihosting).
+ */
+ if (semihosting_enabled(s->current_el == 0) &&
+ (imm == (s->thumb ? 0x3c : 0xf000))) {
+ gen_exception_internal_insn(s, EXCP_SEMIHOST);
+ return;
+ }
+
+ unallocated_encoding(s);
+}
+
+/*
+ * Return the offset of a "full" NEON Dreg.
+ */
+long neon_full_reg_offset(unsigned reg)
+{
+ return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]);
+}
+
+/*
+ * Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
+ * where 0 is the least significant end of the register.
+ */
+long neon_element_offset(int reg, int element, MemOp memop)
+{
+ int element_size = 1 << (memop & MO_SIZE);
+ int ofs = element * element_size;
+#if HOST_BIG_ENDIAN
+ /*
+ * Calculate the offset assuming fully little-endian,
+ * then XOR to account for the order of the 8-byte units.
+ */
+ if (element_size < 8) {
+ ofs ^= 8 - element_size;
+ }
+#endif
+ return neon_full_reg_offset(reg) + ofs;
+}
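+
+/*
+ * For example, on a big-endian host the MO_16 element 2 of a Dreg
+ * (bits [47:32] of the 64-bit value) is found at byte offset 2 of the
+ * 8-byte unit rather than offset 4: ofs = 2 * 2 = 4, then 4 ^ (8 - 2) = 2.
+ * On little-endian hosts the offset is used unchanged.
+ */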
+
+/* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */
+long vfp_reg_offset(bool dp, unsigned reg)
+{
+ if (dp) {
+ return neon_element_offset(reg, 0, MO_64);
+ } else {
+ return neon_element_offset(reg >> 1, reg & 1, MO_32);
+ }
+}
+
+void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
+{
+ long off = neon_element_offset(reg, ele, memop);
+
+ switch (memop) {
+ case MO_SB:
+ tcg_gen_ld8s_i32(dest, cpu_env, off);
+ break;
+ case MO_UB:
+ tcg_gen_ld8u_i32(dest, cpu_env, off);
+ break;
+ case MO_SW:
+ tcg_gen_ld16s_i32(dest, cpu_env, off);
+ break;
+ case MO_UW:
+ tcg_gen_ld16u_i32(dest, cpu_env, off);
+ break;
+ case MO_UL:
+ case MO_SL:
+ tcg_gen_ld_i32(dest, cpu_env, off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop)
+{
+ long off = neon_element_offset(reg, ele, memop);
+
+ switch (memop) {
+ case MO_SL:
+ tcg_gen_ld32s_i64(dest, cpu_env, off);
+ break;
+ case MO_UL:
+ tcg_gen_ld32u_i64(dest, cpu_env, off);
+ break;
+ case MO_UQ:
+ tcg_gen_ld_i64(dest, cpu_env, off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop)
+{
+ long off = neon_element_offset(reg, ele, memop);
+
+ switch (memop) {
+ case MO_8:
+ tcg_gen_st8_i32(src, cpu_env, off);
+ break;
+ case MO_16:
+ tcg_gen_st16_i32(src, cpu_env, off);
+ break;
+ case MO_32:
+ tcg_gen_st_i32(src, cpu_env, off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop)
+{
+ long off = neon_element_offset(reg, ele, memop);
+
+ switch (memop) {
+ case MO_32:
+ tcg_gen_st32_i64(src, cpu_env, off);
+ break;
+ case MO_64:
+ tcg_gen_st_i64(src, cpu_env, off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+#define ARM_CP_RW_BIT (1 << 20)
+
+static inline void iwmmxt_load_reg(TCGv_i64 var, int reg)
+{
+ tcg_gen_ld_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg]));
+}
+
+static inline void iwmmxt_store_reg(TCGv_i64 var, int reg)
+{
+ tcg_gen_st_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg]));
+}
+
+static inline TCGv_i32 iwmmxt_load_creg(int reg)
+{
+ TCGv_i32 var = tcg_temp_new_i32();
+ tcg_gen_ld_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg]));
+ return var;
+}
+
+static inline void iwmmxt_store_creg(int reg, TCGv_i32 var)
+{
+ tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg]));
+ tcg_temp_free_i32(var);
+}
+
+static inline void gen_op_iwmmxt_movq_wRn_M0(int rn)
+{
+ iwmmxt_store_reg(cpu_M0, rn);
+}
+
+static inline void gen_op_iwmmxt_movq_M0_wRn(int rn)
+{
+ iwmmxt_load_reg(cpu_M0, rn);
+}
+
+static inline void gen_op_iwmmxt_orq_M0_wRn(int rn)
+{
+ iwmmxt_load_reg(cpu_V1, rn);
+ tcg_gen_or_i64(cpu_M0, cpu_M0, cpu_V1);
+}
+
+static inline void gen_op_iwmmxt_andq_M0_wRn(int rn)
+{
+ iwmmxt_load_reg(cpu_V1, rn);
+ tcg_gen_and_i64(cpu_M0, cpu_M0, cpu_V1);
+}
+
+static inline void gen_op_iwmmxt_xorq_M0_wRn(int rn)
+{
+ iwmmxt_load_reg(cpu_V1, rn);
+ tcg_gen_xor_i64(cpu_M0, cpu_M0, cpu_V1);
+}
+
+#define IWMMXT_OP(name) \
+static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \
+{ \
+ iwmmxt_load_reg(cpu_V1, rn); \
+ gen_helper_iwmmxt_##name(cpu_M0, cpu_M0, cpu_V1); \
+}
+
+#define IWMMXT_OP_ENV(name) \
+static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \
+{ \
+ iwmmxt_load_reg(cpu_V1, rn); \
+ gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0, cpu_V1); \
+}
+
+#define IWMMXT_OP_ENV_SIZE(name) \
+IWMMXT_OP_ENV(name##b) \
+IWMMXT_OP_ENV(name##w) \
+IWMMXT_OP_ENV(name##l)
+
+#define IWMMXT_OP_ENV1(name) \
+static inline void gen_op_iwmmxt_##name##_M0(void) \
+{ \
+ gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0); \
+}
+
+IWMMXT_OP(maddsq)
+IWMMXT_OP(madduq)
+IWMMXT_OP(sadb)
+IWMMXT_OP(sadw)
+IWMMXT_OP(mulslw)
+IWMMXT_OP(mulshw)
+IWMMXT_OP(mululw)
+IWMMXT_OP(muluhw)
+IWMMXT_OP(macsw)
+IWMMXT_OP(macuw)
+
+IWMMXT_OP_ENV_SIZE(unpackl)
+IWMMXT_OP_ENV_SIZE(unpackh)
+
+IWMMXT_OP_ENV1(unpacklub)
+IWMMXT_OP_ENV1(unpackluw)
+IWMMXT_OP_ENV1(unpacklul)
+IWMMXT_OP_ENV1(unpackhub)
+IWMMXT_OP_ENV1(unpackhuw)
+IWMMXT_OP_ENV1(unpackhul)
+IWMMXT_OP_ENV1(unpacklsb)
+IWMMXT_OP_ENV1(unpacklsw)
+IWMMXT_OP_ENV1(unpacklsl)
+IWMMXT_OP_ENV1(unpackhsb)
+IWMMXT_OP_ENV1(unpackhsw)
+IWMMXT_OP_ENV1(unpackhsl)
+
+IWMMXT_OP_ENV_SIZE(cmpeq)
+IWMMXT_OP_ENV_SIZE(cmpgtu)
+IWMMXT_OP_ENV_SIZE(cmpgts)
+
+IWMMXT_OP_ENV_SIZE(mins)
+IWMMXT_OP_ENV_SIZE(minu)
+IWMMXT_OP_ENV_SIZE(maxs)
+IWMMXT_OP_ENV_SIZE(maxu)
+
+IWMMXT_OP_ENV_SIZE(subn)
+IWMMXT_OP_ENV_SIZE(addn)
+IWMMXT_OP_ENV_SIZE(subu)
+IWMMXT_OP_ENV_SIZE(addu)
+IWMMXT_OP_ENV_SIZE(subs)
+IWMMXT_OP_ENV_SIZE(adds)
+
+IWMMXT_OP_ENV(avgb0)
+IWMMXT_OP_ENV(avgb1)
+IWMMXT_OP_ENV(avgw0)
+IWMMXT_OP_ENV(avgw1)
+
+IWMMXT_OP_ENV(packuw)
+IWMMXT_OP_ENV(packul)
+IWMMXT_OP_ENV(packuq)
+IWMMXT_OP_ENV(packsw)
+IWMMXT_OP_ENV(packsl)
+IWMMXT_OP_ENV(packsq)
+
+static void gen_op_iwmmxt_set_mup(void)
+{
+ TCGv_i32 tmp;
+ tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]);
+ tcg_gen_ori_i32(tmp, tmp, 2);
+ store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]);
+}
+
+static void gen_op_iwmmxt_set_cup(void)
+{
+ TCGv_i32 tmp;
+ tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]);
+ tcg_gen_ori_i32(tmp, tmp, 1);
+ store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]);
+}
+
+static void gen_op_iwmmxt_setpsr_nz(void)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ gen_helper_iwmmxt_setpsr_nz(tmp, cpu_M0);
+ store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCASF]);
+}
+
+static inline void gen_op_iwmmxt_addl_M0_wRn(int rn)
+{
+ iwmmxt_load_reg(cpu_V1, rn);
+ tcg_gen_ext32u_i64(cpu_V1, cpu_V1);
+ tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1);
+}
+
+static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn,
+ TCGv_i32 dest)
+{
+ int rd;
+ uint32_t offset;
+ TCGv_i32 tmp;
+
+ rd = (insn >> 16) & 0xf;
+ tmp = load_reg(s, rd);
+
+ offset = (insn & 0xff) << ((insn >> 7) & 2);
+ if (insn & (1 << 24)) {
+ /* Pre indexed */
+ if (insn & (1 << 23))
+ tcg_gen_addi_i32(tmp, tmp, offset);
+ else
+ tcg_gen_addi_i32(tmp, tmp, -offset);
+ tcg_gen_mov_i32(dest, tmp);
+ if (insn & (1 << 21))
+ store_reg(s, rd, tmp);
+ else
+ tcg_temp_free_i32(tmp);
+ } else if (insn & (1 << 21)) {
+ /* Post indexed */
+ tcg_gen_mov_i32(dest, tmp);
+ if (insn & (1 << 23))
+ tcg_gen_addi_i32(tmp, tmp, offset);
+ else
+ tcg_gen_addi_i32(tmp, tmp, -offset);
+ store_reg(s, rd, tmp);
+ } else if (!(insn & (1 << 23)))
+ return 1;
+ return 0;
+}
+
+static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest)
+{
+ int rd = (insn >> 0) & 0xf;
+ TCGv_i32 tmp;
+
+ if (insn & (1 << 8)) {
+ if (rd < ARM_IWMMXT_wCGR0 || rd > ARM_IWMMXT_wCGR3) {
+ return 1;
+ } else {
+ tmp = iwmmxt_load_creg(rd);
+ }
+ } else {
+ tmp = tcg_temp_new_i32();
+ iwmmxt_load_reg(cpu_V0, rd);
+ tcg_gen_extrl_i64_i32(tmp, cpu_V0);
+ }
+ tcg_gen_andi_i32(tmp, tmp, mask);
+ tcg_gen_mov_i32(dest, tmp);
+ tcg_temp_free_i32(tmp);
+ return 0;
+}
+
+/* Disassemble an iwMMXt instruction. Returns nonzero if an error occurred
+ (ie. an undefined instruction). */
+static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
+{
+ int rd, wrd;
+ int rdhi, rdlo, rd0, rd1, i;
+ TCGv_i32 addr;
+ TCGv_i32 tmp, tmp2, tmp3;
+
+ if ((insn & 0x0e000e00) == 0x0c000000) {
+ if ((insn & 0x0fe00ff0) == 0x0c400000) {
+ wrd = insn & 0xf;
+ rdlo = (insn >> 12) & 0xf;
+ rdhi = (insn >> 16) & 0xf;
+ if (insn & ARM_CP_RW_BIT) { /* TMRRC */
+ iwmmxt_load_reg(cpu_V0, wrd);
+ tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0);
+ tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0);
+ } else { /* TMCRR */
+ tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]);
+ iwmmxt_store_reg(cpu_V0, wrd);
+ gen_op_iwmmxt_set_mup();
+ }
+ return 0;
+ }
+
+ wrd = (insn >> 12) & 0xf;
+ addr = tcg_temp_new_i32();
+ if (gen_iwmmxt_address(s, insn, addr)) {
+ tcg_temp_free_i32(addr);
+ return 1;
+ }
+ if (insn & ARM_CP_RW_BIT) {
+ if ((insn >> 28) == 0xf) { /* WLDRW wCx */
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
+ iwmmxt_store_creg(wrd, tmp);
+ } else {
+ i = 1;
+ if (insn & (1 << 8)) {
+ if (insn & (1 << 22)) { /* WLDRD */
+ gen_aa32_ld64(s, cpu_M0, addr, get_mem_index(s));
+ i = 0;
+ } else { /* WLDRW wRd */
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
+ }
+ } else {
+ tmp = tcg_temp_new_i32();
+ if (insn & (1 << 22)) { /* WLDRH */
+ gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
+ } else { /* WLDRB */
+ gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
+ }
+ }
+ if (i) {
+ tcg_gen_extu_i32_i64(cpu_M0, tmp);
+ tcg_temp_free_i32(tmp);
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ }
+ } else {
+ if ((insn >> 28) == 0xf) { /* WSTRW wCx */
+ tmp = iwmmxt_load_creg(wrd);
+ gen_aa32_st32(s, tmp, addr, get_mem_index(s));
+ } else {
+ gen_op_iwmmxt_movq_M0_wRn(wrd);
+ tmp = tcg_temp_new_i32();
+ if (insn & (1 << 8)) {
+ if (insn & (1 << 22)) { /* WSTRD */
+ gen_aa32_st64(s, cpu_M0, addr, get_mem_index(s));
+ } else { /* WSTRW wRd */
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ gen_aa32_st32(s, tmp, addr, get_mem_index(s));
+ }
+ } else {
+ if (insn & (1 << 22)) { /* WSTRH */
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ gen_aa32_st16(s, tmp, addr, get_mem_index(s));
+ } else { /* WSTRB */
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ gen_aa32_st8(s, tmp, addr, get_mem_index(s));
+ }
+ }
+ }
+ tcg_temp_free_i32(tmp);
+ }
+ tcg_temp_free_i32(addr);
+ return 0;
+ }
+
+ if ((insn & 0x0f000000) != 0x0e000000)
+ return 1;
+
+ switch (((insn >> 12) & 0xf00) | ((insn >> 4) & 0xff)) {
+ case 0x000: /* WOR */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 0) & 0xf;
+ rd1 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ gen_op_iwmmxt_orq_M0_wRn(rd1);
+ gen_op_iwmmxt_setpsr_nz();
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x011: /* TMCR */
+ if (insn & 0xf)
+ return 1;
+ rd = (insn >> 12) & 0xf;
+ wrd = (insn >> 16) & 0xf;
+ switch (wrd) {
+ case ARM_IWMMXT_wCID:
+ case ARM_IWMMXT_wCASF:
+ break;
+ case ARM_IWMMXT_wCon:
+ gen_op_iwmmxt_set_cup();
+ /* Fall through. */
+ case ARM_IWMMXT_wCSSF:
+ tmp = iwmmxt_load_creg(wrd);
+ tmp2 = load_reg(s, rd);
+ tcg_gen_andc_i32(tmp, tmp, tmp2);
+ tcg_temp_free_i32(tmp2);
+ iwmmxt_store_creg(wrd, tmp);
+ break;
+ case ARM_IWMMXT_wCGR0:
+ case ARM_IWMMXT_wCGR1:
+ case ARM_IWMMXT_wCGR2:
+ case ARM_IWMMXT_wCGR3:
+ gen_op_iwmmxt_set_cup();
+ tmp = load_reg(s, rd);
+ iwmmxt_store_creg(wrd, tmp);
+ break;
+ default:
+ return 1;
+ }
+ break;
+ case 0x100: /* WXOR */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 0) & 0xf;
+ rd1 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ gen_op_iwmmxt_xorq_M0_wRn(rd1);
+ gen_op_iwmmxt_setpsr_nz();
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x111: /* TMRC */
+ if (insn & 0xf)
+ return 1;
+ rd = (insn >> 12) & 0xf;
+ wrd = (insn >> 16) & 0xf;
+ tmp = iwmmxt_load_creg(wrd);
+ store_reg(s, rd, tmp);
+ break;
+ case 0x300: /* WANDN */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 0) & 0xf;
+ rd1 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tcg_gen_neg_i64(cpu_M0, cpu_M0);
+ gen_op_iwmmxt_andq_M0_wRn(rd1);
+ gen_op_iwmmxt_setpsr_nz();
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x200: /* WAND */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 0) & 0xf;
+ rd1 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ gen_op_iwmmxt_andq_M0_wRn(rd1);
+ gen_op_iwmmxt_setpsr_nz();
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x810: case 0xa10: /* WMADD */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 0) & 0xf;
+ rd1 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_maddsq_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_madduq_M0_wRn(rd1);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x10e: case 0x50e: case 0x90e: case 0xd0e: /* WUNPCKIL */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ gen_op_iwmmxt_unpacklb_M0_wRn(rd1);
+ break;
+ case 1:
+ gen_op_iwmmxt_unpacklw_M0_wRn(rd1);
+ break;
+ case 2:
+ gen_op_iwmmxt_unpackll_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x10c: case 0x50c: case 0x90c: case 0xd0c: /* WUNPCKIH */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ gen_op_iwmmxt_unpackhb_M0_wRn(rd1);
+ break;
+ case 1:
+ gen_op_iwmmxt_unpackhw_M0_wRn(rd1);
+ break;
+ case 2:
+ gen_op_iwmmxt_unpackhl_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x012: case 0x112: case 0x412: case 0x512: /* WSAD */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ if (insn & (1 << 22))
+ gen_op_iwmmxt_sadw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_sadb_M0_wRn(rd1);
+ if (!(insn & (1 << 20)))
+ gen_op_iwmmxt_addl_M0_wRn(wrd);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x010: case 0x110: case 0x210: case 0x310: /* WMUL */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ if (insn & (1 << 21)) {
+ if (insn & (1 << 20))
+ gen_op_iwmmxt_mulshw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_mulslw_M0_wRn(rd1);
+ } else {
+ if (insn & (1 << 20))
+ gen_op_iwmmxt_muluhw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_mululw_M0_wRn(rd1);
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x410: case 0x510: case 0x610: case 0x710: /* WMAC */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_macsw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_macuw_M0_wRn(rd1);
+ if (!(insn & (1 << 20))) {
+ iwmmxt_load_reg(cpu_V1, wrd);
+ tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1);
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x006: case 0x406: case 0x806: case 0xc06: /* WCMPEQ */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ gen_op_iwmmxt_cmpeqb_M0_wRn(rd1);
+ break;
+ case 1:
+ gen_op_iwmmxt_cmpeqw_M0_wRn(rd1);
+ break;
+ case 2:
+ gen_op_iwmmxt_cmpeql_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x800: case 0x900: case 0xc00: case 0xd00: /* WAVG2 */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ if (insn & (1 << 22)) {
+ if (insn & (1 << 20))
+ gen_op_iwmmxt_avgw1_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_avgw0_M0_wRn(rd1);
+ } else {
+ if (insn & (1 << 20))
+ gen_op_iwmmxt_avgb1_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_avgb0_M0_wRn(rd1);
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x802: case 0x902: case 0xa02: case 0xb02: /* WALIGNR */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = iwmmxt_load_creg(ARM_IWMMXT_wCGR0 + ((insn >> 20) & 3));
+ tcg_gen_andi_i32(tmp, tmp, 7);
+ iwmmxt_load_reg(cpu_V1, rd1);
+ gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, tmp);
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x601: case 0x605: case 0x609: case 0x60d: /* TINSR */
+ if (((insn >> 6) & 3) == 3)
+ return 1;
+ rd = (insn >> 12) & 0xf;
+ wrd = (insn >> 16) & 0xf;
+ tmp = load_reg(s, rd);
+ gen_op_iwmmxt_movq_M0_wRn(wrd);
+ switch ((insn >> 6) & 3) {
+ case 0:
+ tmp2 = tcg_constant_i32(0xff);
+ tmp3 = tcg_constant_i32((insn & 7) << 3);
+ break;
+ case 1:
+ tmp2 = tcg_constant_i32(0xffff);
+ tmp3 = tcg_constant_i32((insn & 3) << 4);
+ break;
+ case 2:
+ tmp2 = tcg_constant_i32(0xffffffff);
+ tmp3 = tcg_constant_i32((insn & 1) << 5);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ gen_helper_iwmmxt_insr(cpu_M0, cpu_M0, tmp, tmp2, tmp3);
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x107: case 0x507: case 0x907: case 0xd07: /* TEXTRM */
+ rd = (insn >> 12) & 0xf;
+ wrd = (insn >> 16) & 0xf;
+ if (rd == 15 || ((insn >> 22) & 3) == 3)
+ return 1;
+ gen_op_iwmmxt_movq_M0_wRn(wrd);
+ tmp = tcg_temp_new_i32();
+ switch ((insn >> 22) & 3) {
+ case 0:
+ tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 7) << 3);
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ if (insn & 8) {
+ tcg_gen_ext8s_i32(tmp, tmp);
+ } else {
+ tcg_gen_andi_i32(tmp, tmp, 0xff);
+ }
+ break;
+ case 1:
+ tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 3) << 4);
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ if (insn & 8) {
+ tcg_gen_ext16s_i32(tmp, tmp);
+ } else {
+ tcg_gen_andi_i32(tmp, tmp, 0xffff);
+ }
+ break;
+ case 2:
+ tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 1) << 5);
+ tcg_gen_extrl_i64_i32(tmp, cpu_M0);
+ break;
+ }
+ store_reg(s, rd, tmp);
+ break;
+ case 0x117: case 0x517: case 0x917: case 0xd17: /* TEXTRC */
+ if ((insn & 0x000ff008) != 0x0003f000 || ((insn >> 22) & 3) == 3)
+ return 1;
+ tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ tcg_gen_shri_i32(tmp, tmp, ((insn & 7) << 2) + 0);
+ break;
+ case 1:
+ tcg_gen_shri_i32(tmp, tmp, ((insn & 3) << 3) + 4);
+ break;
+ case 2:
+ tcg_gen_shri_i32(tmp, tmp, ((insn & 1) << 4) + 12);
+ break;
+ }
+ tcg_gen_shli_i32(tmp, tmp, 28);
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp);
+ break;
+ case 0x401: case 0x405: case 0x409: case 0x40d: /* TBCST */
+ if (((insn >> 6) & 3) == 3)
+ return 1;
+ rd = (insn >> 12) & 0xf;
+ wrd = (insn >> 16) & 0xf;
+ tmp = load_reg(s, rd);
+ switch ((insn >> 6) & 3) {
+ case 0:
+ gen_helper_iwmmxt_bcstb(cpu_M0, tmp);
+ break;
+ case 1:
+ gen_helper_iwmmxt_bcstw(cpu_M0, tmp);
+ break;
+ case 2:
+ gen_helper_iwmmxt_bcstl(cpu_M0, tmp);
+ break;
+ }
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x113: case 0x513: case 0x913: case 0xd13: /* TANDC */
+ if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3)
+ return 1;
+ tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF);
+ tmp2 = tcg_temp_new_i32();
+ tcg_gen_mov_i32(tmp2, tmp);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ for (i = 0; i < 7; i ++) {
+ tcg_gen_shli_i32(tmp2, tmp2, 4);
+ tcg_gen_and_i32(tmp, tmp, tmp2);
+ }
+ break;
+ case 1:
+ for (i = 0; i < 3; i ++) {
+ tcg_gen_shli_i32(tmp2, tmp2, 8);
+ tcg_gen_and_i32(tmp, tmp, tmp2);
+ }
+ break;
+ case 2:
+ tcg_gen_shli_i32(tmp2, tmp2, 16);
+ tcg_gen_and_i32(tmp, tmp, tmp2);
+ break;
+ }
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp);
+ break;
+ case 0x01c: case 0x41c: case 0x81c: case 0xc1c: /* WACC */
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ gen_helper_iwmmxt_addcb(cpu_M0, cpu_M0);
+ break;
+ case 1:
+ gen_helper_iwmmxt_addcw(cpu_M0, cpu_M0);
+ break;
+ case 2:
+ gen_helper_iwmmxt_addcl(cpu_M0, cpu_M0);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x115: case 0x515: case 0x915: case 0xd15: /* TORC */
+ if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3)
+ return 1;
+ tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF);
+ tmp2 = tcg_temp_new_i32();
+ tcg_gen_mov_i32(tmp2, tmp);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ for (i = 0; i < 7; i ++) {
+ tcg_gen_shli_i32(tmp2, tmp2, 4);
+ tcg_gen_or_i32(tmp, tmp, tmp2);
+ }
+ break;
+ case 1:
+ for (i = 0; i < 3; i ++) {
+ tcg_gen_shli_i32(tmp2, tmp2, 8);
+ tcg_gen_or_i32(tmp, tmp, tmp2);
+ }
+ break;
+ case 2:
+ tcg_gen_shli_i32(tmp2, tmp2, 16);
+ tcg_gen_or_i32(tmp, tmp, tmp2);
+ break;
+ }
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp);
+ break;
+ case 0x103: case 0x503: case 0x903: case 0xd03: /* TMOVMSK */
+ rd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ if ((insn & 0xf) != 0 || ((insn >> 22) & 3) == 3)
+ return 1;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_temp_new_i32();
+ switch ((insn >> 22) & 3) {
+ case 0:
+ gen_helper_iwmmxt_msbb(tmp, cpu_M0);
+ break;
+ case 1:
+ gen_helper_iwmmxt_msbw(tmp, cpu_M0);
+ break;
+ case 2:
+ gen_helper_iwmmxt_msbl(tmp, cpu_M0);
+ break;
+ }
+ store_reg(s, rd, tmp);
+ break;
+ case 0x106: case 0x306: case 0x506: case 0x706: /* WCMPGT */
+ case 0x906: case 0xb06: case 0xd06: case 0xf06:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_cmpgtsb_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_cmpgtub_M0_wRn(rd1);
+ break;
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_cmpgtsw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_cmpgtuw_M0_wRn(rd1);
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_cmpgtsl_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_cmpgtul_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x00e: case 0x20e: case 0x40e: case 0x60e: /* WUNPCKEL */
+ case 0x80e: case 0xa0e: case 0xc0e: case 0xe0e:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpacklsb_M0();
+ else
+ gen_op_iwmmxt_unpacklub_M0();
+ break;
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpacklsw_M0();
+ else
+ gen_op_iwmmxt_unpackluw_M0();
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpacklsl_M0();
+ else
+ gen_op_iwmmxt_unpacklul_M0();
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x00c: case 0x20c: case 0x40c: case 0x60c: /* WUNPCKEH */
+ case 0x80c: case 0xa0c: case 0xc0c: case 0xe0c:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpackhsb_M0();
+ else
+ gen_op_iwmmxt_unpackhub_M0();
+ break;
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpackhsw_M0();
+ else
+ gen_op_iwmmxt_unpackhuw_M0();
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_unpackhsl_M0();
+ else
+ gen_op_iwmmxt_unpackhul_M0();
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x204: case 0x604: case 0xa04: case 0xe04: /* WSRL */
+ case 0x214: case 0x614: case 0xa14: case 0xe14:
+ if (((insn >> 22) & 3) == 0)
+ return 1;
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_temp_new_i32();
+ if (gen_iwmmxt_shift(insn, 0xff, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ switch ((insn >> 22) & 3) {
+ case 1:
+ gen_helper_iwmmxt_srlw(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 2:
+ gen_helper_iwmmxt_srll(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 3:
+ gen_helper_iwmmxt_srlq(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ }
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x004: case 0x404: case 0x804: case 0xc04: /* WSRA */
+ case 0x014: case 0x414: case 0x814: case 0xc14:
+ if (((insn >> 22) & 3) == 0)
+ return 1;
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_temp_new_i32();
+ if (gen_iwmmxt_shift(insn, 0xff, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ switch ((insn >> 22) & 3) {
+ case 1:
+ gen_helper_iwmmxt_sraw(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 2:
+ gen_helper_iwmmxt_sral(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 3:
+ gen_helper_iwmmxt_sraq(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ }
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x104: case 0x504: case 0x904: case 0xd04: /* WSLL */
+ case 0x114: case 0x514: case 0x914: case 0xd14:
+ if (((insn >> 22) & 3) == 0)
+ return 1;
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_temp_new_i32();
+ if (gen_iwmmxt_shift(insn, 0xff, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ switch ((insn >> 22) & 3) {
+ case 1:
+ gen_helper_iwmmxt_sllw(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 2:
+ gen_helper_iwmmxt_slll(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 3:
+ gen_helper_iwmmxt_sllq(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ }
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x304: case 0x704: case 0xb04: case 0xf04: /* WROR */
+ case 0x314: case 0x714: case 0xb14: case 0xf14:
+ if (((insn >> 22) & 3) == 0)
+ return 1;
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_temp_new_i32();
+ switch ((insn >> 22) & 3) {
+ case 1:
+ if (gen_iwmmxt_shift(insn, 0xf, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ gen_helper_iwmmxt_rorw(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 2:
+ if (gen_iwmmxt_shift(insn, 0x1f, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ gen_helper_iwmmxt_rorl(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ case 3:
+ if (gen_iwmmxt_shift(insn, 0x3f, tmp)) {
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ gen_helper_iwmmxt_rorq(cpu_M0, cpu_env, cpu_M0, tmp);
+ break;
+ }
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x116: case 0x316: case 0x516: case 0x716: /* WMIN */
+ case 0x916: case 0xb16: case 0xd16: case 0xf16:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_minsb_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_minub_M0_wRn(rd1);
+ break;
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_minsw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_minuw_M0_wRn(rd1);
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_minsl_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_minul_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x016: case 0x216: case 0x416: case 0x616: /* WMAX */
+ case 0x816: case 0xa16: case 0xc16: case 0xe16:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 0:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_maxsb_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_maxub_M0_wRn(rd1);
+ break;
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_maxsw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_maxuw_M0_wRn(rd1);
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_maxsl_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_maxul_M0_wRn(rd1);
+ break;
+ case 3:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x002: case 0x102: case 0x202: case 0x302: /* WALIGNI */
+ case 0x402: case 0x502: case 0x602: case 0x702:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ iwmmxt_load_reg(cpu_V1, rd1);
+ gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1,
+ tcg_constant_i32((insn >> 20) & 3));
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ case 0x01a: case 0x11a: case 0x21a: case 0x31a: /* WSUB */
+ case 0x41a: case 0x51a: case 0x61a: case 0x71a:
+ case 0x81a: case 0x91a: case 0xa1a: case 0xb1a:
+ case 0xc1a: case 0xd1a: case 0xe1a: case 0xf1a:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 20) & 0xf) {
+ case 0x0:
+ gen_op_iwmmxt_subnb_M0_wRn(rd1);
+ break;
+ case 0x1:
+ gen_op_iwmmxt_subub_M0_wRn(rd1);
+ break;
+ case 0x3:
+ gen_op_iwmmxt_subsb_M0_wRn(rd1);
+ break;
+ case 0x4:
+ gen_op_iwmmxt_subnw_M0_wRn(rd1);
+ break;
+ case 0x5:
+ gen_op_iwmmxt_subuw_M0_wRn(rd1);
+ break;
+ case 0x7:
+ gen_op_iwmmxt_subsw_M0_wRn(rd1);
+ break;
+ case 0x8:
+ gen_op_iwmmxt_subnl_M0_wRn(rd1);
+ break;
+ case 0x9:
+ gen_op_iwmmxt_subul_M0_wRn(rd1);
+ break;
+ case 0xb:
+ gen_op_iwmmxt_subsl_M0_wRn(rd1);
+ break;
+ default:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x01e: case 0x11e: case 0x21e: case 0x31e: /* WSHUFH */
+ case 0x41e: case 0x51e: case 0x61e: case 0x71e:
+ case 0x81e: case 0x91e: case 0xa1e: case 0xb1e:
+ case 0xc1e: case 0xd1e: case 0xe1e: case 0xf1e:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ tmp = tcg_constant_i32(((insn >> 16) & 0xf0) | (insn & 0x0f));
+ gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x018: case 0x118: case 0x218: case 0x318: /* WADD */
+ case 0x418: case 0x518: case 0x618: case 0x718:
+ case 0x818: case 0x918: case 0xa18: case 0xb18:
+ case 0xc18: case 0xd18: case 0xe18: case 0xf18:
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 20) & 0xf) {
+ case 0x0:
+ gen_op_iwmmxt_addnb_M0_wRn(rd1);
+ break;
+ case 0x1:
+ gen_op_iwmmxt_addub_M0_wRn(rd1);
+ break;
+ case 0x3:
+ gen_op_iwmmxt_addsb_M0_wRn(rd1);
+ break;
+ case 0x4:
+ gen_op_iwmmxt_addnw_M0_wRn(rd1);
+ break;
+ case 0x5:
+ gen_op_iwmmxt_adduw_M0_wRn(rd1);
+ break;
+ case 0x7:
+ gen_op_iwmmxt_addsw_M0_wRn(rd1);
+ break;
+ case 0x8:
+ gen_op_iwmmxt_addnl_M0_wRn(rd1);
+ break;
+ case 0x9:
+ gen_op_iwmmxt_addul_M0_wRn(rd1);
+ break;
+ case 0xb:
+ gen_op_iwmmxt_addsl_M0_wRn(rd1);
+ break;
+ default:
+ return 1;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x008: case 0x108: case 0x208: case 0x308: /* WPACK */
+ case 0x408: case 0x508: case 0x608: case 0x708:
+ case 0x808: case 0x908: case 0xa08: case 0xb08:
+ case 0xc08: case 0xd08: case 0xe08: case 0xf08:
+ if (!(insn & (1 << 20)) || ((insn >> 22) & 3) == 0)
+ return 1;
+ wrd = (insn >> 12) & 0xf;
+ rd0 = (insn >> 16) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ gen_op_iwmmxt_movq_M0_wRn(rd0);
+ switch ((insn >> 22) & 3) {
+ case 1:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_packsw_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_packuw_M0_wRn(rd1);
+ break;
+ case 2:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_packsl_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_packul_M0_wRn(rd1);
+ break;
+ case 3:
+ if (insn & (1 << 21))
+ gen_op_iwmmxt_packsq_M0_wRn(rd1);
+ else
+ gen_op_iwmmxt_packuq_M0_wRn(rd1);
+ break;
+ }
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ gen_op_iwmmxt_set_cup();
+ break;
+ case 0x201: case 0x203: case 0x205: case 0x207:
+ case 0x209: case 0x20b: case 0x20d: case 0x20f:
+ case 0x211: case 0x213: case 0x215: case 0x217:
+ case 0x219: case 0x21b: case 0x21d: case 0x21f:
+ wrd = (insn >> 5) & 0xf;
+ rd0 = (insn >> 12) & 0xf;
+ rd1 = (insn >> 0) & 0xf;
+ if (rd0 == 0xf || rd1 == 0xf)
+ return 1;
+ gen_op_iwmmxt_movq_M0_wRn(wrd);
+ tmp = load_reg(s, rd0);
+ tmp2 = load_reg(s, rd1);
+ switch ((insn >> 16) & 0xf) {
+ case 0x0: /* TMIA */
+ gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ case 0x8: /* TMIAPH */
+ gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ case 0xc: case 0xd: case 0xe: case 0xf: /* TMIAxy */
+ if (insn & (1 << 16))
+ tcg_gen_shri_i32(tmp, tmp, 16);
+ if (insn & (1 << 17))
+ tcg_gen_shri_i32(tmp2, tmp2, 16);
+ gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ default:
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp);
+ return 1;
+ }
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp);
+ gen_op_iwmmxt_movq_wRn_M0(wrd);
+ gen_op_iwmmxt_set_mup();
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Disassemble an XScale DSP instruction. Returns nonzero if an error occurred
+ (ie. an undefined instruction). */
+static int disas_dsp_insn(DisasContext *s, uint32_t insn)
+{
+ int acc, rd0, rd1, rdhi, rdlo;
+ TCGv_i32 tmp, tmp2;
+
+ if ((insn & 0x0ff00f10) == 0x0e200010) {
+ /* Multiply with Internal Accumulate Format */
+ rd0 = (insn >> 12) & 0xf;
+ rd1 = insn & 0xf;
+ acc = (insn >> 5) & 7;
+
+ if (acc != 0)
+ return 1;
+
+ tmp = load_reg(s, rd0);
+ tmp2 = load_reg(s, rd1);
+ switch ((insn >> 16) & 0xf) {
+ case 0x0: /* MIA */
+ gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ case 0x8: /* MIAPH */
+ gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ case 0xc: /* MIABB */
+ case 0xd: /* MIABT */
+ case 0xe: /* MIATB */
+ case 0xf: /* MIATT */
+ if (insn & (1 << 16))
+ tcg_gen_shri_i32(tmp, tmp, 16);
+ if (insn & (1 << 17))
+ tcg_gen_shri_i32(tmp2, tmp2, 16);
+ gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2);
+ break;
+ default:
+ return 1;
+ }
+ tcg_temp_free_i32(tmp2);
+ tcg_temp_free_i32(tmp);
+
+ gen_op_iwmmxt_movq_wRn_M0(acc);
+ return 0;
+ }
+
+ if ((insn & 0x0fe00ff8) == 0x0c400000) {
+ /* Internal Accumulator Access Format */
+ rdhi = (insn >> 16) & 0xf;
+ rdlo = (insn >> 12) & 0xf;
+ acc = insn & 7;
+
+ if (acc != 0)
+ return 1;
+
+ if (insn & ARM_CP_RW_BIT) { /* MRA */
+ iwmmxt_load_reg(cpu_V0, acc);
+ tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0);
+ tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0);
+ tcg_gen_andi_i32(cpu_R[rdhi], cpu_R[rdhi], (1 << (40 - 32)) - 1);
+ } else { /* MAR */
+ tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]);
+ iwmmxt_store_reg(cpu_V0, acc);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+static void gen_goto_ptr(void)
+{
+ tcg_gen_lookup_and_goto_ptr();
+}
+
+/* This will end the TB but doesn't guarantee we'll return to
+ * cpu_loop_exec. Any live exit_requests will be processed as we
+ * enter the next TB.
+ */
+static void gen_goto_tb(DisasContext *s, int n, target_long diff)
+{
+ if (translator_use_goto_tb(&s->base, s->pc_curr + diff)) {
+ /*
+ * For pcrel, the pc must always be up-to-date on entry to
+ * the linked TB, so that it can use simple additions for all
+ * further adjustments. For !pcrel, the linked TB is compiled
+ * to know its full virtual address, so we can delay the
+ * update to pc to the unlinked path. A long chain of links
+ * can thus avoid many updates to the PC.
+ */
+ if (TARGET_TB_PCREL) {
+ gen_update_pc(s, diff);
+ tcg_gen_goto_tb(n);
+ } else {
+ tcg_gen_goto_tb(n);
+ gen_update_pc(s, diff);
+ }
+ tcg_gen_exit_tb(s->base.tb, n);
+ } else {
+ gen_update_pc(s, diff);
+ gen_goto_ptr();
+ }
+ s->base.is_jmp = DISAS_NORETURN;
+}
+
+/* Jump, specifying which TB number to use if we gen_goto_tb() */
+static void gen_jmp_tb(DisasContext *s, target_long diff, int tbno)
+{
+ if (unlikely(s->ss_active)) {
+ /* An indirect jump so that we still trigger the debug exception. */
+ gen_update_pc(s, diff);
+ s->base.is_jmp = DISAS_JUMP;
+ return;
+ }
+ switch (s->base.is_jmp) {
+ case DISAS_NEXT:
+ case DISAS_TOO_MANY:
+ case DISAS_NORETURN:
+ /*
+ * The normal case: just go to the destination TB.
+ * NB: NORETURN happens if we generate code like
+ * gen_brcondi(l);
+ * gen_jmp();
+ * gen_set_label(l);
+ * gen_jmp();
+ * on the second call to gen_jmp().
+ */
+ gen_goto_tb(s, tbno, diff);
+ break;
+ case DISAS_UPDATE_NOCHAIN:
+ case DISAS_UPDATE_EXIT:
+ /*
+ * We already decided we're leaving the TB for some other reason.
+ * Avoid using goto_tb so we really do exit back to the main loop
+ * and don't chain to another TB.
+ */
+ gen_update_pc(s, diff);
+ gen_goto_ptr();
+ s->base.is_jmp = DISAS_NORETURN;
+ break;
+ default:
+ /*
+ * We shouldn't be emitting code for a jump and also have
+ * is_jmp set to one of the special cases like DISAS_SWI.
+ */
+ g_assert_not_reached();
+ }
+}
+
+static inline void gen_jmp(DisasContext *s, target_long diff)
+{
+ gen_jmp_tb(s, diff, 0);
+}
+
+static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y)
+{
+ if (x)
+ tcg_gen_sari_i32(t0, t0, 16);
+ else
+ gen_sxth(t0);
+ if (y)
+ tcg_gen_sari_i32(t1, t1, 16);
+ else
+ gen_sxth(t1);
+ tcg_gen_mul_i32(t0, t0, t1);
+}
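+
+/*
+ * For example, gen_mulxy(t0, t1, 0, 1) sign-extends the bottom
+ * halfword of t0 and the top halfword of t1 before multiplying,
+ * i.e. the bottom-by-top product used by the SMULxy/SMLAxy family.
+ */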
+
+/* Return the mask of PSR bits set by a MSR instruction. */
+static uint32_t msr_mask(DisasContext *s, int flags, int spsr)
+{
+ uint32_t mask = 0;
+
+ if (flags & (1 << 0)) {
+ mask |= 0xff;
+ }
+ if (flags & (1 << 1)) {
+ mask |= 0xff00;
+ }
+ if (flags & (1 << 2)) {
+ mask |= 0xff0000;
+ }
+ if (flags & (1 << 3)) {
+ mask |= 0xff000000;
+ }
+
+ /* Mask out undefined and reserved bits. */
+ mask &= aarch32_cpsr_valid_mask(s->features, s->isar);
+
+ /* Mask out execution state. */
+ if (!spsr) {
+ mask &= ~CPSR_EXEC;
+ }
+
+ /* Mask out privileged bits. */
+ if (IS_USER(s)) {
+ mask &= CPSR_USER;
+ }
+ return mask;
+}
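+
+/*
+ * For example, an MSR that names only the "c" field (flags == 1)
+ * starts from a mask of 0x000000ff, while "fsxc" (flags == 0xf)
+ * starts from 0xffffffff, before the validity, execution-state and
+ * privilege filtering removes bits the current mode may not write.
+ */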
+
+/* Returns nonzero if access to the PSR is not permitted. Marks t0 as dead. */
+static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv_i32 t0)
+{
+ TCGv_i32 tmp;
+ if (spsr) {
+ /* ??? This is also undefined in system mode. */
+ if (IS_USER(s))
+ return 1;
+
+ tmp = load_cpu_field(spsr);
+ tcg_gen_andi_i32(tmp, tmp, ~mask);
+ tcg_gen_andi_i32(t0, t0, mask);
+ tcg_gen_or_i32(tmp, tmp, t0);
+ store_cpu_field(tmp, spsr);
+ } else {
+ gen_set_cpsr(t0, mask);
+ }
+ tcg_temp_free_i32(t0);
+ gen_lookup_tb(s);
+ return 0;
+}
+
+/* Returns nonzero if access to the PSR is not permitted. */
+static int gen_set_psr_im(DisasContext *s, uint32_t mask, int spsr, uint32_t val)
+{
+ TCGv_i32 tmp;
+ tmp = tcg_temp_new_i32();
+ tcg_gen_movi_i32(tmp, val);
+ return gen_set_psr(s, mask, spsr, tmp);
+}
+
+static bool msr_banked_access_decode(DisasContext *s, int r, int sysm, int rn,
+ int *tgtmode, int *regno)
+{
+ /* Decode the r and sysm fields of MSR/MRS banked accesses into
+ * the target mode and register number, and identify the various
+ * unpredictable cases.
+ * MSR (banked) and MRS (banked) are CONSTRAINED UNPREDICTABLE if:
+ * + executed in user mode
+ * + using R15 as the src/dest register
+ * + accessing an unimplemented register
+ * + accessing a register that's inaccessible at the current PL/security state
+ * + accessing a register that you could access with a different insn
+ * We choose to UNDEF in all these cases.
+ * Since we don't know which of the various AArch32 modes we are in
+ * we have to defer some checks to runtime.
+ * Accesses to Monitor mode registers from Secure EL1 (which implies
+ * that EL3 is AArch64) must trap to EL3.
+ *
+ * If the access checks fail this function will emit code to take
+ * an exception and return false. Otherwise it will return true,
+ * and set *tgtmode and *regno appropriately.
+ */
+ /* These instructions are present only in ARMv8, or in ARMv7 with the
+ * Virtualization Extensions.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8) &&
+ !arm_dc_feature(s, ARM_FEATURE_EL2)) {
+ goto undef;
+ }
+
+ if (IS_USER(s) || rn == 15) {
+ goto undef;
+ }
+
+ /* The table in the v8 ARM ARM section F5.2.3 describes the encoding
+ * of registers into (r, sysm).
+ */
+ if (r) {
+ /* SPSRs for other modes */
+ switch (sysm) {
+ case 0xe: /* SPSR_fiq */
+ *tgtmode = ARM_CPU_MODE_FIQ;
+ break;
+ case 0x10: /* SPSR_irq */
+ *tgtmode = ARM_CPU_MODE_IRQ;
+ break;
+ case 0x12: /* SPSR_svc */
+ *tgtmode = ARM_CPU_MODE_SVC;
+ break;
+ case 0x14: /* SPSR_abt */
+ *tgtmode = ARM_CPU_MODE_ABT;
+ break;
+ case 0x16: /* SPSR_und */
+ *tgtmode = ARM_CPU_MODE_UND;
+ break;
+ case 0x1c: /* SPSR_mon */
+ *tgtmode = ARM_CPU_MODE_MON;
+ break;
+ case 0x1e: /* SPSR_hyp */
+ *tgtmode = ARM_CPU_MODE_HYP;
+ break;
+ default: /* unallocated */
+ goto undef;
+ }
+ /* We arbitrarily assign SPSR a register number of 16. */
+ *regno = 16;
+ } else {
+ /* general purpose registers for other modes */
+ switch (sysm) {
+ case 0x0 ... 0x6: /* 0b00xxx : r8_usr ... r14_usr */
+ *tgtmode = ARM_CPU_MODE_USR;
+ *regno = sysm + 8;
+ break;
+ case 0x8 ... 0xe: /* 0b01xxx : r8_fiq ... r14_fiq */
+ *tgtmode = ARM_CPU_MODE_FIQ;
+ *regno = sysm;
+ break;
+ case 0x10 ... 0x11: /* 0b1000x : r14_irq, r13_irq */
+ *tgtmode = ARM_CPU_MODE_IRQ;
+ *regno = sysm & 1 ? 13 : 14;
+ break;
+ case 0x12 ... 0x13: /* 0b1001x : r14_svc, r13_svc */
+ *tgtmode = ARM_CPU_MODE_SVC;
+ *regno = sysm & 1 ? 13 : 14;
+ break;
+ case 0x14 ... 0x15: /* 0b1010x : r14_abt, r13_abt */
+ *tgtmode = ARM_CPU_MODE_ABT;
+ *regno = sysm & 1 ? 13 : 14;
+ break;
+ case 0x16 ... 0x17: /* 0b1011x : r14_und, r13_und */
+ *tgtmode = ARM_CPU_MODE_UND;
+ *regno = sysm & 1 ? 13 : 14;
+ break;
+ case 0x1c ... 0x1d: /* 0b1110x : r14_mon, r13_mon */
+ *tgtmode = ARM_CPU_MODE_MON;
+ *regno = sysm & 1 ? 13 : 14;
+ break;
+ case 0x1e ... 0x1f: /* 0b1111x : elr_hyp, r13_hyp */
+ *tgtmode = ARM_CPU_MODE_HYP;
+ /* Arbitrarily pick 17 for ELR_Hyp (which is not a banked LR!) */
+ *regno = sysm & 1 ? 13 : 17;
+ break;
+ default: /* unallocated */
+ goto undef;
+ }
+ }
+
+ /* Catch the 'accessing inaccessible register' cases we can detect
+ * at translate time.
+ */
+ switch (*tgtmode) {
+ case ARM_CPU_MODE_MON:
+ if (!arm_dc_feature(s, ARM_FEATURE_EL3) || s->ns) {
+ goto undef;
+ }
+ if (s->current_el == 1) {
+ /* If we're in Secure EL1 (which implies that EL3 is AArch64)
+ * then accesses to Mon registers trap to Secure EL2, if it exists,
+ * otherwise EL3.
+ */
+ TCGv_i32 tcg_el;
+
+ if (arm_dc_feature(s, ARM_FEATURE_AARCH64) &&
+ dc_isar_feature(aa64_sel2, s)) {
+ /* Target EL is EL<3 minus SCR_EL3.EEL2> */
+ tcg_el = load_cpu_field(cp15.scr_el3);
+ tcg_gen_sextract_i32(tcg_el, tcg_el, ctz32(SCR_EEL2), 1);
+ tcg_gen_addi_i32(tcg_el, tcg_el, 3);
+ } else {
+ tcg_el = tcg_constant_i32(3);
+ }
+
+ gen_exception_insn_el_v(s, 0, EXCP_UDEF,
+ syn_uncategorized(), tcg_el);
+ tcg_temp_free_i32(tcg_el);
+ return false;
+ }
+ break;
+ case ARM_CPU_MODE_HYP:
+ /*
+ * SPSR_hyp and r13_hyp can only be accessed from Monitor mode
+ * (and so we can forbid accesses from EL2 or below). elr_hyp
+ * can be accessed also from Hyp mode, so forbid accesses from
+ * EL0 or EL1.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_EL2) || s->current_el < 2 ||
+ (s->current_el < 3 && *regno != 17)) {
+ goto undef;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return true;
+
+undef:
+ /* If we get here then some access check did not pass */
+ gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized());
+ return false;
+}
+
+static void gen_msr_banked(DisasContext *s, int r, int sysm, int rn)
+{
+ TCGv_i32 tcg_reg;
+ int tgtmode = 0, regno = 0;
+
+ if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, &regno)) {
+ return;
+ }
+
+ /* Sync state because msr_banked() can raise exceptions */
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ tcg_reg = load_reg(s, rn);
+ gen_helper_msr_banked(cpu_env, tcg_reg,
+ tcg_constant_i32(tgtmode),
+ tcg_constant_i32(regno));
+ tcg_temp_free_i32(tcg_reg);
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+}
+
+static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn)
+{
+ TCGv_i32 tcg_reg;
+ int tgtmode = 0, regno = 0;
+
+ if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, &regno)) {
+ return;
+ }
+
+ /* Sync state because mrs_banked() can raise exceptions */
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ tcg_reg = tcg_temp_new_i32();
+ gen_helper_mrs_banked(tcg_reg, cpu_env,
+ tcg_constant_i32(tgtmode),
+ tcg_constant_i32(regno));
+ store_reg(s, rn, tcg_reg);
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+}
+
+/* Store value to PC as for an exception return (ie don't
+ * mask bits). The subsequent call to gen_helper_cpsr_write_eret()
+ * will do the masking based on the new value of the Thumb bit.
+ */
+static void store_pc_exc_ret(DisasContext *s, TCGv_i32 pc)
+{
+ tcg_gen_mov_i32(cpu_R[15], pc);
+ tcg_temp_free_i32(pc);
+}
+
+/* Generate a v6 exception return. Marks both values as dead. */
+static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
+{
+ store_pc_exc_ret(s, pc);
+ /* The cpsr_write_eret helper will mask the low bits of PC
+ * appropriately depending on the new Thumb bit, so it must
+ * be called after storing the new PC.
+ */
+ if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
+ gen_io_start();
+ }
+ gen_helper_cpsr_write_eret(cpu_env, cpsr);
+ tcg_temp_free_i32(cpsr);
+ /* Must exit loop to check un-masked IRQs */
+ s->base.is_jmp = DISAS_EXIT;
+}
+
+/* Generate an old-style exception return. Marks pc as dead. */
+static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
+{
+ gen_rfe(s, pc, load_cpu_field(spsr));
+}
+
+static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz,
+ gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr qc_ptr = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc));
+ tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
+ opr_sz, max_sz, 0, fn);
+ tcg_temp_free_ptr(qc_ptr);
+}
+
+void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_3_ptr * const fns[2] = {
+ gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
+ };
+ tcg_debug_assert(vece >= 1 && vece <= 2);
+ gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
+}
+
+void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_3_ptr * const fns[2] = {
+ gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
+ };
+ tcg_debug_assert(vece >= 1 && vece <= 2);
+ gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
+}
+
+#define GEN_CMP0(NAME, COND) \
+ static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \
+ { \
+ tcg_gen_setcondi_i32(COND, d, a, 0); \
+ tcg_gen_neg_i32(d, d); \
+ } \
+ static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \
+ { \
+ tcg_gen_setcondi_i64(COND, d, a, 0); \
+ tcg_gen_neg_i64(d, d); \
+ } \
+ static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \
+ { \
+ TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \
+ tcg_gen_cmp_vec(COND, vece, d, a, zero); \
+ } \
+ void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \
+ uint32_t opr_sz, uint32_t max_sz) \
+ { \
+ const GVecGen2 op[4] = { \
+ { .fno = gen_helper_gvec_##NAME##0_b, \
+ .fniv = gen_##NAME##0_vec, \
+ .opt_opc = vecop_list_cmp, \
+ .vece = MO_8 }, \
+ { .fno = gen_helper_gvec_##NAME##0_h, \
+ .fniv = gen_##NAME##0_vec, \
+ .opt_opc = vecop_list_cmp, \
+ .vece = MO_16 }, \
+ { .fni4 = gen_##NAME##0_i32, \
+ .fniv = gen_##NAME##0_vec, \
+ .opt_opc = vecop_list_cmp, \
+ .vece = MO_32 }, \
+ { .fni8 = gen_##NAME##0_i64, \
+ .fniv = gen_##NAME##0_vec, \
+ .opt_opc = vecop_list_cmp, \
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64, \
+ .vece = MO_64 }, \
+ }; \
+ tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \
+ }
+
+static const TCGOpcode vecop_list_cmp[] = {
+ INDEX_op_cmp_vec, 0
+};
+
+GEN_CMP0(ceq, TCG_COND_EQ)
+GEN_CMP0(cle, TCG_COND_LE)
+GEN_CMP0(cge, TCG_COND_GE)
+GEN_CMP0(clt, TCG_COND_LT)
+GEN_CMP0(cgt, TCG_COND_GT)
+
+#undef GEN_CMP0
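+
+/*
+ * As an illustration, GEN_CMP0(ceq, TCG_COND_EQ) above defines
+ * gen_ceq0_i32(), gen_ceq0_i64(), gen_ceq0_vec() and the public
+ * gen_gvec_ceq0(), each producing all-ones for elements equal to
+ * zero and all-zeros otherwise; the scalar variants get there by
+ * negating the 0/1 setcond result.
+ */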
+
+static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_vec_sar8i_i64(a, a, shift);
+ tcg_gen_vec_add8_i64(d, d, a);
+}
+
+static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_vec_sar16i_i64(a, a, shift);
+ tcg_gen_vec_add16_i64(d, d, a);
+}
+
+static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{
+ tcg_gen_sari_i32(a, a, shift);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_sari_i64(a, a, shift);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ tcg_gen_sari_vec(vece, a, a, sh);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sari_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_ssra8_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_ssra16_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_ssra32_i32,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_ssra64_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize]. */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Signed results in all sign bits.
+ */
+ shift = MIN(shift, (8 << vece) - 1);
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+}
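+
+/*
+ * For example, SSRA with a shift of 8 on byte elements is clamped to
+ * a shift of 7 above, so each element adds either 0 or -1 (its sign)
+ * to the accumulator, matching the architectural result for
+ * shift == esize.
+ */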
+
+static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_vec_shr8i_i64(a, a, shift);
+ tcg_gen_vec_add8_i64(d, d, a);
+}
+
+static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_vec_shr16i_i64(a, a, shift);
+ tcg_gen_vec_add16_i64(d, d, a);
+}
+
+static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{
+ tcg_gen_shri_i32(a, a, shift);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_shri_i64(a, a, shift);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ tcg_gen_shri_vec(vece, a, a, sh);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_usra8_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8, },
+ { .fni8 = gen_usra16_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16, },
+ { .fni4 = gen_usra32_i32,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32, },
+ { .fni8 = gen_usra64_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64, },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize]. */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Unsigned results in all zeros as input to accumulate: nop.
+ */
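+ /*
+ * For example, for MO_8 a shift of 8 turns every unsigned byte into
+ * 0, so D += 0 leaves D unchanged and only the tail of the vector
+ * needs to be cleared.
+ */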
+ if (shift < (8 << vece)) {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ } else {
+ /* Nop, but we do need to clear the tail. */
+ tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
+ }
+}
+
+/*
+ * Shift one less than the requested amount, and the low bit is
+ * the rounding bit. For the 8 and 16-bit operations, because we
+ * mask the low bit, we can perform a normal integer shift instead
+ * of a vector shift.
+ */
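+/*
+ * For example, a rounding shift right of 14 by 2: the rounding bit is
+ * (14 >> 1) & 1 == 1 and the truncating shift gives 14 >> 2 == 3, so
+ * the result is 4, i.e. 14/4 rounded to nearest.
+ */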
+static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, sh - 1);
+ tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
+ tcg_gen_vec_sar8i_i64(d, a, sh);
+ tcg_gen_vec_add8_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, sh - 1);
+ tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
+ tcg_gen_vec_sar16i_i64(d, a, sh);
+ tcg_gen_vec_add16_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
+{
+ TCGv_i32 t;
+
+ /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
+ if (sh == 32) {
+ tcg_gen_movi_i32(d, 0);
+ return;
+ }
+ t = tcg_temp_new_i32();
+ tcg_gen_extract_i32(t, a, sh - 1, 1);
+ tcg_gen_sari_i32(d, a, sh);
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_extract_i64(t, a, sh - 1, 1);
+ tcg_gen_sari_i64(d, a, sh);
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec ones = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_shri_vec(vece, t, a, sh - 1);
+ tcg_gen_dupi_vec(vece, ones, 1);
+ tcg_gen_and_vec(vece, t, t, ones);
+ tcg_gen_sari_vec(vece, d, a, sh);
+ tcg_gen_add_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(ones);
+}
+
+void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_srshr8_i64,
+ .fniv = gen_srshr_vec,
+ .fno = gen_helper_gvec_srshr_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_srshr16_i64,
+ .fniv = gen_srshr_vec,
+ .fno = gen_helper_gvec_srshr_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_srshr32_i32,
+ .fniv = gen_srshr_vec,
+ .fno = gen_helper_gvec_srshr_s,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_srshr64_i64,
+ .fniv = gen_srshr_vec,
+ .fno = gen_helper_gvec_srshr_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize] */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ if (shift == (8 << vece)) {
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Signed results in all sign bits. With rounding, this produces
+ * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
+ * I.e. always zero.
+ */
+ tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
+ } else {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ }
+}
+
+static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ gen_srshr8_i64(t, a, sh);
+ tcg_gen_vec_add8_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ gen_srshr16_i64(t, a, sh);
+ tcg_gen_vec_add16_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ gen_srshr32_i32(t, a, sh);
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ gen_srshr64_i64(t, a, sh);
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ gen_srshr_vec(vece, t, a, sh);
+ tcg_gen_add_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_srsra8_i64,
+ .fniv = gen_srsra_vec,
+ .fno = gen_helper_gvec_srsra_b,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fni8 = gen_srsra16_i64,
+ .fniv = gen_srsra_vec,
+ .fno = gen_helper_gvec_srsra_h,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_srsra32_i32,
+ .fniv = gen_srsra_vec,
+ .fno = gen_helper_gvec_srsra_s,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_srsra64_i64,
+ .fniv = gen_srsra_vec,
+ .fno = gen_helper_gvec_srsra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize] */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Signed results in all sign bits. With rounding, this produces
+ * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
+ * I.e. always zero. With accumulation, this leaves D unchanged.
+ */
+ if (shift == (8 << vece)) {
+ /* Nop, but we do need to clear the tail. */
+ tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
+ } else {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ }
+}
+
+static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, sh - 1);
+ tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
+ tcg_gen_vec_shr8i_i64(d, a, sh);
+ tcg_gen_vec_add8_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, sh - 1);
+ tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
+ tcg_gen_vec_shr16i_i64(d, a, sh);
+ tcg_gen_vec_add16_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
+{
+ TCGv_i32 t;
+
+ /* Handle shift by the input size for the benefit of trans_URSHR_ri */
+ if (sh == 32) {
+ tcg_gen_extract_i32(d, a, sh - 1, 1);
+ return;
+ }
+ t = tcg_temp_new_i32();
+ tcg_gen_extract_i32(t, a, sh - 1, 1);
+ tcg_gen_shri_i32(d, a, sh);
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_extract_i64(t, a, sh - 1, 1);
+ tcg_gen_shri_i64(d, a, sh);
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec ones = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_shri_vec(vece, t, a, shift - 1);
+ tcg_gen_dupi_vec(vece, ones, 1);
+ tcg_gen_and_vec(vece, t, t, ones);
+ tcg_gen_shri_vec(vece, d, a, shift);
+ tcg_gen_add_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(ones);
+}
+
+void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_urshr8_i64,
+ .fniv = gen_urshr_vec,
+ .fno = gen_helper_gvec_urshr_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_urshr16_i64,
+ .fniv = gen_urshr_vec,
+ .fno = gen_helper_gvec_urshr_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_urshr32_i32,
+ .fniv = gen_urshr_vec,
+ .fno = gen_helper_gvec_urshr_s,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_urshr64_i64,
+ .fniv = gen_urshr_vec,
+ .fno = gen_helper_gvec_urshr_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize] */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ if (shift == (8 << vece)) {
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Unsigned results in zero. With rounding, this produces a
+ * copy of the most significant bit.
+ */
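+ /*
+ * For example, for MO_8 with shift 8 the truncated result is 0 and
+ * the rounding bit is bit 7 of the input, so the final result is
+ * just the input shifted right by 7, which is what the gvec_shri
+ * below computes.
+ */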
+ tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
+ } else {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ }
+}
+
+static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ if (sh == 8) {
+ tcg_gen_vec_shr8i_i64(t, a, 7);
+ } else {
+ gen_urshr8_i64(t, a, sh);
+ }
+ tcg_gen_vec_add8_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ if (sh == 16) {
+ tcg_gen_vec_shr16i_i64(t, a, 15);
+ } else {
+ gen_urshr16_i64(t, a, sh);
+ }
+ tcg_gen_vec_add16_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ if (sh == 32) {
+ tcg_gen_shri_i32(t, a, 31);
+ } else {
+ gen_urshr32_i32(t, a, sh);
+ }
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ if (sh == 64) {
+ tcg_gen_shri_i64(t, a, 63);
+ } else {
+ gen_urshr64_i64(t, a, sh);
+ }
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ if (sh == (8 << vece)) {
+ tcg_gen_shri_vec(vece, t, a, sh - 1);
+ } else {
+ gen_urshr_vec(vece, t, a, sh);
+ }
+ tcg_gen_add_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_ursra8_i64,
+ .fniv = gen_ursra_vec,
+ .fno = gen_helper_gvec_ursra_b,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fni8 = gen_ursra16_i64,
+ .fniv = gen_ursra_vec,
+ .fno = gen_helper_gvec_ursra_h,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_ursra32_i32,
+ .fniv = gen_ursra_vec,
+ .fno = gen_helper_gvec_ursra_s,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_ursra64_i64,
+ .fniv = gen_ursra_vec,
+ .fno = gen_helper_gvec_ursra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize] */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+}
+
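+/*
+ * SRI (shift right and insert) replaces only the bits that the shifted
+ * value can reach, keeping the top "shift" bits of each destination
+ * element. For example, for MO_8 with shift 3 the mask is 0x1f per
+ * byte, so d = (d & ~0x1f) | ((a >> 3) & 0x1f).
+ */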
+static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ uint64_t mask = dup_const(MO_8, 0xff >> shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ uint64_t mask = dup_const(MO_16, 0xffff >> shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{
+ tcg_gen_shri_i32(a, a, shift);
+ tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
+}
+
+static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_shri_i64(a, a, shift);
+ tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
+}
+
+static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec m = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
+ tcg_gen_shri_vec(vece, t, a, sh);
+ tcg_gen_and_vec(vece, d, d, m);
+ tcg_gen_or_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(m);
+}
+
+void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
+ const GVecGen2i ops[4] = {
+ { .fni8 = gen_shr8_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .fno = gen_helper_gvec_sri_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_shr16_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .fno = gen_helper_gvec_sri_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_shr32_ins_i32,
+ .fniv = gen_shr_ins_vec,
+ .fno = gen_helper_gvec_sri_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_shr64_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .fno = gen_helper_gvec_sri_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [1..esize]. */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /* Shift of esize leaves destination unchanged. */
+ if (shift < (8 << vece)) {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ } else {
+ /* Nop, but we do need to clear the tail. */
+ tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
+ }
+}
+
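+/*
+ * SLI (shift left and insert) is the mirror image: the low "shift" bits
+ * of each destination element are preserved. For example, for MO_8 with
+ * shift 3, d = (d & 0x07) | ((a << 3) & 0xf8).
+ */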
+static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ uint64_t mask = dup_const(MO_8, 0xff << shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shli_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ uint64_t mask = dup_const(MO_16, 0xffff << shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shli_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{
+ tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
+}
+
+static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+ tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
+}
+
+static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec m = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_shli_vec(vece, t, a, sh);
+ tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
+ tcg_gen_and_vec(vece, d, d, m);
+ tcg_gen_or_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(m);
+}
+
+void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
+ const GVecGen2i ops[4] = {
+ { .fni8 = gen_shl8_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .fno = gen_helper_gvec_sli_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_shl16_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .fno = gen_helper_gvec_sli_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_shl32_ins_i32,
+ .fniv = gen_shl_ins_vec,
+ .fno = gen_helper_gvec_sli_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_shl64_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .fno = gen_helper_gvec_sli_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+
+ /* tszimm encoding produces immediates in the range [0..esize-1]. */
+ tcg_debug_assert(shift >= 0);
+ tcg_debug_assert(shift < (8 << vece));
+
+ if (shift == 0) {
+ tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
+ } else {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ }
+}
+
+static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_helper_neon_mul_u8(a, a, b);
+ gen_helper_neon_add_u8(d, d, a);
+}
+
+static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_helper_neon_mul_u8(a, a, b);
+ gen_helper_neon_sub_u8(d, d, a);
+}
+
+static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_helper_neon_mul_u16(a, a, b);
+ gen_helper_neon_add_u16(d, d, a);
+}
+
+static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_helper_neon_mul_u16(a, a, b);
+ gen_helper_neon_sub_u16(d, d, a);
+}
+
+static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ tcg_gen_mul_i32(a, a, b);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ tcg_gen_mul_i32(a, a, b);
+ tcg_gen_sub_i32(d, d, a);
+}
+
+static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ tcg_gen_mul_i64(a, a, b);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ tcg_gen_mul_i64(a, a, b);
+ tcg_gen_sub_i64(d, d, a);
+}
+
+static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_mul_vec(vece, a, a, b);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_mul_vec(vece, a, a, b);
+ tcg_gen_sub_vec(vece, d, d, a);
+}
+
+/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
+ * these tables are shared with AArch64 which does support them.
+ */
+void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_mul_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fni4 = gen_mla8_i32,
+ .fniv = gen_mla_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni4 = gen_mla16_i32,
+ .fniv = gen_mla_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_mla32_i32,
+ .fniv = gen_mla_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_mla64_i64,
+ .fniv = gen_mla_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_mul_vec, INDEX_op_sub_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fni4 = gen_mls8_i32,
+ .fniv = gen_mls_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni4 = gen_mls16_i32,
+ .fniv = gen_mls_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_mls32_i32,
+ .fniv = gen_mls_vec,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_mls64_i64,
+ .fniv = gen_mls_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+/* CMTST : test is "if (X & Y != 0)". */
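+/*
+ * For example, per element: a = 0x11, b = 0x10 gives a & b = 0x10 != 0,
+ * so the result is all ones; a = 0x0f, b = 0xf0 gives 0, so the result
+ * is all zeros.
+ */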
+static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ tcg_gen_and_i32(d, a, b);
+ tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0);
+ tcg_gen_neg_i32(d, d);
+}
+
+void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ tcg_gen_and_i64(d, a, b);
+ tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0);
+ tcg_gen_neg_i64(d, d);
+}
+
+static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_and_vec(vece, d, a, b);
+ tcg_gen_dupi_vec(vece, a, 0);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a);
+}
+
+void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
+ static const GVecGen3 ops[4] = {
+ { .fni4 = gen_helper_neon_tst_u8,
+ .fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni4 = gen_helper_neon_tst_u16,
+ .fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_cmtst_i32,
+ .fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_cmtst_i64,
+ .fniv = gen_cmtst_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
+{
+ TCGv_i32 lval = tcg_temp_new_i32();
+ TCGv_i32 rval = tcg_temp_new_i32();
+ TCGv_i32 lsh = tcg_temp_new_i32();
+ TCGv_i32 rsh = tcg_temp_new_i32();
+ TCGv_i32 zero = tcg_constant_i32(0);
+ TCGv_i32 max = tcg_constant_i32(32);
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
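+ /*
+ * For example, a shift byte of 0xfc (-4): lsh == -4 is not < 32 as
+ * unsigned, so the first movcond discards lval and leaves 0, while
+ * rsh == 4 is < 32, so the second movcond selects rval == src >> 4,
+ * giving the required right shift by 4.
+ */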
+ tcg_gen_ext8s_i32(lsh, shift);
+ tcg_gen_neg_i32(rsh, lsh);
+ tcg_gen_shl_i32(lval, src, lsh);
+ tcg_gen_shr_i32(rval, src, rsh);
+ tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
+ tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
+
+ tcg_temp_free_i32(lval);
+ tcg_temp_free_i32(rval);
+ tcg_temp_free_i32(lsh);
+ tcg_temp_free_i32(rsh);
+}
+
+void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
+{
+ TCGv_i64 lval = tcg_temp_new_i64();
+ TCGv_i64 rval = tcg_temp_new_i64();
+ TCGv_i64 lsh = tcg_temp_new_i64();
+ TCGv_i64 rsh = tcg_temp_new_i64();
+ TCGv_i64 zero = tcg_constant_i64(0);
+ TCGv_i64 max = tcg_constant_i64(64);
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
+ tcg_gen_ext8s_i64(lsh, shift);
+ tcg_gen_neg_i64(rsh, lsh);
+ tcg_gen_shl_i64(lval, src, lsh);
+ tcg_gen_shr_i64(rval, src, rsh);
+ tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
+ tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
+
+ tcg_temp_free_i64(lval);
+ tcg_temp_free_i64(rval);
+ tcg_temp_free_i64(lsh);
+ tcg_temp_free_i64(rsh);
+}
+
+static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
+ TCGv_vec src, TCGv_vec shift)
+{
+ TCGv_vec lval = tcg_temp_new_vec_matching(dst);
+ TCGv_vec rval = tcg_temp_new_vec_matching(dst);
+ TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
+ TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
+ TCGv_vec msk, max;
+
+ tcg_gen_neg_vec(vece, rsh, shift);
+ if (vece == MO_8) {
+ tcg_gen_mov_vec(lsh, shift);
+ } else {
+ msk = tcg_temp_new_vec_matching(dst);
+ tcg_gen_dupi_vec(vece, msk, 0xff);
+ tcg_gen_and_vec(vece, lsh, shift, msk);
+ tcg_gen_and_vec(vece, rsh, rsh, msk);
+ tcg_temp_free_vec(msk);
+ }
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
+ tcg_gen_shlv_vec(vece, lval, src, lsh);
+ tcg_gen_shrv_vec(vece, rval, src, rsh);
+
+ max = tcg_temp_new_vec_matching(dst);
+ tcg_gen_dupi_vec(vece, max, 8 << vece);
+
+ /*
+ * The choice of LT (signed) and GEU (unsigned) is biased toward
+ * the instructions of the x86_64 host. For MO_8, the whole byte
+ * is significant so we must use an unsigned compare; otherwise we
+ * have already masked to a byte and so a signed compare works.
+ * Other tcg hosts have a full set of comparisons and do not care.
+ */
+ if (vece == MO_8) {
+ tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
+ tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
+ tcg_gen_andc_vec(vece, lval, lval, lsh);
+ tcg_gen_andc_vec(vece, rval, rval, rsh);
+ } else {
+ tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
+ tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
+ tcg_gen_and_vec(vece, lval, lval, lsh);
+ tcg_gen_and_vec(vece, rval, rval, rsh);
+ }
+ tcg_gen_or_vec(vece, dst, lval, rval);
+
+ tcg_temp_free_vec(max);
+ tcg_temp_free_vec(lval);
+ tcg_temp_free_vec(rval);
+ tcg_temp_free_vec(lsh);
+ tcg_temp_free_vec(rsh);
+}
+
+void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_neg_vec, INDEX_op_shlv_vec,
+ INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_ushl_vec,
+ .fno = gen_helper_gvec_ushl_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = gen_ushl_vec,
+ .fno = gen_helper_gvec_ushl_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_ushl_i32,
+ .fniv = gen_ushl_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_ushl_i64,
+ .fniv = gen_ushl_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
+{
+ TCGv_i32 lval = tcg_temp_new_i32();
+ TCGv_i32 rval = tcg_temp_new_i32();
+ TCGv_i32 lsh = tcg_temp_new_i32();
+ TCGv_i32 rsh = tcg_temp_new_i32();
+ TCGv_i32 zero = tcg_constant_i32(0);
+ TCGv_i32 max = tcg_constant_i32(31);
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
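+ /*
+ * For example, a shift byte of 0xd8 (-40): rsh == 40 is clamped to 31
+ * by the umin, so rval == src >> 31 is 0 or -1 according to the sign,
+ * which is the architectural result for a signed right shift by 32 or
+ * more; lsh is negative, so the final movcond selects rval.
+ */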
+ tcg_gen_ext8s_i32(lsh, shift);
+ tcg_gen_neg_i32(rsh, lsh);
+ tcg_gen_shl_i32(lval, src, lsh);
+ tcg_gen_umin_i32(rsh, rsh, max);
+ tcg_gen_sar_i32(rval, src, rsh);
+ tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
+ tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
+
+ tcg_temp_free_i32(lval);
+ tcg_temp_free_i32(rval);
+ tcg_temp_free_i32(lsh);
+ tcg_temp_free_i32(rsh);
+}
+
+void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
+{
+ TCGv_i64 lval = tcg_temp_new_i64();
+ TCGv_i64 rval = tcg_temp_new_i64();
+ TCGv_i64 lsh = tcg_temp_new_i64();
+ TCGv_i64 rsh = tcg_temp_new_i64();
+ TCGv_i64 zero = tcg_constant_i64(0);
+ TCGv_i64 max = tcg_constant_i64(63);
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
+ tcg_gen_ext8s_i64(lsh, shift);
+ tcg_gen_neg_i64(rsh, lsh);
+ tcg_gen_shl_i64(lval, src, lsh);
+ tcg_gen_umin_i64(rsh, rsh, max);
+ tcg_gen_sar_i64(rval, src, rsh);
+ tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
+ tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
+
+ tcg_temp_free_i64(lval);
+ tcg_temp_free_i64(rval);
+ tcg_temp_free_i64(lsh);
+ tcg_temp_free_i64(rsh);
+}
+
+static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
+ TCGv_vec src, TCGv_vec shift)
+{
+ TCGv_vec lval = tcg_temp_new_vec_matching(dst);
+ TCGv_vec rval = tcg_temp_new_vec_matching(dst);
+ TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
+ TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
+ TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
+
+ /*
+ * Rely on the TCG guarantee that out of range shifts produce
+ * unspecified results, not undefined behaviour (i.e. no trap).
+ * Discard out-of-range results after the fact.
+ */
+ tcg_gen_neg_vec(vece, rsh, shift);
+ if (vece == MO_8) {
+ tcg_gen_mov_vec(lsh, shift);
+ } else {
+ tcg_gen_dupi_vec(vece, tmp, 0xff);
+ tcg_gen_and_vec(vece, lsh, shift, tmp);
+ tcg_gen_and_vec(vece, rsh, rsh, tmp);
+ }
+
+ /* Bound rsh so an out-of-range right shift produces the sign fill (0 or -1). */
+ tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
+ tcg_gen_umin_vec(vece, rsh, rsh, tmp);
+ tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
+
+ tcg_gen_shlv_vec(vece, lval, src, lsh);
+ tcg_gen_sarv_vec(vece, rval, src, rsh);
+
+ /* Select in-bound left shift. */
+ tcg_gen_andc_vec(vece, lval, lval, tmp);
+
+ /* Select between left and right shift. */
+ if (vece == MO_8) {
+ tcg_gen_dupi_vec(vece, tmp, 0);
+ tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
+ } else {
+ tcg_gen_dupi_vec(vece, tmp, 0x80);
+ tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
+ }
+
+ tcg_temp_free_vec(lval);
+ tcg_temp_free_vec(rval);
+ tcg_temp_free_vec(lsh);
+ tcg_temp_free_vec(rsh);
+ tcg_temp_free_vec(tmp);
+}
+
+void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
+ INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_sshl_vec,
+ .fno = gen_helper_gvec_sshl_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = gen_sshl_vec,
+ .fno = gen_helper_gvec_sshl_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_sshl_i32,
+ .fniv = gen_sshl_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_sshl_i64,
+ .fniv = gen_sshl_vec,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec x = tcg_temp_new_vec_matching(t);
+ tcg_gen_add_vec(vece, x, a, b);
+ tcg_gen_usadd_vec(vece, t, a, b);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+ tcg_gen_or_vec(vece, sat, sat, x);
+ tcg_temp_free_vec(x);
+}
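+
+/*
+ * For example, for MO_8 with a = 0xf0 and b = 0x20: the wrapping add
+ * gives 0x10 while the saturating add gives 0xff; the lanes differ, so
+ * the compare yields all ones and the QC (saturation) bits are set.
+ */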
+
+void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_uqadd_vec,
+ .fno = gen_helper_gvec_uqadd_b,
+ .write_aofs = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = gen_uqadd_vec,
+ .fno = gen_helper_gvec_uqadd_h,
+ .write_aofs = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fniv = gen_uqadd_vec,
+ .fno = gen_helper_gvec_uqadd_s,
+ .write_aofs = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fniv = gen_uqadd_vec,
+ .fno = gen_helper_gvec_uqadd_d,
+ .write_aofs = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
+ rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec x = tcg_temp_new_vec_matching(t);
+ tcg_gen_add_vec(vece, x, a, b);
+ tcg_gen_ssadd_vec(vece, t, a, b);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+ tcg_gen_or_vec(vece, sat, sat, x);
+ tcg_temp_free_vec(x);
+}
+
+void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_sqadd_vec,
+ .fno = gen_helper_gvec_sqadd_b,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_8 },
+ { .fniv = gen_sqadd_vec,
+ .fno = gen_helper_gvec_sqadd_h,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_16 },
+ { .fniv = gen_sqadd_vec,
+ .fno = gen_helper_gvec_sqadd_s,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_32 },
+ { .fniv = gen_sqadd_vec,
+ .fno = gen_helper_gvec_sqadd_d,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
+ rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec x = tcg_temp_new_vec_matching(t);
+ tcg_gen_sub_vec(vece, x, a, b);
+ tcg_gen_ussub_vec(vece, t, a, b);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+ tcg_gen_or_vec(vece, sat, sat, x);
+ tcg_temp_free_vec(x);
+}
+
+void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_uqsub_vec,
+ .fno = gen_helper_gvec_uqsub_b,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_8 },
+ { .fniv = gen_uqsub_vec,
+ .fno = gen_helper_gvec_uqsub_h,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_16 },
+ { .fniv = gen_uqsub_vec,
+ .fno = gen_helper_gvec_uqsub_s,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_32 },
+ { .fniv = gen_uqsub_vec,
+ .fno = gen_helper_gvec_uqsub_d,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
+ rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec x = tcg_temp_new_vec_matching(t);
+ tcg_gen_sub_vec(vece, x, a, b);
+ tcg_gen_sssub_vec(vece, t, a, b);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+ tcg_gen_or_vec(vece, sat, sat, x);
+ tcg_temp_free_vec(x);
+}
+
+void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+ };
+ static const GVecGen4 ops[4] = {
+ { .fniv = gen_sqsub_vec,
+ .fno = gen_helper_gvec_sqsub_b,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_8 },
+ { .fniv = gen_sqsub_vec,
+ .fno = gen_helper_gvec_sqsub_h,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_16 },
+ { .fniv = gen_sqsub_vec,
+ .fno = gen_helper_gvec_sqsub_s,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_32 },
+ { .fniv = gen_sqsub_vec,
+ .fno = gen_helper_gvec_sqsub_d,
+ .opt_opc = vecop_list,
+ .write_aofs = true,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
+ rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_sub_i32(t, a, b);
+ tcg_gen_sub_i32(d, b, a);
+ tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_sub_i64(t, a, b);
+ tcg_gen_sub_i64(d, b, a);
+ tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_smin_vec(vece, t, a, b);
+ tcg_gen_smax_vec(vece, d, a, b);
+ tcg_gen_sub_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
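+
+/*
+ * For example, a = -3 and b = 5: smax is 5 and smin is -3, so the
+ * subtraction yields 8, the signed absolute difference, without any
+ * conditional.
+ */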
+
+void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_sabd_vec,
+ .fno = gen_helper_gvec_sabd_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = gen_sabd_vec,
+ .fno = gen_helper_gvec_sabd_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_sabd_i32,
+ .fniv = gen_sabd_vec,
+ .fno = gen_helper_gvec_sabd_s,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_sabd_i64,
+ .fniv = gen_sabd_vec,
+ .fno = gen_helper_gvec_sabd_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_sub_i32(t, a, b);
+ tcg_gen_sub_i32(d, b, a);
+ tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_sub_i64(t, a, b);
+ tcg_gen_sub_i64(d, b, a);
+ tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_umin_vec(vece, t, a, b);
+ tcg_gen_umax_vec(vece, d, a, b);
+ tcg_gen_sub_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_uabd_vec,
+ .fno = gen_helper_gvec_uabd_b,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = gen_uabd_vec,
+ .fno = gen_helper_gvec_uabd_h,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_uabd_i32,
+ .fniv = gen_uabd_vec,
+ .fno = gen_helper_gvec_uabd_s,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_uabd_i64,
+ .fniv = gen_uabd_vec,
+ .fno = gen_helper_gvec_uabd_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+ gen_sabd_i32(t, a, b);
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+ gen_sabd_i64(t, a, b);
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ gen_sabd_vec(vece, t, a, b);
+ tcg_gen_add_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sub_vec, INDEX_op_add_vec,
+ INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_saba_vec,
+ .fno = gen_helper_gvec_saba_b,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fniv = gen_saba_vec,
+ .fno = gen_helper_gvec_saba_h,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_saba_i32,
+ .fniv = gen_saba_vec,
+ .fno = gen_helper_gvec_saba_s,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_saba_i64,
+ .fniv = gen_saba_vec,
+ .fno = gen_helper_gvec_saba_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+ gen_uabd_i32(t, a, b);
+ tcg_gen_add_i32(d, d, t);
+ tcg_temp_free_i32(t);
+}
+
+static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+ gen_uabd_i64(t, a, b);
+ tcg_gen_add_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ gen_uabd_vec(vece, t, a, b);
+ tcg_gen_add_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sub_vec, INDEX_op_add_vec,
+ INDEX_op_umin_vec, INDEX_op_umax_vec, 0
+ };
+ static const GVecGen3 ops[4] = {
+ { .fniv = gen_uaba_vec,
+ .fno = gen_helper_gvec_uaba_b,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fniv = gen_uaba_vec,
+ .fno = gen_helper_gvec_uaba_h,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_uaba_i32,
+ .fniv = gen_uaba_vec,
+ .fno = gen_helper_gvec_uaba_s,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_uaba_i64,
+ .fniv = gen_uaba_vec,
+ .fno = gen_helper_gvec_uaba_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+ tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
+ int opc1, int crn, int crm, int opc2,
+ bool isread, int rt, int rt2)
+{
+ uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2);
+ const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
+ TCGv_ptr tcg_ri = NULL;
+ bool need_exit_tb;
+ uint32_t syndrome;
+
+ /*
+ * Note that since we are an implementation which takes an
+ * exception on a trapped conditional instruction only if the
+ * instruction passes its condition code check, we can take
+ * advantage of the clause in the ARM ARM that allows us to set
+ * the COND field in the instruction to 0xE in all cases.
+ * We could fish the actual condition out of the insn (ARM)
+ * or the condexec bits (Thumb) but it isn't necessary.
+ */
+ switch (cpnum) {
+ case 14:
+ if (is64) {
+ syndrome = syn_cp14_rrt_trap(1, 0xe, opc1, crm, rt, rt2,
+ isread, false);
+ } else {
+ syndrome = syn_cp14_rt_trap(1, 0xe, opc1, opc2, crn, crm,
+ rt, isread, false);
+ }
+ break;
+ case 15:
+ if (is64) {
+ syndrome = syn_cp15_rrt_trap(1, 0xe, opc1, crm, rt, rt2,
+ isread, false);
+ } else {
+ syndrome = syn_cp15_rt_trap(1, 0xe, opc1, opc2, crn, crm,
+ rt, isread, false);
+ }
+ break;
+ default:
+ /*
+ * ARMv8 defines that only coprocessors 14 and 15 exist,
+ * so this can only happen if this is an ARMv7 or earlier CPU,
+ * in which case the syndrome information won't actually be
+ * guest visible.
+ */
+ assert(!arm_dc_feature(s, ARM_FEATURE_V8));
+ syndrome = syn_uncategorized();
+ break;
+ }
+
+ if (s->hstr_active && cpnum == 15 && s->current_el == 1) {
+ /*
+ * At EL1, check for a HSTR_EL2 trap, which must take precedence
+ * over the UNDEF for "no such register" or the UNDEF for "access
+ * permissions forbid this EL1 access". HSTR_EL2 traps from EL0
+ * only happen if the cpreg doesn't UNDEF at EL0, so we do those in
+ * access_check_cp_reg(), after the checks for whether the access
+ * configurably trapped to EL1.
+ */
+ uint32_t maskbit = is64 ? crm : crn;
+
+ if (maskbit != 4 && maskbit != 14) {
+ /* T4 and T14 are RES0 so never cause traps */
+ TCGv_i32 t;
+ DisasLabel over = gen_disas_label(s);
+
+ t = load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2));
+ tcg_gen_andi_i32(t, t, 1u << maskbit);
+ tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label);
+ tcg_temp_free_i32(t);
+
+ gen_exception_insn(s, 0, EXCP_UDEF, syndrome);
+ set_disas_label(s, over);
+ }
+ }
+
+ if (!ri) {
+ /*
+ * Unknown register; this might be a guest error or a QEMU
+ * unimplemented feature.
+ */
+ if (is64) {
+ qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 "
+ "64 bit system register cp:%d opc1: %d crm:%d "
+ "(%s)\n",
+ isread ? "read" : "write", cpnum, opc1, crm,
+ s->ns ? "non-secure" : "secure");
+ } else {
+ qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 "
+ "system register cp:%d opc1:%d crn:%d crm:%d "
+ "opc2:%d (%s)\n",
+ isread ? "read" : "write", cpnum, opc1, crn,
+ crm, opc2, s->ns ? "non-secure" : "secure");
+ }
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Check access permissions */
+ if (!cp_access_ok(s->current_el, ri, isread)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if ((s->hstr_active && s->current_el == 0) || ri->accessfn ||
+ (ri->fgt && s->fgt_active) ||
+ (arm_dc_feature(s, ARM_FEATURE_XSCALE) && cpnum < 14)) {
+ /*
+ * Emit code to perform further access permissions checks at
+ * runtime; this may result in an exception.
+ * Note that on XScale all cp0..c13 registers do an access check
+ * call in order to handle c15_cpar.
+ */
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ tcg_ri = tcg_temp_new_ptr();
+ gen_helper_access_check_cp_reg(tcg_ri, cpu_env,
+ tcg_constant_i32(key),
+ tcg_constant_i32(syndrome),
+ tcg_constant_i32(isread));
+ } else if (ri->type & ARM_CP_RAISES_EXC) {
+ /*
+ * The readfn or writefn might raise an exception;
+ * synchronize the CPU state in case it does.
+ */
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ }
+
+ /* Handle special cases first */
+ switch (ri->type & ARM_CP_SPECIAL_MASK) {
+ case 0:
+ break;
+ case ARM_CP_NOP:
+ goto exit;
+ case ARM_CP_WFI:
+ if (isread) {
+ unallocated_encoding(s);
+ } else {
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_WFI;
+ }
+ goto exit;
+ default:
+ g_assert_not_reached();
+ }
+
+ if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+ gen_io_start();
+ }
+
+ if (isread) {
+ /* Read */
+ if (is64) {
+ TCGv_i64 tmp64;
+ TCGv_i32 tmp;
+ if (ri->type & ARM_CP_CONST) {
+ tmp64 = tcg_constant_i64(ri->resetvalue);
+ } else if (ri->readfn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ tmp64 = tcg_temp_new_i64();
+ gen_helper_get_cp_reg64(tmp64, cpu_env, tcg_ri);
+ } else {
+ tmp64 = tcg_temp_new_i64();
+ tcg_gen_ld_i64(tmp64, cpu_env, ri->fieldoffset);
+ }
+ tmp = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tmp, tmp64);
+ store_reg(s, rt, tmp);
+ tmp = tcg_temp_new_i32();
+ tcg_gen_extrh_i64_i32(tmp, tmp64);
+ tcg_temp_free_i64(tmp64);
+ store_reg(s, rt2, tmp);
+ } else {
+ TCGv_i32 tmp;
+ if (ri->type & ARM_CP_CONST) {
+ tmp = tcg_constant_i32(ri->resetvalue);
+ } else if (ri->readfn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ tmp = tcg_temp_new_i32();
+ gen_helper_get_cp_reg(tmp, cpu_env, tcg_ri);
+ } else {
+ tmp = load_cpu_offset(ri->fieldoffset);
+ }
+ if (rt == 15) {
+ /*
+ * A destination register of r15 for a 32-bit load sets the
+ * condition codes from the high 4 bits of the loaded value.
+ */
+ gen_set_nzcv(tmp);
+ tcg_temp_free_i32(tmp);
+ } else {
+ store_reg(s, rt, tmp);
+ }
+ }
+ } else {
+ /* Write */
+ if (ri->type & ARM_CP_CONST) {
+ /* If not forbidden by access permissions, treat as WI */
+ goto exit;
+ }
+
+ if (is64) {
+ TCGv_i32 tmplo, tmphi;
+ TCGv_i64 tmp64 = tcg_temp_new_i64();
+ tmplo = load_reg(s, rt);
+ tmphi = load_reg(s, rt2);
+ tcg_gen_concat_i32_i64(tmp64, tmplo, tmphi);
+ tcg_temp_free_i32(tmplo);
+ tcg_temp_free_i32(tmphi);
+ if (ri->writefn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ gen_helper_set_cp_reg64(cpu_env, tcg_ri, tmp64);
+ } else {
+ tcg_gen_st_i64(tmp64, cpu_env, ri->fieldoffset);
+ }
+ tcg_temp_free_i64(tmp64);
+ } else {
+ TCGv_i32 tmp = load_reg(s, rt);
+ if (ri->writefn) {
+ if (!tcg_ri) {
+ tcg_ri = gen_lookup_cp_reg(key);
+ }
+ gen_helper_set_cp_reg(cpu_env, tcg_ri, tmp);
+ tcg_temp_free_i32(tmp);
+ } else {
+ store_cpu_offset(tmp, ri->fieldoffset, 4);
+ }
+ }
+ }
+
+ /* I/O operations must end the TB here (whether read or write) */
+ need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) &&
+ (ri->type & ARM_CP_IO));
+
+ if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+ /*
+ * A write to any coprocessor register that ends a TB
+ * must rebuild the hflags for the next TB.
+ */
+ gen_rebuild_hflags(s, ri->type & ARM_CP_NEWEL);
+ /*
+ * We default to ending the TB on a coprocessor register write,
+ * but allow this to be suppressed by the register definition
+ * (usually only necessary to work around guest bugs).
+ */
+ need_exit_tb = true;
+ }
+ if (need_exit_tb) {
+ gen_lookup_tb(s);
+ }
+
+ exit:
+ if (tcg_ri) {
+ tcg_temp_free_ptr(tcg_ri);
+ }
+}
+
+/* Decode XScale DSP or iWMMXt insn (in the copro space, cp=0 or 1) */
+static void disas_xscale_insn(DisasContext *s, uint32_t insn)
+{
+ int cpnum = (insn >> 8) & 0xf;
+
+ if (extract32(s->c15_cpar, cpnum, 1) == 0) {
+ unallocated_encoding(s);
+ } else if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) {
+ if (disas_iwmmxt_insn(s, insn)) {
+ unallocated_encoding(s);
+ }
+ } else if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) {
+ if (disas_dsp_insn(s, insn)) {
+ unallocated_encoding(s);
+ }
+ }
+}
+
+/* Store a 64-bit value to a register pair. Clobbers val. */
+static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val)
+{
+ TCGv_i32 tmp;
+ tmp = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(tmp, val);
+ store_reg(s, rlow, tmp);
+ tmp = tcg_temp_new_i32();
+ tcg_gen_extrh_i64_i32(tmp, val);
+ store_reg(s, rhigh, tmp);
+}
+
+/* load and add a 64-bit value from a register pair. */
+static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh)
+{
+ TCGv_i64 tmp;
+ TCGv_i32 tmpl;
+ TCGv_i32 tmph;
+
+ /* Load 64-bit value rd:rn. */
+ tmpl = load_reg(s, rlow);
+ tmph = load_reg(s, rhigh);
+ tmp = tcg_temp_new_i64();
+ tcg_gen_concat_i32_i64(tmp, tmpl, tmph);
+ tcg_temp_free_i32(tmpl);
+ tcg_temp_free_i32(tmph);
+ tcg_gen_add_i64(val, val, tmp);
+ tcg_temp_free_i64(tmp);
+}
+
+/* Set N and Z flags from hi|lo. */
+static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
+{
+ tcg_gen_mov_i32(cpu_NF, hi);
+ tcg_gen_or_i32(cpu_ZF, lo, hi);
+}
+
+/*
+ * Load/Store exclusive instructions are implemented by remembering
+ * the value/address loaded, and seeing if these are the same
+ * when the store is performed. This should be sufficient to implement
+ * the architecturally mandated semantics, and avoids having to monitor
+ * regular stores. The compare vs the remembered value is done during
+ * the cmpxchg operation, but we must compare the addresses manually.
+ */
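+/*
+ * In outline, the code below implements an LDREX/STREX pair as:
+ *   LDREX:  cpu_exclusive_addr = addr; cpu_exclusive_val = [addr];
+ *           Rt(:Rt2) = [addr];
+ *   STREX:  fail if addr != cpu_exclusive_addr; otherwise cmpxchg the
+ *           memory against cpu_exclusive_val with the new value and
+ *           fail if the old value seen differs; Rd = 0 on success,
+ *           1 on failure; the monitor is cleared in either case.
+ */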
+static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
+ TCGv_i32 addr, int size)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ MemOp opc = size | MO_ALIGN | s->be_data;
+
+ s->is_ldex = true;
+
+ if (size == 3) {
+ TCGv_i32 tmp2 = tcg_temp_new_i32();
+ TCGv_i64 t64 = tcg_temp_new_i64();
+
+ /*
+ * For AArch32, architecturally the 32-bit word at the lowest
+ * address is always Rt and the one at addr+4 is Rt2, even if
+ * the CPU is big-endian. That means we don't want to do a
+ * gen_aa32_ld_i64(), which checks SCTLR_B as if for an
+ * architecturally 64-bit access, but instead do a 64-bit access
+ * using MO_BE if appropriate and then split the two halves.
+ */
+ TCGv taddr = gen_aa32_addr(s, addr, opc);
+
+ tcg_gen_qemu_ld_i64(t64, taddr, get_mem_index(s), opc);
+ tcg_temp_free(taddr);
+ tcg_gen_mov_i64(cpu_exclusive_val, t64);
+ if (s->be_data == MO_BE) {
+ tcg_gen_extr_i64_i32(tmp2, tmp, t64);
+ } else {
+ tcg_gen_extr_i64_i32(tmp, tmp2, t64);
+ }
+ tcg_temp_free_i64(t64);
+
+ store_reg(s, rt2, tmp2);
+ } else {
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc);
+ tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
+ }
+
+ store_reg(s, rt, tmp);
+ tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
+}
+
+static void gen_clrex(DisasContext *s)
+{
+ tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+}
+
+static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
+ TCGv_i32 addr, int size)
+{
+ TCGv_i32 t0, t1, t2;
+ TCGv_i64 extaddr;
+ TCGv taddr;
+ TCGLabel *done_label;
+ TCGLabel *fail_label;
+ MemOp opc = size | MO_ALIGN | s->be_data;
+
+ /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
+ [addr] = {Rt};
+ {Rd} = 0;
+ } else {
+ {Rd} = 1;
+ } */
+ fail_label = gen_new_label();
+ done_label = gen_new_label();
+ extaddr = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(extaddr, addr);
+ tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
+ tcg_temp_free_i64(extaddr);
+
+ taddr = gen_aa32_addr(s, addr, opc);
+ t0 = tcg_temp_new_i32();
+ t1 = load_reg(s, rt);
+ if (size == 3) {
+ TCGv_i64 o64 = tcg_temp_new_i64();
+ TCGv_i64 n64 = tcg_temp_new_i64();
+
+ t2 = load_reg(s, rt2);
+
+ /*
+ * For AArch32, architecturally the 32-bit word at the lowest
+ * address is always Rt and the one at addr+4 is Rt2, even if
+ * the CPU is big-endian. Since we're going to treat this as a
+ * single 64-bit BE store, we need to put the two halves in the
+ * opposite order for BE to LE, so that they end up in the right
+ * places. We don't want gen_aa32_st_i64, because that checks
+ * SCTLR_B as if for an architectural 64-bit access.
+ */
+ if (s->be_data == MO_BE) {
+ tcg_gen_concat_i32_i64(n64, t2, t1);
+ } else {
+ tcg_gen_concat_i32_i64(n64, t1, t2);
+ }
+ tcg_temp_free_i32(t2);
+
+ tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64,
+ get_mem_index(s), opc);
+ tcg_temp_free_i64(n64);
+
+ tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val);
+ tcg_gen_extrl_i64_i32(t0, o64);
+
+ tcg_temp_free_i64(o64);
+ } else {
+ t2 = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val);
+ tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc);
+ tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2);
+ tcg_temp_free_i32(t2);
+ }
+ tcg_temp_free_i32(t1);
+ tcg_temp_free(taddr);
+ tcg_gen_mov_i32(cpu_R[rd], t0);
+ tcg_temp_free_i32(t0);
+ tcg_gen_br(done_label);
+
+ gen_set_label(fail_label);
+ tcg_gen_movi_i32(cpu_R[rd], 1);
+ gen_set_label(done_label);
+ tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+}
+
+/* gen_srs:
+ * @s: DisasContext
+ * @mode: mode field from insn (which stack to store to)
+ * @amode: addressing mode (DA/IA/DB/IB), encoded as per P,U bits in ARM insn
+ * @writeback: true if writeback bit set
+ *
+ * Generate code for the SRS (Store Return State) insn.
+ */
+static void gen_srs(DisasContext *s,
+ uint32_t mode, uint32_t amode, bool writeback)
+{
+ int32_t offset;
+ TCGv_i32 addr, tmp;
+ bool undef = false;
+
+ /* SRS is:
+ * - trapped to EL3 if EL3 is AArch64 and we are at Secure EL1
+ * and specified mode is monitor mode
+ * - UNDEFINED in Hyp mode
+ * - UNPREDICTABLE in User or System mode
+ * - UNPREDICTABLE if the specified mode is:
+ * -- not implemented
+ * -- not a valid mode number
+ * -- a mode that's at a higher exception level
+ * -- Monitor, if we are Non-secure
+ * For the UNPREDICTABLE cases we choose to UNDEF.
+ */
+ if (s->current_el == 1 && !s->ns && mode == ARM_CPU_MODE_MON) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syn_uncategorized(), 3);
+ return;
+ }
+
+ if (s->current_el == 0 || s->current_el == 2) {
+ undef = true;
+ }
+
+ switch (mode) {
+ case ARM_CPU_MODE_USR:
+ case ARM_CPU_MODE_FIQ:
+ case ARM_CPU_MODE_IRQ:
+ case ARM_CPU_MODE_SVC:
+ case ARM_CPU_MODE_ABT:
+ case ARM_CPU_MODE_UND:
+ case ARM_CPU_MODE_SYS:
+ break;
+ case ARM_CPU_MODE_HYP:
+ if (s->current_el == 1 || !arm_dc_feature(s, ARM_FEATURE_EL2)) {
+ undef = true;
+ }
+ break;
+ case ARM_CPU_MODE_MON:
+ /* No need to check specifically for "are we non-secure" because
+ * we've already made EL0 UNDEF and handled the trap for S-EL1;
+ * so if this isn't EL3 then we must be non-secure.
+ */
+ if (s->current_el != 3) {
+ undef = true;
+ }
+ break;
+ default:
+ undef = true;
+ }
+
+ if (undef) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ addr = tcg_temp_new_i32();
+ /* get_r13_banked() will raise an exception if called from System mode */
+ gen_set_condexec(s);
+ gen_update_pc(s, 0);
+ gen_helper_get_r13_banked(addr, cpu_env, tcg_constant_i32(mode));
+ switch (amode) {
+ case 0: /* DA */
+ offset = -4;
+ break;
+ case 1: /* IA */
+ offset = 0;
+ break;
+ case 2: /* DB */
+ offset = -8;
+ break;
+ case 3: /* IB */
+ offset = 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_gen_addi_i32(addr, addr, offset);
+ tmp = load_reg(s, 14);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+ tmp = load_cpu_field(spsr);
+ tcg_gen_addi_i32(addr, addr, 4);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+ if (writeback) {
+ switch (amode) {
+ case 0:
+ offset = -8;
+ break;
+ case 1:
+ offset = 4;
+ break;
+ case 2:
+ offset = -4;
+ break;
+ case 3:
+ offset = 0;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_gen_addi_i32(addr, addr, offset);
+ gen_helper_set_r13_banked(cpu_env, tcg_constant_i32(mode), addr);
+ }
+ tcg_temp_free_i32(addr);
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+}
+
+/* Skip this instruction if the ARM condition is false */
+static void arm_skip_unless(DisasContext *s, uint32_t cond)
+{
+ arm_gen_condlabel(s);
+ arm_gen_test_cc(cond ^ 1, s->condlabel.label);
+}
+
+
+/*
+ * Constant expanders used by T16/T32 decode
+ */
+
+/* Return only the rotation part of T32ExpandImm. */
+static int t32_expandimm_rot(DisasContext *s, int x)
+{
+ return x & 0xc00 ? extract32(x, 7, 5) : 0;
+}
+
+/* Return the unrotated immediate from T32ExpandImm. */
+static int t32_expandimm_imm(DisasContext *s, int x)
+{
+ int imm = extract32(x, 0, 8);
+
+ switch (extract32(x, 8, 4)) {
+ case 0: /* XY */
+ /* Nothing to do. */
+ break;
+ case 1: /* 00XY00XY */
+ imm *= 0x00010001;
+ break;
+ case 2: /* XY00XY00 */
+ imm *= 0x01000100;
+ break;
+ case 3: /* XYXYXYXY */
+ imm *= 0x01010101;
+ break;
+ default:
+ /* Rotated constant. */
+ imm |= 0x80;
+ break;
+ }
+ return imm;
+}
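+
+/*
+ * Worked example of the two T32ExpandImm expanders above, for the
+ * encoded 12-bit value 0x4ab: bits [11:10] are non-zero, so this is a
+ * rotated constant; t32_expandimm_rot() returns extract32(0x4ab, 7, 5)
+ * == 9 and t32_expandimm_imm() returns 0xab | 0x80 == 0xab, so the
+ * expanded immediate is ror32(0xab, 9) == 0x55800000. For 0x2ab
+ * (bits [11:10] zero, pattern 2) there is no rotation and the byte is
+ * replicated as 0xab00ab00.
+ */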
+
+static int t32_branch24(DisasContext *s, int x)
+{
+ /* Convert J1:J2 at x[22:21] to I2:I1, which involves I=J^~S. */
+ x ^= !(x < 0) * (3 << 21);
+ /* Append the final zero. */
+ return x << 1;
+}
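+
+/*
+ * In the T32 B/BL encodings the branch offset is S:I1:I2:imm10:imm11:'0'
+ * with I1 = NOT(J1 EOR S) and I2 = NOT(J2 EOR S). The decode field has
+ * already assembled S:J1:J2:imm10:imm11 as a signed value, so
+ * t32_branch24() only has to toggle the two J bits when S is clear
+ * (x >= 0) and shift in the implicit low zero of the halfword-aligned
+ * offset.
+ */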
+
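+/*
+ * 16-bit Thumb data-processing instructions set the flags only when
+ * executed outside an IT block.
+ */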
+static int t16_setflags(DisasContext *s)
+{
+ return s->condexec_mask == 0;
+}
+
+static int t16_push_list(DisasContext *s, int x)
+{
+ return (x & 0xff) | (x & 0x100) << (14 - 8);
+}
+
+static int t16_pop_list(DisasContext *s, int x)
+{
+ return (x & 0xff) | (x & 0x100) << (15 - 8);
+}
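+
+/*
+ * In the T16 PUSH/POP encodings, bit 8 of the insn adds LR (for PUSH)
+ * or PC (for POP) to the low registers in bits [7:0]. For example,
+ * x == 0x103 expands to a register list of 0x4003 {r0, r1, lr} for
+ * PUSH and 0x8003 {r0, r1, pc} for POP.
+ */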
+
+/*
+ * Include the generated decoders.
+ */
+
+#include "decode-a32.c.inc"
+#include "decode-a32-uncond.c.inc"
+#include "decode-t32.c.inc"
+#include "decode-t16.c.inc"
+
+static bool valid_cp(DisasContext *s, int cp)
+{
+ /*
+ * Return true if this coprocessor field indicates something
+ * that's really a possible coprocessor.
+ * For v7 and earlier, coprocessors 8..15 were reserved for Arm use,
+ * and of those only cp14 and cp15 were used for registers.
+ * cp10 and cp11 were used for VFP and Neon, whose decode is
+ * dealt with elsewhere. With the advent of fp16, cp9 is also
+ * now part of VFP.
+ * For v8A and later, the encoding has been tightened so that
+ * only cp14 and cp15 are valid, and other values aren't considered
+ * to be in the coprocessor-instruction space at all. v8M still
+ * permits coprocessors 0..7.
+ * For XScale, we must not decode the XScale cp0, cp1 space as
+ * a standard coprocessor insn, because we want to fall through to
+ * the legacy disas_xscale_insn() decoder after decodetree is done.
+ */
+ if (arm_dc_feature(s, ARM_FEATURE_XSCALE) && (cp == 0 || cp == 1)) {
+ return false;
+ }
+
+ if (arm_dc_feature(s, ARM_FEATURE_V8) &&
+ !arm_dc_feature(s, ARM_FEATURE_M)) {
+ return cp >= 14;
+ }
+ return cp < 8 || cp >= 14;
+}
+
+static bool trans_MCR(DisasContext *s, arg_MCR *a)
+{
+ if (!valid_cp(s, a->cp)) {
+ return false;
+ }
+ do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2,
+ false, a->rt, 0);
+ return true;
+}
+
+static bool trans_MRC(DisasContext *s, arg_MRC *a)
+{
+ if (!valid_cp(s, a->cp)) {
+ return false;
+ }
+ do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2,
+ true, a->rt, 0);
+ return true;
+}
+
+static bool trans_MCRR(DisasContext *s, arg_MCRR *a)
+{
+ if (!valid_cp(s, a->cp)) {
+ return false;
+ }
+ do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0,
+ false, a->rt, a->rt2);
+ return true;
+}
+
+static bool trans_MRRC(DisasContext *s, arg_MRRC *a)
+{
+ if (!valid_cp(s, a->cp)) {
+ return false;
+ }
+ do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0,
+ true, a->rt, a->rt2);
+ return true;
+}
+
+/* Helpers to swap operands for reverse-subtract. */
+static void gen_rsb(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b)
+{
+ tcg_gen_sub_i32(dst, b, a);
+}
+
+static void gen_rsb_CC(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_sub_CC(dst, b, a);
+}
+
+static void gen_rsc(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_sub_carry(dest, b, a);
+}
+
+static void gen_rsc_CC(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b)
+{
+ gen_sbc_CC(dest, b, a);
+}
+
+/*
+ * Helpers for the data processing routines.
+ *
+ * After the computation, store the result back.
+ * This may be suppressed altogether (STREG_NONE), require a runtime
+ * check against the stack limits (STREG_SP_CHECK), generate an
+ * exception return (STREG_EXC_RET), or simply store into a register
+ * (STREG_NORMAL).
+ *
+ * Always return true, indicating success for a trans_* function.
+ */
+typedef enum {
+ STREG_NONE,
+ STREG_NORMAL,
+ STREG_SP_CHECK,
+ STREG_EXC_RET,
+} StoreRegKind;
+
+static bool store_reg_kind(DisasContext *s, int rd,
+ TCGv_i32 val, StoreRegKind kind)
+{
+ switch (kind) {
+ case STREG_NONE:
+ tcg_temp_free_i32(val);
+ return true;
+ case STREG_NORMAL:
+ /* See ALUWritePC: Interworking only from a32 mode. */
+ if (s->thumb) {
+ store_reg(s, rd, val);
+ } else {
+ store_reg_bx(s, rd, val);
+ }
+ return true;
+ case STREG_SP_CHECK:
+ store_sp_checked(s, val);
+ return true;
+ case STREG_EXC_RET:
+ gen_exception_return(s, val);
+ return true;
+ }
+ g_assert_not_reached();
+}
+
+/*
+ * Data Processing (register)
+ *
+ * Operate, with set flags, one register source,
+ * one immediate shifted register source, and a destination.
+ */
+static bool op_s_rrr_shi(DisasContext *s, arg_s_rrr_shi *a,
+ void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp1, tmp2;
+
+ tmp2 = load_reg(s, a->rm);
+ gen_arm_shift_im(tmp2, a->shty, a->shim, logic_cc);
+ tmp1 = load_reg(s, a->rn);
+
+ gen(tmp1, tmp1, tmp2);
+ tcg_temp_free_i32(tmp2);
+
+ if (logic_cc) {
+ gen_logic_CC(tmp1);
+ }
+ return store_reg_kind(s, a->rd, tmp1, kind);
+}
+
+static bool op_s_rxr_shi(DisasContext *s, arg_s_rrr_shi *a,
+ void (*gen)(TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp;
+
+ tmp = load_reg(s, a->rm);
+ gen_arm_shift_im(tmp, a->shty, a->shim, logic_cc);
+
+ gen(tmp, tmp);
+ if (logic_cc) {
+ gen_logic_CC(tmp);
+ }
+ return store_reg_kind(s, a->rd, tmp, kind);
+}
+
+/*
+ * Data-processing (register-shifted register)
+ *
+ * Operate, with set flags, one register source,
+ * one register shifted register source, and a destination.
+ */
+static bool op_s_rrr_shr(DisasContext *s, arg_s_rrr_shr *a,
+ void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp1, tmp2;
+
+ tmp1 = load_reg(s, a->rs);
+ tmp2 = load_reg(s, a->rm);
+ gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc);
+ tmp1 = load_reg(s, a->rn);
+
+ gen(tmp1, tmp1, tmp2);
+ tcg_temp_free_i32(tmp2);
+
+ if (logic_cc) {
+ gen_logic_CC(tmp1);
+ }
+ return store_reg_kind(s, a->rd, tmp1, kind);
+}
+
+static bool op_s_rxr_shr(DisasContext *s, arg_s_rrr_shr *a,
+ void (*gen)(TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp1, tmp2;
+
+ tmp1 = load_reg(s, a->rs);
+ tmp2 = load_reg(s, a->rm);
+ gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc);
+
+ gen(tmp2, tmp2);
+ if (logic_cc) {
+ gen_logic_CC(tmp2);
+ }
+ return store_reg_kind(s, a->rd, tmp2, kind);
+}
+
+/*
+ * Data-processing (immediate)
+ *
+ * Operate, with set flags, one register source,
+ * one rotated immediate, and a destination.
+ *
+ * Note that logic_cc && a->rot setting CF based on the msb of the
+ * immediate is the reason why we must pass in the unrotated form
+ * of the immediate.
+ */
+static bool op_s_rri_rot(DisasContext *s, arg_s_rri_rot *a,
+ void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp1;
+ uint32_t imm;
+
+ imm = ror32(a->imm, a->rot);
+ if (logic_cc && a->rot) {
+ tcg_gen_movi_i32(cpu_CF, imm >> 31);
+ }
+ tmp1 = load_reg(s, a->rn);
+
+ gen(tmp1, tmp1, tcg_constant_i32(imm));
+
+ if (logic_cc) {
+ gen_logic_CC(tmp1);
+ }
+ return store_reg_kind(s, a->rd, tmp1, kind);
+}
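+
+/*
+ * For example, with a->imm == 0xff and a->rot == 8 the expanded
+ * constant is ror32(0xff, 8) == 0xff000000, and a flag-setting logical
+ * operation additionally sets CF to bit 31 of that value, i.e. to the
+ * carry-out of the immediate expansion.
+ */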
+
+static bool op_s_rxi_rot(DisasContext *s, arg_s_rri_rot *a,
+ void (*gen)(TCGv_i32, TCGv_i32),
+ int logic_cc, StoreRegKind kind)
+{
+ TCGv_i32 tmp;
+ uint32_t imm;
+
+ imm = ror32(a->imm, a->rot);
+ if (logic_cc && a->rot) {
+ tcg_gen_movi_i32(cpu_CF, imm >> 31);
+ }
+
+ tmp = tcg_temp_new_i32();
+ gen(tmp, tcg_constant_i32(imm));
+
+ if (logic_cc) {
+ gen_logic_CC(tmp);
+ }
+ return store_reg_kind(s, a->rd, tmp, kind);
+}
+
+#define DO_ANY3(NAME, OP, L, K) \
+ static bool trans_##NAME##_rrri(DisasContext *s, arg_s_rrr_shi *a) \
+ { StoreRegKind k = (K); return op_s_rrr_shi(s, a, OP, L, k); } \
+ static bool trans_##NAME##_rrrr(DisasContext *s, arg_s_rrr_shr *a) \
+ { StoreRegKind k = (K); return op_s_rrr_shr(s, a, OP, L, k); } \
+ static bool trans_##NAME##_rri(DisasContext *s, arg_s_rri_rot *a) \
+ { StoreRegKind k = (K); return op_s_rri_rot(s, a, OP, L, k); }
+
+#define DO_ANY2(NAME, OP, L, K) \
+ static bool trans_##NAME##_rxri(DisasContext *s, arg_s_rrr_shi *a) \
+ { StoreRegKind k = (K); return op_s_rxr_shi(s, a, OP, L, k); } \
+ static bool trans_##NAME##_rxrr(DisasContext *s, arg_s_rrr_shr *a) \
+ { StoreRegKind k = (K); return op_s_rxr_shr(s, a, OP, L, k); } \
+ static bool trans_##NAME##_rxi(DisasContext *s, arg_s_rri_rot *a) \
+ { StoreRegKind k = (K); return op_s_rxi_rot(s, a, OP, L, k); }
+
+#define DO_CMP2(NAME, OP, L) \
+ static bool trans_##NAME##_xrri(DisasContext *s, arg_s_rrr_shi *a) \
+ { return op_s_rrr_shi(s, a, OP, L, STREG_NONE); } \
+ static bool trans_##NAME##_xrrr(DisasContext *s, arg_s_rrr_shr *a) \
+ { return op_s_rrr_shr(s, a, OP, L, STREG_NONE); } \
+ static bool trans_##NAME##_xri(DisasContext *s, arg_s_rri_rot *a) \
+ { return op_s_rri_rot(s, a, OP, L, STREG_NONE); }
+
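+/*
+ * As a rough illustration of the macros below,
+ * DO_ANY3(AND, tcg_gen_and_i32, a->s, STREG_NORMAL) expands to three
+ * trans_* functions, the first of which is:
+ *
+ *   static bool trans_AND_rrri(DisasContext *s, arg_s_rrr_shi *a)
+ *   {
+ *       StoreRegKind k = (STREG_NORMAL);
+ *       return op_s_rrr_shi(s, a, tcg_gen_and_i32, a->s, k);
+ *   }
+ *
+ * i.e. one function per immediate-shift, register-shift and
+ * rotated-immediate form of each data-processing insn.
+ */
+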
+DO_ANY3(AND, tcg_gen_and_i32, a->s, STREG_NORMAL)
+DO_ANY3(EOR, tcg_gen_xor_i32, a->s, STREG_NORMAL)
+DO_ANY3(ORR, tcg_gen_or_i32, a->s, STREG_NORMAL)
+DO_ANY3(BIC, tcg_gen_andc_i32, a->s, STREG_NORMAL)
+
+DO_ANY3(RSB, a->s ? gen_rsb_CC : gen_rsb, false, STREG_NORMAL)
+DO_ANY3(ADC, a->s ? gen_adc_CC : gen_add_carry, false, STREG_NORMAL)
+DO_ANY3(SBC, a->s ? gen_sbc_CC : gen_sub_carry, false, STREG_NORMAL)
+DO_ANY3(RSC, a->s ? gen_rsc_CC : gen_rsc, false, STREG_NORMAL)
+
+DO_CMP2(TST, tcg_gen_and_i32, true)
+DO_CMP2(TEQ, tcg_gen_xor_i32, true)
+DO_CMP2(CMN, gen_add_CC, false)
+DO_CMP2(CMP, gen_sub_CC, false)
+
+DO_ANY3(ADD, a->s ? gen_add_CC : tcg_gen_add_i32, false,
+ a->rd == 13 && a->rn == 13 ? STREG_SP_CHECK : STREG_NORMAL)
+
+/*
+ * Note that in the computation of StoreRegKind we may return out of
+ * the middle of the functions expanded by DO_ANY3, and that we modify
+ * a->s via the K parameter before it is used by OP.
+ */
+DO_ANY3(SUB, a->s ? gen_sub_CC : tcg_gen_sub_i32, false,
+ ({
+ StoreRegKind ret = STREG_NORMAL;
+ if (a->rd == 15 && a->s) {
+ /*
+ * See ALUExceptionReturn:
+ * In User mode, UNPREDICTABLE; we choose UNDEF.
+ * In Hyp mode, UNDEFINED.
+ */
+ if (IS_USER(s) || s->current_el == 2) {
+ unallocated_encoding(s);
+ return true;
+ }
+ /* There is no writeback of nzcv to PSTATE. */
+ a->s = 0;
+ ret = STREG_EXC_RET;
+ } else if (a->rd == 13 && a->rn == 13) {
+ ret = STREG_SP_CHECK;
+ }
+ ret;
+ }))
+
+DO_ANY2(MOV, tcg_gen_mov_i32, a->s,
+ ({
+ StoreRegKind ret = STREG_NORMAL;
+ if (a->rd == 15 && a->s) {
+ /*
+ * See ALUExceptionReturn:
+ * In User mode, UNPREDICTABLE; we choose UNDEF.
+ * In Hyp mode, UNDEFINED.
+ */
+ if (IS_USER(s) || s->current_el == 2) {
+ unallocated_encoding(s);
+ return true;
+ }
+ /* There is no writeback of nzcv to PSTATE. */
+ a->s = 0;
+ ret = STREG_EXC_RET;
+ } else if (a->rd == 13) {
+ ret = STREG_SP_CHECK;
+ }
+ ret;
+ }))
+
+DO_ANY2(MVN, tcg_gen_not_i32, a->s, STREG_NORMAL)
+
+/*
+ * ORN is only available with T32, so there is no register-shifted-register
+ * form of the insn. Using the DO_ANY3 macro would create an unused function.
+ */
+static bool trans_ORN_rrri(DisasContext *s, arg_s_rrr_shi *a)
+{
+ return op_s_rrr_shi(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL);
+}
+
+static bool trans_ORN_rri(DisasContext *s, arg_s_rri_rot *a)
+{
+ return op_s_rri_rot(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL);
+}
+
+#undef DO_ANY3
+#undef DO_ANY2
+#undef DO_CMP2
+
+static bool trans_ADR(DisasContext *s, arg_ri *a)
+{
+ store_reg_bx(s, a->rd, add_reg_for_lit(s, 15, a->imm));
+ return true;
+}
+
+static bool trans_MOVW(DisasContext *s, arg_MOVW *a)
+{
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+
+ store_reg(s, a->rd, tcg_constant_i32(a->imm));
+ return true;
+}
+
+static bool trans_MOVT(DisasContext *s, arg_MOVW *a)
+{
+ TCGv_i32 tmp;
+
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+
+ tmp = load_reg(s, a->rd);
+ tcg_gen_ext16u_i32(tmp, tmp);
+ tcg_gen_ori_i32(tmp, tmp, a->imm << 16);
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+/*
+ * v8.1M MVE wide-shifts
+ */
+static bool do_mve_shl_ri(DisasContext *s, arg_mve_shl_ri *a,
+ WideShiftImmFn *fn)
+{
+ TCGv_i64 rda;
+ TCGv_i32 rdalo, rdahi;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (a->rdahi == 15) {
+ /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rdahi == 13) {
+ /* RdaHi == 13 is UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ if (a->shim == 0) {
+ a->shim = 32;
+ }
+
+ rda = tcg_temp_new_i64();
+ rdalo = load_reg(s, a->rdalo);
+ rdahi = load_reg(s, a->rdahi);
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+
+ fn(rda, rda, a->shim);
+
+ tcg_gen_extrl_i64_i32(rdalo, rda);
+ tcg_gen_extrh_i64_i32(rdahi, rda);
+ store_reg(s, a->rdalo, rdalo);
+ store_reg(s, a->rdahi, rdahi);
+ tcg_temp_free_i64(rda);
+
+ return true;
+}
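+
+/*
+ * RdaHi:RdaLo form a single 64-bit operand here: for instance, LSLL by
+ * 4 with rdahi == 0x00000001 and rdalo == 0x80000000 shifts
+ * 0x0000000180000000 and writes back rdahi == 0x00000018,
+ * rdalo == 0x00000000.
+ */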
+
+static bool trans_ASRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, tcg_gen_sari_i64);
+}
+
+static bool trans_LSLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, tcg_gen_shli_i64);
+}
+
+static bool trans_LSRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, tcg_gen_shri_i64);
+}
+
+static void gen_mve_sqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift)
+{
+ gen_helper_mve_sqshll(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_SQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, gen_mve_sqshll);
+}
+
+static void gen_mve_uqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift)
+{
+ gen_helper_mve_uqshll(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_UQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, gen_mve_uqshll);
+}
+
+static bool trans_SRSHRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, gen_srshr64_i64);
+}
+
+static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+ return do_mve_shl_ri(s, a, gen_urshr64_i64);
+}
+
+static bool do_mve_shl_rr(DisasContext *s, arg_mve_shl_rr *a, WideShiftFn *fn)
+{
+ TCGv_i64 rda;
+ TCGv_i32 rdalo, rdahi;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (a->rdahi == 15) {
+ /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rdahi == 13 || a->rm == 13 || a->rm == 15 ||
+ a->rm == a->rdahi || a->rm == a->rdalo) {
+ /* These rdahi/rdalo/rm cases are UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ rda = tcg_temp_new_i64();
+ rdalo = load_reg(s, a->rdalo);
+ rdahi = load_reg(s, a->rdahi);
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+
+ /* The helper takes care of the sign-extension of the low 8 bits of Rm */
+ fn(rda, cpu_env, rda, cpu_R[a->rm]);
+
+ tcg_gen_extrl_i64_i32(rdalo, rda);
+ tcg_gen_extrh_i64_i32(rdahi, rda);
+ store_reg(s, a->rdalo, rdalo);
+ store_reg(s, a->rdahi, rdahi);
+ tcg_temp_free_i64(rda);
+
+ return true;
+}
+
+static bool trans_LSLL_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_ushll);
+}
+
+static bool trans_ASRL_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_sshrl);
+}
+
+static bool trans_UQRSHLL64_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll);
+}
+
+static bool trans_SQRSHRL64_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl);
+}
+
+static bool trans_UQRSHLL48_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll48);
+}
+
+static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a)
+{
+ return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48);
+}
+
+static bool do_mve_sh_ri(DisasContext *s, arg_mve_sh_ri *a, ShiftImmFn *fn)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rda == 13 || a->rda == 15) {
+ /* These rda cases are UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ if (a->shim == 0) {
+ a->shim = 32;
+ }
+ fn(cpu_R[a->rda], cpu_R[a->rda], a->shim);
+
+ return true;
+}
+
+static bool trans_URSHR_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_urshr32_i32);
+}
+
+static bool trans_SRSHR_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_srshr32_i32);
+}
+
+static void gen_mve_sqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift)
+{
+ gen_helper_mve_sqshl(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_SQSHL_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_mve_sqshl);
+}
+
+static void gen_mve_uqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift)
+{
+ gen_helper_mve_uqshl(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_mve_uqshl);
+}
+
+static bool do_mve_sh_rr(DisasContext *s, arg_mve_sh_rr *a, ShiftFn *fn)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rda == 13 || a->rda == 15 || a->rm == 13 || a->rm == 15 ||
+ a->rm == a->rda) {
+ /* These rda/rm cases are UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ /* The helper takes care of the sign-extension of the low 8 bits of Rm */
+ fn(cpu_R[a->rda], cpu_env, cpu_R[a->rda], cpu_R[a->rm]);
+ return true;
+}
+
+static bool trans_SQRSHR_rr(DisasContext *s, arg_mve_sh_rr *a)
+{
+ return do_mve_sh_rr(s, a, gen_helper_mve_sqrshr);
+}
+
+static bool trans_UQRSHL_rr(DisasContext *s, arg_mve_sh_rr *a)
+{
+ return do_mve_sh_rr(s, a, gen_helper_mve_uqrshl);
+}
+
+/*
+ * Multiply and multiply accumulate
+ */
+
+static bool op_mla(DisasContext *s, arg_s_rrrr *a, bool add)
+{
+ TCGv_i32 t1, t2;
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ tcg_gen_mul_i32(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+ if (add) {
+ t2 = load_reg(s, a->ra);
+ tcg_gen_add_i32(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+ }
+ if (a->s) {
+ gen_logic_CC(t1);
+ }
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+static bool trans_MUL(DisasContext *s, arg_MUL *a)
+{
+ return op_mla(s, a, false);
+}
+
+static bool trans_MLA(DisasContext *s, arg_MLA *a)
+{
+ return op_mla(s, a, true);
+}
+
+static bool trans_MLS(DisasContext *s, arg_MLS *a)
+{
+ TCGv_i32 t1, t2;
+
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ tcg_gen_mul_i32(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+ t2 = load_reg(s, a->ra);
+ tcg_gen_sub_i32(t1, t2, t1);
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+static bool op_mlal(DisasContext *s, arg_s_rrrr *a, bool uns, bool add)
+{
+ TCGv_i32 t0, t1, t2, t3;
+
+ t0 = load_reg(s, a->rm);
+ t1 = load_reg(s, a->rn);
+ if (uns) {
+ tcg_gen_mulu2_i32(t0, t1, t0, t1);
+ } else {
+ tcg_gen_muls2_i32(t0, t1, t0, t1);
+ }
+ if (add) {
+ t2 = load_reg(s, a->ra);
+ t3 = load_reg(s, a->rd);
+ tcg_gen_add2_i32(t0, t1, t0, t1, t2, t3);
+ tcg_temp_free_i32(t2);
+ tcg_temp_free_i32(t3);
+ }
+ if (a->s) {
+ gen_logicq_cc(t0, t1);
+ }
+ store_reg(s, a->ra, t0);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+static bool trans_UMULL(DisasContext *s, arg_UMULL *a)
+{
+ return op_mlal(s, a, true, false);
+}
+
+static bool trans_SMULL(DisasContext *s, arg_SMULL *a)
+{
+ return op_mlal(s, a, false, false);
+}
+
+static bool trans_UMLAL(DisasContext *s, arg_UMLAL *a)
+{
+ return op_mlal(s, a, true, true);
+}
+
+static bool trans_SMLAL(DisasContext *s, arg_SMLAL *a)
+{
+ return op_mlal(s, a, false, true);
+}
+
+static bool trans_UMAAL(DisasContext *s, arg_UMAAL *a)
+{
+ TCGv_i32 t0, t1, t2, zero;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rm);
+ t1 = load_reg(s, a->rn);
+ tcg_gen_mulu2_i32(t0, t1, t0, t1);
+ zero = tcg_constant_i32(0);
+ t2 = load_reg(s, a->ra);
+ tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero);
+ tcg_temp_free_i32(t2);
+ t2 = load_reg(s, a->rd);
+ tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero);
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->ra, t0);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+/*
+ * Saturating addition and subtraction
+ */
+
+static bool op_qaddsub(DisasContext *s, arg_rrr *a, bool add, bool doub)
+{
+ TCGv_i32 t0, t1;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_5TE) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rm);
+ t1 = load_reg(s, a->rn);
+ if (doub) {
+ gen_helper_add_saturate(t1, cpu_env, t1, t1);
+ }
+ if (add) {
+ gen_helper_add_saturate(t0, cpu_env, t0, t1);
+ } else {
+ gen_helper_sub_saturate(t0, cpu_env, t0, t1);
+ }
+ tcg_temp_free_i32(t1);
+ store_reg(s, a->rd, t0);
+ return true;
+}
+
+#define DO_QADDSUB(NAME, ADD, DOUB) \
+static bool trans_##NAME(DisasContext *s, arg_rrr *a) \
+{ \
+ return op_qaddsub(s, a, ADD, DOUB); \
+}
+
+DO_QADDSUB(QADD, true, false)
+DO_QADDSUB(QSUB, false, false)
+DO_QADDSUB(QDADD, true, true)
+DO_QADDSUB(QDSUB, false, true)
+
+#undef DO_QADDSUB
+
+/*
+ * Halfword multiply and multiply accumulate
+ */
+
+static bool op_smlaxxx(DisasContext *s, arg_rrrr *a,
+ int add_long, bool nt, bool mt)
+{
+ TCGv_i32 t0, t1, tl, th;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_5TE) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rn);
+ t1 = load_reg(s, a->rm);
+ gen_mulxy(t0, t1, nt, mt);
+ tcg_temp_free_i32(t1);
+
+ switch (add_long) {
+ case 0:
+ store_reg(s, a->rd, t0);
+ break;
+ case 1:
+ t1 = load_reg(s, a->ra);
+ gen_helper_add_setq(t0, cpu_env, t0, t1);
+ tcg_temp_free_i32(t1);
+ store_reg(s, a->rd, t0);
+ break;
+ case 2:
+ tl = load_reg(s, a->ra);
+ th = load_reg(s, a->rd);
+ /* Sign-extend the 32-bit product to 64 bits. */
+ t1 = tcg_temp_new_i32();
+ tcg_gen_sari_i32(t1, t0, 31);
+ tcg_gen_add2_i32(tl, th, tl, th, t0, t1);
+ tcg_temp_free_i32(t0);
+ tcg_temp_free_i32(t1);
+ store_reg(s, a->ra, tl);
+ store_reg(s, a->rd, th);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return true;
+}
+
+#define DO_SMLAX(NAME, add, nt, mt) \
+static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \
+{ \
+ return op_smlaxxx(s, a, add, nt, mt); \
+}
+
+DO_SMLAX(SMULBB, 0, 0, 0)
+DO_SMLAX(SMULBT, 0, 0, 1)
+DO_SMLAX(SMULTB, 0, 1, 0)
+DO_SMLAX(SMULTT, 0, 1, 1)
+
+DO_SMLAX(SMLABB, 1, 0, 0)
+DO_SMLAX(SMLABT, 1, 0, 1)
+DO_SMLAX(SMLATB, 1, 1, 0)
+DO_SMLAX(SMLATT, 1, 1, 1)
+
+DO_SMLAX(SMLALBB, 2, 0, 0)
+DO_SMLAX(SMLALBT, 2, 0, 1)
+DO_SMLAX(SMLALTB, 2, 1, 0)
+DO_SMLAX(SMLALTT, 2, 1, 1)
+
+#undef DO_SMLAX
+
+static bool op_smlawx(DisasContext *s, arg_rrrr *a, bool add, bool mt)
+{
+ TCGv_i32 t0, t1;
+
+ if (!ENABLE_ARCH_5TE) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rn);
+ t1 = load_reg(s, a->rm);
+ /*
+ * Since the nominal result is product<47:16>, shift the 16-bit
+ * input up by 16 bits, so that the result is at product<63:32>.
+ */
+ if (mt) {
+ tcg_gen_andi_i32(t1, t1, 0xffff0000);
+ } else {
+ tcg_gen_shli_i32(t1, t1, 16);
+ }
+ tcg_gen_muls2_i32(t0, t1, t0, t1);
+ tcg_temp_free_i32(t0);
+ if (add) {
+ t0 = load_reg(s, a->ra);
+ gen_helper_add_setq(t1, cpu_env, t1, t0);
+ tcg_temp_free_i32(t0);
+ }
+ store_reg(s, a->rd, t1);
+ return true;
+}
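+
+/*
+ * The pre-shift above relies on (a * b) >> 16 == (a * (b << 16)) >> 32
+ * for a 16-bit b: SMULW<B|T> wants bits [47:16] of the 48-bit product,
+ * and after the pre-shift those are exactly the high half returned by
+ * tcg_gen_muls2_i32().
+ */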
+
+#define DO_SMLAWX(NAME, add, mt) \
+static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \
+{ \
+ return op_smlawx(s, a, add, mt); \
+}
+
+DO_SMLAWX(SMULWB, 0, 0)
+DO_SMLAWX(SMULWT, 0, 1)
+DO_SMLAWX(SMLAWB, 1, 0)
+DO_SMLAWX(SMLAWT, 1, 1)
+
+#undef DO_SMLAWX
+
+/*
+ * MSR (immediate) and hints
+ */
+
+static bool trans_YIELD(DisasContext *s, arg_YIELD *a)
+{
+ /*
+ * When running single-threaded TCG code, use the helper to ensure that
+ * the next round-robin scheduled vCPU gets a crack. When running in
+ * MTTCG we don't generate jumps to the helper as it won't affect the
+ * scheduling of other vCPUs.
+ */
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_YIELD;
+ }
+ return true;
+}
+
+static bool trans_WFE(DisasContext *s, arg_WFE *a)
+{
+ /*
+ * When running single-threaded TCG code, use the helper to ensure that
+ * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we
+ * just skip this instruction. Currently the SEV/SEVL instructions,
+ * which are *one* of many ways to wake the CPU from WFE, are not
+ * implemented so we can't sleep like WFI does.
+ */
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_WFE;
+ }
+ return true;
+}
+
+static bool trans_WFI(DisasContext *s, arg_WFI *a)
+{
+ /* For WFI, halt the vCPU until an IRQ. */
+ gen_update_pc(s, curr_insn_len(s));
+ s->base.is_jmp = DISAS_WFI;
+ return true;
+}
+
+static bool trans_ESB(DisasContext *s, arg_ESB *a)
+{
+ /*
+ * For M-profile, minimal-RAS ESB can be a NOP.
+ * Without RAS, we must implement this as NOP.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_M) && dc_isar_feature(aa32_ras, s)) {
+ /*
+ * QEMU does not have a source of physical SErrors,
+ * so we are only concerned with virtual SErrors.
+ * The pseudocode in the ARM for this case is
+ * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then
+ * AArch32.vESBOperation();
+ * Most of the condition can be evaluated at translation time.
+ * Test for EL2 present, and defer test for SEL2 to runtime.
+ */
+ if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) {
+ gen_helper_vesb(cpu_env);
+ }
+ }
+ return true;
+}
+
+static bool trans_NOP(DisasContext *s, arg_NOP *a)
+{
+ return true;
+}
+
+static bool trans_MSR_imm(DisasContext *s, arg_MSR_imm *a)
+{
+ uint32_t val = ror32(a->imm, a->rot * 2);
+ uint32_t mask = msr_mask(s, a->mask, a->r);
+
+ if (gen_set_psr_im(s, mask, a->r, val)) {
+ unallocated_encoding(s);
+ }
+ return true;
+}
+
+/*
+ * Cyclic Redundancy Check
+ */
+
+static bool op_crc32(DisasContext *s, arg_rrr *a, bool c, MemOp sz)
+{
+ TCGv_i32 t1, t2, t3;
+
+ if (!dc_isar_feature(aa32_crc32, s)) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ switch (sz) {
+ case MO_8:
+ gen_uxtb(t2);
+ break;
+ case MO_16:
+ gen_uxth(t2);
+ break;
+ case MO_32:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ t3 = tcg_constant_i32(1 << sz);
+ if (c) {
+ gen_helper_crc32c(t1, t1, t2, t3);
+ } else {
+ gen_helper_crc32(t1, t1, t2, t3);
+ }
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+#define DO_CRC32(NAME, c, sz) \
+static bool trans_##NAME(DisasContext *s, arg_rrr *a) \
+ { return op_crc32(s, a, c, sz); }
+
+DO_CRC32(CRC32B, false, MO_8)
+DO_CRC32(CRC32H, false, MO_16)
+DO_CRC32(CRC32W, false, MO_32)
+DO_CRC32(CRC32CB, true, MO_8)
+DO_CRC32(CRC32CH, true, MO_16)
+DO_CRC32(CRC32CW, true, MO_32)
+
+#undef DO_CRC32
+
+/*
+ * Miscellaneous instructions
+ */
+
+static bool trans_MRS_bank(DisasContext *s, arg_MRS_bank *a)
+{
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ gen_mrs_banked(s, a->r, a->sysm, a->rd);
+ return true;
+}
+
+static bool trans_MSR_bank(DisasContext *s, arg_MSR_bank *a)
+{
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ gen_msr_banked(s, a->r, a->sysm, a->rn);
+ return true;
+}
+
+static bool trans_MRS_reg(DisasContext *s, arg_MRS_reg *a)
+{
+ TCGv_i32 tmp;
+
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (a->r) {
+ if (IS_USER(s)) {
+ unallocated_encoding(s);
+ return true;
+ }
+ tmp = load_cpu_field(spsr);
+ } else {
+ tmp = tcg_temp_new_i32();
+ gen_helper_cpsr_read(tmp, cpu_env);
+ }
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_MSR_reg(DisasContext *s, arg_MSR_reg *a)
+{
+ TCGv_i32 tmp;
+ uint32_t mask = msr_mask(s, a->mask, a->r);
+
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ tmp = load_reg(s, a->rn);
+ if (gen_set_psr(s, mask, a->r, tmp)) {
+ unallocated_encoding(s);
+ }
+ return true;
+}
+
+static bool trans_MRS_v7m(DisasContext *s, arg_MRS_v7m *a)
+{
+ TCGv_i32 tmp;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ tmp = tcg_temp_new_i32();
+ gen_helper_v7m_mrs(tmp, cpu_env, tcg_constant_i32(a->sysm));
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_MSR_v7m(DisasContext *s, arg_MSR_v7m *a)
+{
+ TCGv_i32 addr, reg;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ addr = tcg_constant_i32((a->mask << 10) | a->sysm);
+ reg = load_reg(s, a->rn);
+ gen_helper_v7m_msr(cpu_env, addr, reg);
+ tcg_temp_free_i32(reg);
+ /* If we wrote to CONTROL, the EL might have changed */
+ gen_rebuild_hflags(s, true);
+ gen_lookup_tb(s);
+ return true;
+}
+
+static bool trans_BX(DisasContext *s, arg_BX *a)
+{
+ if (!ENABLE_ARCH_4T) {
+ return false;
+ }
+ gen_bx_excret(s, load_reg(s, a->rm));
+ return true;
+}
+
+static bool trans_BXJ(DisasContext *s, arg_BXJ *a)
+{
+ if (!ENABLE_ARCH_5J || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ /*
+ * v7A allows BXJ to be trapped via HSTR.TJDBX. We don't waste a
+ * TBFLAGS bit on a basically-never-happens case, so call a helper
+ * function to check for the trap and raise the exception if needed
+ * (passing it the register number for the syndrome value).
+ * v8A doesn't have this HSTR bit.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8) &&
+ arm_dc_feature(s, ARM_FEATURE_EL2) &&
+ s->current_el < 2 && s->ns) {
+ gen_helper_check_bxj_trap(cpu_env, tcg_constant_i32(a->rm));
+ }
+ /* Trivial implementation equivalent to bx. */
+ gen_bx(s, load_reg(s, a->rm));
+ return true;
+}
+
+static bool trans_BLX_r(DisasContext *s, arg_BLX_r *a)
+{
+ TCGv_i32 tmp;
+
+ if (!ENABLE_ARCH_5) {
+ return false;
+ }
+ tmp = load_reg(s, a->rm);
+ gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb);
+ gen_bx(s, tmp);
+ return true;
+}
+
+/*
+ * BXNS/BLXNS: only exist for v8M with the security extensions,
+ * and always UNDEF if NonSecure. We don't implement these in
+ * the user-only mode either (in theory you can use them from
+ * Secure User mode but they are too tied in to system emulation).
+ */
+static bool trans_BXNS(DisasContext *s, arg_BXNS *a)
+{
+ if (!s->v8m_secure || IS_USER_ONLY) {
+ unallocated_encoding(s);
+ } else {
+ gen_bxns(s, a->rm);
+ }
+ return true;
+}
+
+static bool trans_BLXNS(DisasContext *s, arg_BLXNS *a)
+{
+ if (!s->v8m_secure || IS_USER_ONLY) {
+ unallocated_encoding(s);
+ } else {
+ gen_blxns(s, a->rm);
+ }
+ return true;
+}
+
+static bool trans_CLZ(DisasContext *s, arg_CLZ *a)
+{
+ TCGv_i32 tmp;
+
+ if (!ENABLE_ARCH_5) {
+ return false;
+ }
+ tmp = load_reg(s, a->rm);
+ tcg_gen_clzi_i32(tmp, tmp, 32);
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_ERET(DisasContext *s, arg_ERET *a)
+{
+ TCGv_i32 tmp;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_V7VE)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ unallocated_encoding(s);
+ return true;
+ }
+ if (s->current_el == 2) {
+ /* ERET from Hyp uses ELR_Hyp, not LR */
+ tmp = load_cpu_field(elr_el[2]);
+ } else {
+ tmp = load_reg(s, 14);
+ }
+ gen_exception_return(s, tmp);
+ return true;
+}
+
+static bool trans_HLT(DisasContext *s, arg_HLT *a)
+{
+ gen_hlt(s, a->imm);
+ return true;
+}
+
+static bool trans_BKPT(DisasContext *s, arg_BKPT *a)
+{
+ if (!ENABLE_ARCH_5) {
+ return false;
+ }
+ /* BKPT is OK with ECI set and leaves it untouched */
+ s->eci_handled = true;
+ if (arm_dc_feature(s, ARM_FEATURE_M) &&
+ semihosting_enabled(s->current_el == 0) &&
+ (a->imm == 0xab)) {
+ gen_exception_internal_insn(s, EXCP_SEMIHOST);
+ } else {
+ gen_exception_bkpt_insn(s, syn_aa32_bkpt(a->imm, false));
+ }
+ return true;
+}
+
+static bool trans_HVC(DisasContext *s, arg_HVC *a)
+{
+ if (!ENABLE_ARCH_7 || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ unallocated_encoding(s);
+ } else {
+ gen_hvc(s, a->imm);
+ }
+ return true;
+}
+
+static bool trans_SMC(DisasContext *s, arg_SMC *a)
+{
+ if (!ENABLE_ARCH_6K || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ unallocated_encoding(s);
+ } else {
+ gen_smc(s);
+ }
+ return true;
+}
+
+static bool trans_SG(DisasContext *s, arg_SG *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_M) ||
+ !arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+ /*
+ * SG (v8M only)
+ * The bulk of the behaviour for this instruction is implemented
+ * in v7m_handle_execute_nsc(), which deals with the insn when
+ * it is executed by a CPU in non-secure state from memory
+ * which is Secure & NonSecure-Callable.
+ * Here we only need to handle the remaining cases:
+ * * in NS memory (including the "security extension not
+ * implemented" case) : NOP
+ * * in S memory but CPU already secure (clear IT bits)
+ * We know that the attribute for the memory this insn is
+ * in must match the current CPU state, because otherwise
+ * get_phys_addr_pmsav8 would have generated an exception.
+ */
+ if (s->v8m_secure) {
+ /* Like the IT insn, we don't need to generate any code */
+ s->condexec_cond = 0;
+ s->condexec_mask = 0;
+ }
+ return true;
+}
+
+static bool trans_TT(DisasContext *s, arg_TT *a)
+{
+ TCGv_i32 addr, tmp;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M) ||
+ !arm_dc_feature(s, ARM_FEATURE_V8)) {
+ return false;
+ }
+ if (a->rd == 13 || a->rd == 15 || a->rn == 15) {
+ /* We UNDEF for these UNPREDICTABLE cases */
+ unallocated_encoding(s);
+ return true;
+ }
+ if (a->A && !s->v8m_secure) {
+ /* This case is UNDEFINED. */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ addr = load_reg(s, a->rn);
+ tmp = tcg_temp_new_i32();
+ gen_helper_v7m_tt(tmp, cpu_env, addr, tcg_constant_i32((a->A << 1) | a->T));
+ tcg_temp_free_i32(addr);
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+/*
+ * Load/store register index
+ */
+
+static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w)
+{
+ ISSInfo ret;
+
+ /* ISS not valid if writeback */
+ if (p && !w) {
+ ret = rd;
+ if (curr_insn_len(s) == 2) {
+ ret |= ISSIs16Bit;
+ }
+ } else {
+ ret = ISSInvalid;
+ }
+ return ret;
+}
+
+static TCGv_i32 op_addr_rr_pre(DisasContext *s, arg_ldst_rr *a)
+{
+ TCGv_i32 addr = load_reg(s, a->rn);
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ if (a->p) {
+ TCGv_i32 ofs = load_reg(s, a->rm);
+ gen_arm_shift_im(ofs, a->shtype, a->shimm, 0);
+ if (a->u) {
+ tcg_gen_add_i32(addr, addr, ofs);
+ } else {
+ tcg_gen_sub_i32(addr, addr, ofs);
+ }
+ tcg_temp_free_i32(ofs);
+ }
+ return addr;
+}
+
+static void op_addr_rr_post(DisasContext *s, arg_ldst_rr *a,
+ TCGv_i32 addr, int address_offset)
+{
+ if (!a->p) {
+ TCGv_i32 ofs = load_reg(s, a->rm);
+ gen_arm_shift_im(ofs, a->shtype, a->shimm, 0);
+ if (a->u) {
+ tcg_gen_add_i32(addr, addr, ofs);
+ } else {
+ tcg_gen_sub_i32(addr, addr, ofs);
+ }
+ tcg_temp_free_i32(ofs);
+ } else if (!a->w) {
+ tcg_temp_free_i32(addr);
+ return;
+ }
+ tcg_gen_addi_i32(addr, addr, address_offset);
+ store_reg(s, a->rn, addr);
+}
+
+static bool op_load_rr(DisasContext *s, arg_ldst_rr *a,
+ MemOp mop, int mem_idx)
+{
+ ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w);
+ TCGv_i32 addr, tmp;
+
+ addr = op_addr_rr_pre(s, a);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop);
+ disas_set_da_iss(s, mop, issinfo);
+
+ /*
+ * Perform base writeback before the loaded value to
+ * ensure correct behavior with overlapping index registers.
+ */
+ op_addr_rr_post(s, a, addr, 0);
+ store_reg_from_load(s, a->rt, tmp);
+ return true;
+}
+
+static bool op_store_rr(DisasContext *s, arg_ldst_rr *a,
+ MemOp mop, int mem_idx)
+{
+ ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite;
+ TCGv_i32 addr, tmp;
+
+ /*
+ * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it
+ * is either UNPREDICTABLE or has defined behaviour
+ */
+ if (s->thumb && a->rn == 15) {
+ return false;
+ }
+
+ addr = op_addr_rr_pre(s, a);
+
+ tmp = load_reg(s, a->rt);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, mop);
+ disas_set_da_iss(s, mop, issinfo);
+ tcg_temp_free_i32(tmp);
+
+ op_addr_rr_post(s, a, addr, 0);
+ return true;
+}
+
+static bool trans_LDRD_rr(DisasContext *s, arg_ldst_rr *a)
+{
+ int mem_idx = get_mem_index(s);
+ TCGv_i32 addr, tmp;
+
+ if (!ENABLE_ARCH_5TE) {
+ return false;
+ }
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ addr = op_addr_rr_pre(s, a);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ store_reg(s, a->rt, tmp);
+
+ tcg_gen_addi_i32(addr, addr, 4);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ store_reg(s, a->rt + 1, tmp);
+
+ /* LDRD w/ base writeback is undefined if the registers overlap. */
+ op_addr_rr_post(s, a, addr, -4);
+ return true;
+}
+
+static bool trans_STRD_rr(DisasContext *s, arg_ldst_rr *a)
+{
+ int mem_idx = get_mem_index(s);
+ TCGv_i32 addr, tmp;
+
+ if (!ENABLE_ARCH_5TE) {
+ return false;
+ }
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ addr = op_addr_rr_pre(s, a);
+
+ tmp = load_reg(s, a->rt);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+
+ tcg_gen_addi_i32(addr, addr, 4);
+
+ tmp = load_reg(s, a->rt + 1);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+
+ op_addr_rr_post(s, a, addr, -4);
+ return true;
+}
+
+/*
+ * Load/store immediate index
+ */
+
+static TCGv_i32 op_addr_ri_pre(DisasContext *s, arg_ldst_ri *a)
+{
+ int ofs = a->imm;
+
+ if (!a->u) {
+ ofs = -ofs;
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ /*
+ * Stackcheck. Here we know 'addr' is the current SP;
+ * U is set if we're moving SP up, else down. It is
+ * UNKNOWN whether the limit check triggers when SP starts
+ * below the limit and ends up above it; we choose to trigger it.
+ */
+ if (!a->u) {
+ TCGv_i32 newsp = tcg_temp_new_i32();
+ tcg_gen_addi_i32(newsp, cpu_R[13], ofs);
+ gen_helper_v8m_stackcheck(cpu_env, newsp);
+ tcg_temp_free_i32(newsp);
+ } else {
+ gen_helper_v8m_stackcheck(cpu_env, cpu_R[13]);
+ }
+ }
+
+ return add_reg_for_lit(s, a->rn, a->p ? ofs : 0);
+}
+
+static void op_addr_ri_post(DisasContext *s, arg_ldst_ri *a,
+ TCGv_i32 addr, int address_offset)
+{
+ if (!a->p) {
+ if (a->u) {
+ address_offset += a->imm;
+ } else {
+ address_offset -= a->imm;
+ }
+ } else if (!a->w) {
+ tcg_temp_free_i32(addr);
+ return;
+ }
+ tcg_gen_addi_i32(addr, addr, address_offset);
+ store_reg(s, a->rn, addr);
+}
+
+static bool op_load_ri(DisasContext *s, arg_ldst_ri *a,
+ MemOp mop, int mem_idx)
+{
+ ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w);
+ TCGv_i32 addr, tmp;
+
+ addr = op_addr_ri_pre(s, a);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop);
+ disas_set_da_iss(s, mop, issinfo);
+
+ /*
+ * Perform base writeback before the loaded value to
+ * ensure correct behavior with overlapping index registers.
+ */
+ op_addr_ri_post(s, a, addr, 0);
+ store_reg_from_load(s, a->rt, tmp);
+ return true;
+}
+
+static bool op_store_ri(DisasContext *s, arg_ldst_ri *a,
+ MemOp mop, int mem_idx)
+{
+ ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite;
+ TCGv_i32 addr, tmp;
+
+ /*
+ * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it
+ * is either UNPREDICTABLE or has defined behaviour
+ */
+ if (s->thumb && a->rn == 15) {
+ return false;
+ }
+
+ addr = op_addr_ri_pre(s, a);
+
+ tmp = load_reg(s, a->rt);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, mop);
+ disas_set_da_iss(s, mop, issinfo);
+ tcg_temp_free_i32(tmp);
+
+ op_addr_ri_post(s, a, addr, 0);
+ return true;
+}
+
+static bool op_ldrd_ri(DisasContext *s, arg_ldst_ri *a, int rt2)
+{
+ int mem_idx = get_mem_index(s);
+ TCGv_i32 addr, tmp;
+
+ addr = op_addr_ri_pre(s, a);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ store_reg(s, a->rt, tmp);
+
+ tcg_gen_addi_i32(addr, addr, 4);
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ store_reg(s, rt2, tmp);
+
+ /* LDRD w/ base writeback is undefined if the registers overlap. */
+ op_addr_ri_post(s, a, addr, -4);
+ return true;
+}
+
+static bool trans_LDRD_ri_a32(DisasContext *s, arg_ldst_ri *a)
+{
+ if (!ENABLE_ARCH_5TE || (a->rt & 1)) {
+ return false;
+ }
+ return op_ldrd_ri(s, a, a->rt + 1);
+}
+
+static bool trans_LDRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a)
+{
+ arg_ldst_ri b = {
+ .u = a->u, .w = a->w, .p = a->p,
+ .rn = a->rn, .rt = a->rt, .imm = a->imm
+ };
+ return op_ldrd_ri(s, &b, a->rt2);
+}
+
+static bool op_strd_ri(DisasContext *s, arg_ldst_ri *a, int rt2)
+{
+ int mem_idx = get_mem_index(s);
+ TCGv_i32 addr, tmp;
+
+ addr = op_addr_ri_pre(s, a);
+
+ tmp = load_reg(s, a->rt);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+
+ tcg_gen_addi_i32(addr, addr, 4);
+
+ tmp = load_reg(s, rt2);
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+
+ op_addr_ri_post(s, a, addr, -4);
+ return true;
+}
+
+static bool trans_STRD_ri_a32(DisasContext *s, arg_ldst_ri *a)
+{
+ if (!ENABLE_ARCH_5TE || (a->rt & 1)) {
+ return false;
+ }
+ return op_strd_ri(s, a, a->rt + 1);
+}
+
+static bool trans_STRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a)
+{
+ arg_ldst_ri b = {
+ .u = a->u, .w = a->w, .p = a->p,
+ .rn = a->rn, .rt = a->rt, .imm = a->imm
+ };
+ return op_strd_ri(s, &b, a->rt2);
+}
+
+#define DO_LDST(NAME, WHICH, MEMOP) \
+static bool trans_##NAME##_ri(DisasContext *s, arg_ldst_ri *a) \
+{ \
+ return op_##WHICH##_ri(s, a, MEMOP, get_mem_index(s)); \
+} \
+static bool trans_##NAME##T_ri(DisasContext *s, arg_ldst_ri *a) \
+{ \
+ return op_##WHICH##_ri(s, a, MEMOP, get_a32_user_mem_index(s)); \
+} \
+static bool trans_##NAME##_rr(DisasContext *s, arg_ldst_rr *a) \
+{ \
+ return op_##WHICH##_rr(s, a, MEMOP, get_mem_index(s)); \
+} \
+static bool trans_##NAME##T_rr(DisasContext *s, arg_ldst_rr *a) \
+{ \
+ return op_##WHICH##_rr(s, a, MEMOP, get_a32_user_mem_index(s)); \
+}
+
+DO_LDST(LDR, load, MO_UL)
+DO_LDST(LDRB, load, MO_UB)
+DO_LDST(LDRH, load, MO_UW)
+DO_LDST(LDRSB, load, MO_SB)
+DO_LDST(LDRSH, load, MO_SW)
+
+DO_LDST(STR, store, MO_UL)
+DO_LDST(STRB, store, MO_UB)
+DO_LDST(STRH, store, MO_UW)
+
+#undef DO_LDST
+
+/*
+ * Synchronization primitives
+ */
+
+static bool op_swp(DisasContext *s, arg_SWP *a, MemOp opc)
+{
+ TCGv_i32 addr, tmp;
+ TCGv taddr;
+
+ opc |= s->be_data;
+ addr = load_reg(s, a->rn);
+ taddr = gen_aa32_addr(s, addr, opc);
+ tcg_temp_free_i32(addr);
+
+ tmp = load_reg(s, a->rt2);
+ tcg_gen_atomic_xchg_i32(tmp, taddr, tmp, get_mem_index(s), opc);
+ tcg_temp_free(taddr);
+
+ store_reg(s, a->rt, tmp);
+ return true;
+}
+
+static bool trans_SWP(DisasContext *s, arg_SWP *a)
+{
+ return op_swp(s, a, MO_UL | MO_ALIGN);
+}
+
+static bool trans_SWPB(DisasContext *s, arg_SWP *a)
+{
+ return op_swp(s, a, MO_UB);
+}
+
+/*
+ * Load/Store Exclusive and Load-Acquire/Store-Release
+ */
+
+static bool op_strex(DisasContext *s, arg_STREX *a, MemOp mop, bool rel)
+{
+ TCGv_i32 addr;
+ /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */
+ bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M);
+
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rd == 15 || a->rn == 15 || a->rt == 15
+ || a->rd == a->rn || a->rd == a->rt
+ || (!v8a && s->thumb && (a->rd == 13 || a->rt == 13))
+ || (mop == MO_64
+ && (a->rt2 == 15
+ || a->rd == a->rt2
+ || (!v8a && s->thumb && a->rt2 == 13)))) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ if (rel) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ }
+
+ addr = tcg_temp_local_new_i32();
+ load_reg_var(s, addr, a->rn);
+ tcg_gen_addi_i32(addr, addr, a->imm);
+
+ gen_store_exclusive(s, a->rd, a->rt, a->rt2, addr, mop);
+ tcg_temp_free_i32(addr);
+ return true;
+}
+
+static bool trans_STREX(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ return op_strex(s, a, MO_32, false);
+}
+
+static bool trans_STREXD_a32(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_6K) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ a->rt2 = a->rt + 1;
+ return op_strex(s, a, MO_64, false);
+}
+
+static bool trans_STREXD_t32(DisasContext *s, arg_STREX *a)
+{
+ return op_strex(s, a, MO_64, false);
+}
+
+static bool trans_STREXB(DisasContext *s, arg_STREX *a)
+{
+ if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) {
+ return false;
+ }
+ return op_strex(s, a, MO_8, false);
+}
+
+static bool trans_STREXH(DisasContext *s, arg_STREX *a)
+{
+ if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) {
+ return false;
+ }
+ return op_strex(s, a, MO_16, false);
+}
+
+static bool trans_STLEX(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_strex(s, a, MO_32, true);
+}
+
+static bool trans_STLEXD_a32(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ a->rt2 = a->rt + 1;
+ return op_strex(s, a, MO_64, true);
+}
+
+static bool trans_STLEXD_t32(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_strex(s, a, MO_64, true);
+}
+
+static bool trans_STLEXB(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_strex(s, a, MO_8, true);
+}
+
+static bool trans_STLEXH(DisasContext *s, arg_STREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_strex(s, a, MO_16, true);
+}
+
+static bool op_stl(DisasContext *s, arg_STL *a, MemOp mop)
+{
+ TCGv_i32 addr, tmp;
+
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rn == 15 || a->rt == 15) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ addr = load_reg(s, a->rn);
+ tmp = load_reg(s, a->rt);
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN);
+ disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel | ISSIsWrite);
+
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(addr);
+ return true;
+}
+
+static bool trans_STL(DisasContext *s, arg_STL *a)
+{
+ return op_stl(s, a, MO_UL);
+}
+
+static bool trans_STLB(DisasContext *s, arg_STL *a)
+{
+ return op_stl(s, a, MO_UB);
+}
+
+static bool trans_STLH(DisasContext *s, arg_STL *a)
+{
+ return op_stl(s, a, MO_UW);
+}
+
+static bool op_ldrex(DisasContext *s, arg_LDREX *a, MemOp mop, bool acq)
+{
+ TCGv_i32 addr;
+ /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */
+ bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M);
+
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rn == 15 || a->rt == 15
+ || (!v8a && s->thumb && a->rt == 13)
+ || (mop == MO_64
+ && (a->rt2 == 15 || a->rt == a->rt2
+ || (!v8a && s->thumb && a->rt2 == 13)))) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ addr = tcg_temp_local_new_i32();
+ load_reg_var(s, addr, a->rn);
+ tcg_gen_addi_i32(addr, addr, a->imm);
+
+ gen_load_exclusive(s, a->rt, a->rt2, addr, mop);
+ tcg_temp_free_i32(addr);
+
+ if (acq) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+ }
+ return true;
+}
+
+static bool trans_LDREX(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_32, false);
+}
+
+static bool trans_LDREXD_a32(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_6K) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ a->rt2 = a->rt + 1;
+ return op_ldrex(s, a, MO_64, false);
+}
+
+static bool trans_LDREXD_t32(DisasContext *s, arg_LDREX *a)
+{
+ return op_ldrex(s, a, MO_64, false);
+}
+
+static bool trans_LDREXB(DisasContext *s, arg_LDREX *a)
+{
+ if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_8, false);
+}
+
+static bool trans_LDREXH(DisasContext *s, arg_LDREX *a)
+{
+ if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_16, false);
+}
+
+static bool trans_LDAEX(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_32, true);
+}
+
+static bool trans_LDAEXD_a32(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rt & 1) {
+ unallocated_encoding(s);
+ return true;
+ }
+ a->rt2 = a->rt + 1;
+ return op_ldrex(s, a, MO_64, true);
+}
+
+static bool trans_LDAEXD_t32(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_64, true);
+}
+
+static bool trans_LDAEXB(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_8, true);
+}
+
+static bool trans_LDAEXH(DisasContext *s, arg_LDREX *a)
+{
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ return op_ldrex(s, a, MO_16, true);
+}
+
+static bool op_lda(DisasContext *s, arg_LDA *a, MemOp mop)
+{
+ TCGv_i32 addr, tmp;
+
+ if (!ENABLE_ARCH_8) {
+ return false;
+ }
+ /* We UNDEF for these UNPREDICTABLE cases. */
+ if (a->rn == 15 || a->rt == 15) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ addr = load_reg(s, a->rn);
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN);
+ disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel);
+ tcg_temp_free_i32(addr);
+
+ store_reg(s, a->rt, tmp);
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+ return true;
+}
+
+static bool trans_LDA(DisasContext *s, arg_LDA *a)
+{
+ return op_lda(s, a, MO_UL);
+}
+
+static bool trans_LDAB(DisasContext *s, arg_LDA *a)
+{
+ return op_lda(s, a, MO_UB);
+}
+
+static bool trans_LDAH(DisasContext *s, arg_LDA *a)
+{
+ return op_lda(s, a, MO_UW);
+}
+
+/*
+ * Media instructions
+ */
+
+static bool trans_USADA8(DisasContext *s, arg_USADA8 *a)
+{
+ TCGv_i32 t1, t2;
+
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ gen_helper_usad8(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+ if (a->ra != 15) {
+ t2 = load_reg(s, a->ra);
+ tcg_gen_add_i32(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+ }
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+static bool op_bfx(DisasContext *s, arg_UBFX *a, bool u)
+{
+ TCGv_i32 tmp;
+ int width = a->widthm1 + 1;
+ int shift = a->lsb;
+
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+ if (shift + width > 32) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ tmp = load_reg(s, a->rn);
+ if (u) {
+ tcg_gen_extract_i32(tmp, tmp, shift, width);
+ } else {
+ tcg_gen_sextract_i32(tmp, tmp, shift, width);
+ }
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_SBFX(DisasContext *s, arg_SBFX *a)
+{
+ return op_bfx(s, a, false);
+}
+
+static bool trans_UBFX(DisasContext *s, arg_UBFX *a)
+{
+ return op_bfx(s, a, true);
+}
+
+static bool trans_BFCI(DisasContext *s, arg_BFCI *a)
+{
+ TCGv_i32 tmp;
+ int msb = a->msb, lsb = a->lsb;
+ int width;
+
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+ if (msb < lsb) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ width = msb + 1 - lsb;
+ if (a->rn == 15) {
+ /* BFC */
+ tmp = tcg_const_i32(0);
+ } else {
+ /* BFI */
+ tmp = load_reg(s, a->rn);
+ }
+ if (width != 32) {
+ TCGv_i32 tmp2 = load_reg(s, a->rd);
+ tcg_gen_deposit_i32(tmp, tmp2, tmp, lsb, width);
+ tcg_temp_free_i32(tmp2);
+ }
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_UDF(DisasContext *s, arg_UDF *a)
+{
+ unallocated_encoding(s);
+ return true;
+}
+
+/*
+ * Parallel addition and subtraction
+ */
+
+static bool op_par_addsub(DisasContext *s, arg_rrr *a,
+ void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 t0, t1;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rn);
+ t1 = load_reg(s, a->rm);
+
+ gen(t0, t0, t1);
+
+ tcg_temp_free_i32(t1);
+ store_reg(s, a->rd, t0);
+ return true;
+}
+
+static bool op_par_addsub_ge(DisasContext *s, arg_rrr *a,
+ void (*gen)(TCGv_i32, TCGv_i32,
+ TCGv_i32, TCGv_ptr))
+{
+ TCGv_i32 t0, t1;
+ TCGv_ptr ge;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t0 = load_reg(s, a->rn);
+ t1 = load_reg(s, a->rm);
+
+ ge = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ge, cpu_env, offsetof(CPUARMState, GE));
+ gen(t0, t0, t1, ge);
+
+ tcg_temp_free_ptr(ge);
+ tcg_temp_free_i32(t1);
+ store_reg(s, a->rd, t0);
+ return true;
+}
+
+#define DO_PAR_ADDSUB(NAME, helper) \
+static bool trans_##NAME(DisasContext *s, arg_rrr *a) \
+{ \
+ return op_par_addsub(s, a, helper); \
+}
+
+#define DO_PAR_ADDSUB_GE(NAME, helper) \
+static bool trans_##NAME(DisasContext *s, arg_rrr *a) \
+{ \
+ return op_par_addsub_ge(s, a, helper); \
+}
+
+DO_PAR_ADDSUB_GE(SADD16, gen_helper_sadd16)
+DO_PAR_ADDSUB_GE(SASX, gen_helper_saddsubx)
+DO_PAR_ADDSUB_GE(SSAX, gen_helper_ssubaddx)
+DO_PAR_ADDSUB_GE(SSUB16, gen_helper_ssub16)
+DO_PAR_ADDSUB_GE(SADD8, gen_helper_sadd8)
+DO_PAR_ADDSUB_GE(SSUB8, gen_helper_ssub8)
+
+DO_PAR_ADDSUB_GE(UADD16, gen_helper_uadd16)
+DO_PAR_ADDSUB_GE(UASX, gen_helper_uaddsubx)
+DO_PAR_ADDSUB_GE(USAX, gen_helper_usubaddx)
+DO_PAR_ADDSUB_GE(USUB16, gen_helper_usub16)
+DO_PAR_ADDSUB_GE(UADD8, gen_helper_uadd8)
+DO_PAR_ADDSUB_GE(USUB8, gen_helper_usub8)
+
+DO_PAR_ADDSUB(QADD16, gen_helper_qadd16)
+DO_PAR_ADDSUB(QASX, gen_helper_qaddsubx)
+DO_PAR_ADDSUB(QSAX, gen_helper_qsubaddx)
+DO_PAR_ADDSUB(QSUB16, gen_helper_qsub16)
+DO_PAR_ADDSUB(QADD8, gen_helper_qadd8)
+DO_PAR_ADDSUB(QSUB8, gen_helper_qsub8)
+
+DO_PAR_ADDSUB(UQADD16, gen_helper_uqadd16)
+DO_PAR_ADDSUB(UQASX, gen_helper_uqaddsubx)
+DO_PAR_ADDSUB(UQSAX, gen_helper_uqsubaddx)
+DO_PAR_ADDSUB(UQSUB16, gen_helper_uqsub16)
+DO_PAR_ADDSUB(UQADD8, gen_helper_uqadd8)
+DO_PAR_ADDSUB(UQSUB8, gen_helper_uqsub8)
+
+DO_PAR_ADDSUB(SHADD16, gen_helper_shadd16)
+DO_PAR_ADDSUB(SHASX, gen_helper_shaddsubx)
+DO_PAR_ADDSUB(SHSAX, gen_helper_shsubaddx)
+DO_PAR_ADDSUB(SHSUB16, gen_helper_shsub16)
+DO_PAR_ADDSUB(SHADD8, gen_helper_shadd8)
+DO_PAR_ADDSUB(SHSUB8, gen_helper_shsub8)
+
+DO_PAR_ADDSUB(UHADD16, gen_helper_uhadd16)
+DO_PAR_ADDSUB(UHASX, gen_helper_uhaddsubx)
+DO_PAR_ADDSUB(UHSAX, gen_helper_uhsubaddx)
+DO_PAR_ADDSUB(UHSUB16, gen_helper_uhsub16)
+DO_PAR_ADDSUB(UHADD8, gen_helper_uhadd8)
+DO_PAR_ADDSUB(UHSUB8, gen_helper_uhsub8)
+
+#undef DO_PAR_ADDSUB
+#undef DO_PAR_ADDSUB_GE
+
+/*
+ * Packing, unpacking, saturation, and reversal
+ */
+
+static bool trans_PKH(DisasContext *s, arg_PKH *a)
+{
+ TCGv_i32 tn, tm;
+ int shift = a->imm;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ tn = load_reg(s, a->rn);
+ tm = load_reg(s, a->rm);
+ if (a->tb) {
+ /* PKHTB */
+ if (shift == 0) {
+ shift = 31;
+ }
+ tcg_gen_sari_i32(tm, tm, shift);
+ tcg_gen_deposit_i32(tn, tn, tm, 0, 16);
+ } else {
+ /* PKHBT */
+ tcg_gen_shli_i32(tm, tm, shift);
+ tcg_gen_deposit_i32(tn, tm, tn, 0, 16);
+ }
+ tcg_temp_free_i32(tm);
+ store_reg(s, a->rd, tn);
+ return true;
+}
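+
+/*
+ * In deposit terms: PKHBT keeps Rn[15:0] and takes bits [31:16] from
+ * Rm << shift, while PKHTB keeps Rn[31:16] and takes bits [15:0] from
+ * Rm ASR shift. A PKHTB shift encoding of 0 means ASR #32, implemented
+ * here as ASR #31, which is equivalent for the sign-replicated low
+ * half actually used.
+ */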
+
+static bool op_sat(DisasContext *s, arg_sat *a,
+ void (*gen)(TCGv_i32, TCGv_env, TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 tmp;
+ int shift = a->imm;
+
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+
+ tmp = load_reg(s, a->rn);
+ if (a->sh) {
+ tcg_gen_sari_i32(tmp, tmp, shift ? shift : 31);
+ } else {
+ tcg_gen_shli_i32(tmp, tmp, shift);
+ }
+
+ gen(tmp, cpu_env, tmp, tcg_constant_i32(a->satimm));
+
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_SSAT(DisasContext *s, arg_sat *a)
+{
+ return op_sat(s, a, gen_helper_ssat);
+}
+
+static bool trans_USAT(DisasContext *s, arg_sat *a)
+{
+ return op_sat(s, a, gen_helper_usat);
+}
+
+static bool trans_SSAT16(DisasContext *s, arg_sat *a)
+{
+ if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) {
+ return false;
+ }
+ return op_sat(s, a, gen_helper_ssat16);
+}
+
+static bool trans_USAT16(DisasContext *s, arg_sat *a)
+{
+ if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) {
+ return false;
+ }
+ return op_sat(s, a, gen_helper_usat16);
+}
+
+static bool op_xta(DisasContext *s, arg_rrr_rot *a,
+ void (*gen_extract)(TCGv_i32, TCGv_i32),
+ void (*gen_add)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 tmp;
+
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+
+ tmp = load_reg(s, a->rm);
+ /*
+ * TODO: In many cases we could do a shift instead of a rotate.
+ * Combined with a simple extend, that becomes an extract.
+ */
+ tcg_gen_rotri_i32(tmp, tmp, a->rot * 8);
+ gen_extract(tmp, tmp);
+
+ if (a->rn != 15) {
+ TCGv_i32 tmp2 = load_reg(s, a->rn);
+ gen_add(tmp, tmp, tmp2);
+ tcg_temp_free_i32(tmp2);
+ }
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_SXTAB(DisasContext *s, arg_rrr_rot *a)
+{
+ return op_xta(s, a, tcg_gen_ext8s_i32, tcg_gen_add_i32);
+}
+
+static bool trans_SXTAH(DisasContext *s, arg_rrr_rot *a)
+{
+ return op_xta(s, a, tcg_gen_ext16s_i32, tcg_gen_add_i32);
+}
+
+static bool trans_SXTAB16(DisasContext *s, arg_rrr_rot *a)
+{
+ if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) {
+ return false;
+ }
+ return op_xta(s, a, gen_helper_sxtb16, gen_add16);
+}
+
+static bool trans_UXTAB(DisasContext *s, arg_rrr_rot *a)
+{
+ return op_xta(s, a, tcg_gen_ext8u_i32, tcg_gen_add_i32);
+}
+
+static bool trans_UXTAH(DisasContext *s, arg_rrr_rot *a)
+{
+ return op_xta(s, a, tcg_gen_ext16u_i32, tcg_gen_add_i32);
+}
+
+static bool trans_UXTAB16(DisasContext *s, arg_rrr_rot *a)
+{
+ if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) {
+ return false;
+ }
+ return op_xta(s, a, gen_helper_uxtb16, gen_add16);
+}
+
+static bool trans_SEL(DisasContext *s, arg_rrr *a)
+{
+ TCGv_i32 t1, t2, t3;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ t3 = tcg_temp_new_i32();
+ tcg_gen_ld_i32(t3, cpu_env, offsetof(CPUARMState, GE));
+ gen_helper_sel_flags(t1, t3, t1, t2);
+ tcg_temp_free_i32(t3);
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
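For reference, the byte-wise select that gen_helper_sel_flags performs can be modelled in a few lines: each result byte comes from Rn if the corresponding APSR.GE bit is set, otherwise from Rm. This is an illustrative sketch of the architectural behaviour, not the QEMU helper itself; the function name is invented.

    #include <assert.h>
    #include <stdint.h>

    /* ge is the 4-bit APSR.GE field; bit i selects byte i (bits 8*i+7 .. 8*i) */
    static uint32_t sel(uint32_t ge, uint32_t rn, uint32_t rm)
    {
        uint32_t res = 0;
        for (int i = 0; i < 4; i++) {
            uint32_t src = (ge & (1u << i)) ? rn : rm;
            res |= src & (0xffu << (8 * i));
        }
        return res;
    }

    int main(void)
    {
        /* GE = 0b0101: bytes 0 and 2 from rn, bytes 1 and 3 from rm */
        assert(sel(0x5, 0x11223344, 0xaabbccdd) == 0xaa22cc44);
        return 0;
    }
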
+static bool op_rr(DisasContext *s, arg_rr *a,
+ void (*gen)(TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 tmp;
+
+ tmp = load_reg(s, a->rm);
+ gen(tmp, tmp);
+ store_reg(s, a->rd, tmp);
+ return true;
+}
+
+static bool trans_REV(DisasContext *s, arg_rr *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ return op_rr(s, a, tcg_gen_bswap32_i32);
+}
+
+static bool trans_REV16(DisasContext *s, arg_rr *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ return op_rr(s, a, gen_rev16);
+}
+
+static bool trans_REVSH(DisasContext *s, arg_rr *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ return op_rr(s, a, gen_revsh);
+}
+
+static bool trans_RBIT(DisasContext *s, arg_rr *a)
+{
+ if (!ENABLE_ARCH_6T2) {
+ return false;
+ }
+ return op_rr(s, a, gen_helper_rbit);
+}
+
+/*
+ * Signed multiply, signed and unsigned divide
+ */
+
+static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub)
+{
+ TCGv_i32 t1, t2;
+
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ if (m_swap) {
+ gen_swap_half(t2, t2);
+ }
+ gen_smul_dual(t1, t2);
+
+ if (sub) {
+ /*
+ * This subtraction cannot overflow, so we can do a simple
+ * 32-bit subtraction and then a possible 32-bit saturating
+ * addition of Ra.
+ */
+ tcg_gen_sub_i32(t1, t1, t2);
+ tcg_temp_free_i32(t2);
+
+ if (a->ra != 15) {
+ t2 = load_reg(s, a->ra);
+ gen_helper_add_setq(t1, cpu_env, t1, t2);
+ tcg_temp_free_i32(t2);
+ }
+ } else if (a->ra == 15) {
+ /* Single saturation-checking addition */
+ gen_helper_add_setq(t1, cpu_env, t1, t2);
+ tcg_temp_free_i32(t2);
+ } else {
+ /*
+ * We need to add the products and Ra together and then
+ * determine whether the final result overflowed. Doing
+ * this as two separate add-and-check-overflow steps incorrectly
+ * sets Q for cases like (-32768 * -32768) + (-32768 * -32768) + -1.
+ * Do all the arithmetic at 64-bits and then check for overflow.
+ */
+ TCGv_i64 p64, q64;
+ TCGv_i32 t3, qf, one;
+
+ p64 = tcg_temp_new_i64();
+ q64 = tcg_temp_new_i64();
+ tcg_gen_ext_i32_i64(p64, t1);
+ tcg_gen_ext_i32_i64(q64, t2);
+ tcg_gen_add_i64(p64, p64, q64);
+ load_reg_var(s, t2, a->ra);
+ tcg_gen_ext_i32_i64(q64, t2);
+ tcg_gen_add_i64(p64, p64, q64);
+ tcg_temp_free_i64(q64);
+
+ tcg_gen_extr_i64_i32(t1, t2, p64);
+ tcg_temp_free_i64(p64);
+ /*
+ * t1 is the low half of the result which goes into Rd.
+ * We have overflow and must set Q if the high half (t2)
+ * is different from the sign-extension of t1.
+ */
+ t3 = tcg_temp_new_i32();
+ tcg_gen_sari_i32(t3, t1, 31);
+ qf = load_cpu_field(QF);
+ one = tcg_constant_i32(1);
+ tcg_gen_movcond_i32(TCG_COND_NE, qf, t2, t3, one, qf);
+ store_cpu_field(qf, QF);
+ tcg_temp_free_i32(t3);
+ tcg_temp_free_i32(t2);
+ }
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
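The overflow comment inside op_smlad above can be checked with a standalone model that, like the generated code, does the whole accumulation at 64 bits and derives Q from the final sum only (sketch; function name invented):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static int32_t smlad(int16_t rn_lo, int16_t rn_hi,
                         int16_t rm_lo, int16_t rm_hi,
                         int32_t ra, bool *q)
    {
        int64_t sum = (int64_t)rn_lo * rm_lo + (int64_t)rn_hi * rm_hi + ra;
        if (sum != (int32_t)sum) {
            *q = true;      /* true sum does not fit in 32 bits */
        }
        return (int32_t)sum;
    }

    int main(void)
    {
        bool q = false;
        /* (-32768 * -32768) + (-32768 * -32768) + (-1) = 0x7fffffff:
         * the intermediate product sum overflows 32 bits but the final
         * sum does not, so Q must stay clear.
         */
        int32_t r = smlad(-32768, -32768, -32768, -32768, -1, &q);
        assert(r == 0x7fffffff && !q);
        return 0;
    }
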
+static bool trans_SMLAD(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlad(s, a, false, false);
+}
+
+static bool trans_SMLADX(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlad(s, a, true, false);
+}
+
+static bool trans_SMLSD(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlad(s, a, false, true);
+}
+
+static bool trans_SMLSDX(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlad(s, a, true, true);
+}
+
+static bool op_smlald(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub)
+{
+ TCGv_i32 t1, t2;
+ TCGv_i64 l1, l2;
+
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ if (m_swap) {
+ gen_swap_half(t2, t2);
+ }
+ gen_smul_dual(t1, t2);
+
+ l1 = tcg_temp_new_i64();
+ l2 = tcg_temp_new_i64();
+ tcg_gen_ext_i32_i64(l1, t1);
+ tcg_gen_ext_i32_i64(l2, t2);
+ tcg_temp_free_i32(t1);
+ tcg_temp_free_i32(t2);
+
+ if (sub) {
+ tcg_gen_sub_i64(l1, l1, l2);
+ } else {
+ tcg_gen_add_i64(l1, l1, l2);
+ }
+ tcg_temp_free_i64(l2);
+
+ gen_addq(s, l1, a->ra, a->rd);
+ gen_storeq_reg(s, a->ra, a->rd, l1);
+ tcg_temp_free_i64(l1);
+ return true;
+}
+
+static bool trans_SMLALD(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlald(s, a, false, false);
+}
+
+static bool trans_SMLALDX(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlald(s, a, true, false);
+}
+
+static bool trans_SMLSLD(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlald(s, a, false, true);
+}
+
+static bool trans_SMLSLDX(DisasContext *s, arg_rrrr *a)
+{
+ return op_smlald(s, a, true, true);
+}
+
+static bool op_smmla(DisasContext *s, arg_rrrr *a, bool round, bool sub)
+{
+ TCGv_i32 t1, t2;
+
+ if (s->thumb
+ ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)
+ : !ENABLE_ARCH_6) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ tcg_gen_muls2_i32(t2, t1, t1, t2);
+
+ if (a->ra != 15) {
+ TCGv_i32 t3 = load_reg(s, a->ra);
+ if (sub) {
+ /*
+             * For SMMLS, we need a 64-bit subtract, so that a borrow from
+             * a non-zero multiplicand lowpart propagates into the high word
+             * and so that we have the correct result lowpart for rounding.
+ */
+ tcg_gen_sub2_i32(t2, t1, tcg_constant_i32(0), t3, t2, t1);
+ } else {
+ tcg_gen_add_i32(t1, t1, t3);
+ }
+ tcg_temp_free_i32(t3);
+ }
+ if (round) {
+ /*
+ * Adding 0x80000000 to the 64-bit quantity means that we have
+         * a carry into the high word when the low word has its msb set.
+ */
+ tcg_gen_shri_i32(t2, t2, 31);
+ tcg_gen_add_i32(t1, t1, t2);
+ }
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
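The rounding step above relies on the identity high32(acc + 0x80000000) == high32(acc) + bit31(acc), which is what the shri/add pair implements. A standalone check (function names invented; it assumes acc stays far enough below INT64_MAX for the addition not to overflow, which holds for a 32x32-bit product plus a 32-bit addend):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t round_via_add(int64_t acc)
    {
        return (uint32_t)((uint64_t)(acc + 0x80000000ll) >> 32);
    }

    static uint32_t round_via_carry(int64_t acc)
    {
        uint32_t hi = (uint32_t)((uint64_t)acc >> 32);
        uint32_t lo = (uint32_t)acc;
        return hi + (lo >> 31);   /* add 1 exactly when bit 31 of acc is set */
    }

    int main(void)
    {
        int64_t samples[] = { 0, 0x7fffffffll, 0x80000000ll, -1,
                              (int64_t)123456789 * -987654321 };
        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            assert(round_via_add(samples[i]) == round_via_carry(samples[i]));
        }
        return 0;
    }
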
+static bool trans_SMMLA(DisasContext *s, arg_rrrr *a)
+{
+ return op_smmla(s, a, false, false);
+}
+
+static bool trans_SMMLAR(DisasContext *s, arg_rrrr *a)
+{
+ return op_smmla(s, a, true, false);
+}
+
+static bool trans_SMMLS(DisasContext *s, arg_rrrr *a)
+{
+ return op_smmla(s, a, false, true);
+}
+
+static bool trans_SMMLSR(DisasContext *s, arg_rrrr *a)
+{
+ return op_smmla(s, a, true, true);
+}
+
+static bool op_div(DisasContext *s, arg_rrr *a, bool u)
+{
+ TCGv_i32 t1, t2;
+
+ if (s->thumb
+ ? !dc_isar_feature(aa32_thumb_div, s)
+ : !dc_isar_feature(aa32_arm_div, s)) {
+ return false;
+ }
+
+ t1 = load_reg(s, a->rn);
+ t2 = load_reg(s, a->rm);
+ if (u) {
+ gen_helper_udiv(t1, cpu_env, t1, t2);
+ } else {
+ gen_helper_sdiv(t1, cpu_env, t1, t2);
+ }
+ tcg_temp_free_i32(t2);
+ store_reg(s, a->rd, t1);
+ return true;
+}
+
+static bool trans_SDIV(DisasContext *s, arg_rrr *a)
+{
+ return op_div(s, a, false);
+}
+
+static bool trans_UDIV(DisasContext *s, arg_rrr *a)
+{
+ return op_div(s, a, true);
+}
+
+/*
+ * Block data transfer
+ */
+
+static TCGv_i32 op_addr_block_pre(DisasContext *s, arg_ldst_block *a, int n)
+{
+ TCGv_i32 addr = load_reg(s, a->rn);
+
+ if (a->b) {
+ if (a->i) {
+ /* pre increment */
+ tcg_gen_addi_i32(addr, addr, 4);
+ } else {
+ /* pre decrement */
+ tcg_gen_addi_i32(addr, addr, -(n * 4));
+ }
+ } else if (!a->i && n != 1) {
+ /* post decrement */
+ tcg_gen_addi_i32(addr, addr, -((n - 1) * 4));
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ /*
+ * If the writeback is incrementing SP rather than
+ * decrementing it, and the initial SP is below the
+ * stack limit but the final written-back SP would
+ * be above, then we must not perform any memory
+ * accesses, but it is IMPDEF whether we generate
+ * an exception. We choose to do so in this case.
+ * At this point 'addr' is the lowest address, so
+ * either the original SP (if incrementing) or our
+ * final SP (if decrementing), so that's what we check.
+ */
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ return addr;
+}
+
+static void op_addr_block_post(DisasContext *s, arg_ldst_block *a,
+ TCGv_i32 addr, int n)
+{
+ if (a->w) {
+ /* write back */
+ if (!a->b) {
+ if (a->i) {
+ /* post increment */
+ tcg_gen_addi_i32(addr, addr, 4);
+ } else {
+ /* post decrement */
+ tcg_gen_addi_i32(addr, addr, -(n * 4));
+ }
+ } else if (!a->i && n != 1) {
+ /* pre decrement */
+ tcg_gen_addi_i32(addr, addr, -((n - 1) * 4));
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+}
+
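The address arithmetic in op_addr_block_pre/op_addr_block_post reduces, for each of the four block addressing modes, to a lowest accessed address and a written-back base. A standalone model that matches the code above (function names invented; before/inc correspond to the a->b and a->i fields):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static uint32_t block_lowest(uint32_t rn, int n, bool before, bool inc)
    {
        if (inc) {
            return before ? rn + 4 : rn;                      /* IB : IA */
        } else {
            return before ? rn - 4 * n : rn - 4 * (n - 1);    /* DB : DA */
        }
    }

    static uint32_t block_writeback(uint32_t rn, int n, bool inc)
    {
        return inc ? rn + 4 * n : rn - 4 * n;
    }

    int main(void)
    {
        /* STMDB sp!, {r4-r7, lr}: 5 registers, first store at sp - 20 */
        assert(block_lowest(0x20001000, 5, true, false) == 0x20000fec);
        assert(block_writeback(0x20001000, 5, false) == 0x20000fec);
        return 0;
    }
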
+static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n)
+{
+ int i, j, n, list, mem_idx;
+ bool user = a->u;
+ TCGv_i32 addr, tmp;
+
+ if (user) {
+ /* STM (user) */
+ if (IS_USER(s)) {
+ /* Only usable in supervisor mode. */
+ unallocated_encoding(s);
+ return true;
+ }
+ }
+
+ list = a->list;
+ n = ctpop16(list);
+ if (n < min_n || a->rn == 15) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ s->eci_handled = true;
+
+ addr = op_addr_block_pre(s, a, n);
+ mem_idx = get_mem_index(s);
+
+ for (i = j = 0; i < 16; i++) {
+ if (!(list & (1 << i))) {
+ continue;
+ }
+
+ if (user && i != 15) {
+ tmp = tcg_temp_new_i32();
+ gen_helper_get_user_reg(tmp, cpu_env, tcg_constant_i32(i));
+ } else {
+ tmp = load_reg(s, i);
+ }
+ gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ tcg_temp_free_i32(tmp);
+
+ /* No need to add after the last transfer. */
+ if (++j != n) {
+ tcg_gen_addi_i32(addr, addr, 4);
+ }
+ }
+
+ op_addr_block_post(s, a, addr, n);
+ clear_eci_state(s);
+ return true;
+}
+
+static bool trans_STM(DisasContext *s, arg_ldst_block *a)
+{
+ /* BitCount(list) < 1 is UNPREDICTABLE */
+ return op_stm(s, a, 1);
+}
+
+static bool trans_STM_t32(DisasContext *s, arg_ldst_block *a)
+{
+ /* Writeback register in register list is UNPREDICTABLE for T32. */
+ if (a->w && (a->list & (1 << a->rn))) {
+ unallocated_encoding(s);
+ return true;
+ }
+ /* BitCount(list) < 2 is UNPREDICTABLE */
+ return op_stm(s, a, 2);
+}
+
+static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
+{
+ int i, j, n, list, mem_idx;
+ bool loaded_base;
+ bool user = a->u;
+ bool exc_return = false;
+ TCGv_i32 addr, tmp, loaded_var;
+
+ if (user) {
+ /* LDM (user), LDM (exception return) */
+ if (IS_USER(s)) {
+ /* Only usable in supervisor mode. */
+ unallocated_encoding(s);
+ return true;
+ }
+ if (extract32(a->list, 15, 1)) {
+ exc_return = true;
+ user = false;
+ } else {
+ /* LDM (user) does not allow writeback. */
+ if (a->w) {
+ unallocated_encoding(s);
+ return true;
+ }
+ }
+ }
+
+ list = a->list;
+ n = ctpop16(list);
+ if (n < min_n || a->rn == 15) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ s->eci_handled = true;
+
+ addr = op_addr_block_pre(s, a, n);
+ mem_idx = get_mem_index(s);
+ loaded_base = false;
+ loaded_var = NULL;
+
+ for (i = j = 0; i < 16; i++) {
+ if (!(list & (1 << i))) {
+ continue;
+ }
+
+ tmp = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN);
+ if (user) {
+ gen_helper_set_user_reg(cpu_env, tcg_constant_i32(i), tmp);
+ tcg_temp_free_i32(tmp);
+ } else if (i == a->rn) {
+ loaded_var = tmp;
+ loaded_base = true;
+ } else if (i == 15 && exc_return) {
+ store_pc_exc_ret(s, tmp);
+ } else {
+ store_reg_from_load(s, i, tmp);
+ }
+
+ /* No need to add after the last transfer. */
+ if (++j != n) {
+ tcg_gen_addi_i32(addr, addr, 4);
+ }
+ }
+
+ op_addr_block_post(s, a, addr, n);
+
+ if (loaded_base) {
+ /* Note that we reject base == pc above. */
+ store_reg(s, a->rn, loaded_var);
+ }
+
+ if (exc_return) {
+ /* Restore CPSR from SPSR. */
+ tmp = load_cpu_field(spsr);
+ if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
+ gen_io_start();
+ }
+ gen_helper_cpsr_write_eret(cpu_env, tmp);
+ tcg_temp_free_i32(tmp);
+ /* Must exit loop to check un-masked IRQs */
+ s->base.is_jmp = DISAS_EXIT;
+ }
+ clear_eci_state(s);
+ return true;
+}
+
+static bool trans_LDM_a32(DisasContext *s, arg_ldst_block *a)
+{
+ /*
+ * Writeback register in register list is UNPREDICTABLE
+ * for ArchVersion() >= 7. Prior to v7, A32 would write
+ * an UNKNOWN value to the base register.
+ */
+ if (ENABLE_ARCH_7 && a->w && (a->list & (1 << a->rn))) {
+ unallocated_encoding(s);
+ return true;
+ }
+ /* BitCount(list) < 1 is UNPREDICTABLE */
+ return do_ldm(s, a, 1);
+}
+
+static bool trans_LDM_t32(DisasContext *s, arg_ldst_block *a)
+{
+ /* Writeback register in register list is UNPREDICTABLE for T32. */
+ if (a->w && (a->list & (1 << a->rn))) {
+ unallocated_encoding(s);
+ return true;
+ }
+ /* BitCount(list) < 2 is UNPREDICTABLE */
+ return do_ldm(s, a, 2);
+}
+
+static bool trans_LDM_t16(DisasContext *s, arg_ldst_block *a)
+{
+ /* Writeback is conditional on the base register not being loaded. */
+ a->w = !(a->list & (1 << a->rn));
+ /* BitCount(list) < 1 is UNPREDICTABLE */
+ return do_ldm(s, a, 1);
+}
+
+static bool trans_CLRM(DisasContext *s, arg_CLRM *a)
+{
+ int i;
+ TCGv_i32 zero;
+
+ if (!dc_isar_feature(aa32_m_sec_state, s)) {
+ return false;
+ }
+
+ if (extract32(a->list, 13, 1)) {
+ return false;
+ }
+
+ if (!a->list) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+
+ s->eci_handled = true;
+
+ zero = tcg_constant_i32(0);
+ for (i = 0; i < 15; i++) {
+ if (extract32(a->list, i, 1)) {
+ /* Clear R[i] */
+ tcg_gen_mov_i32(cpu_R[i], zero);
+ }
+ }
+ if (extract32(a->list, 15, 1)) {
+ /*
+ * Clear APSR (by calling the MSR helper with the same argument
+ * as for "MSR APSR_nzcvqg, Rn": mask = 0b1100, SYSM=0)
+ */
+ gen_helper_v7m_msr(cpu_env, tcg_constant_i32(0xc00), zero);
+ }
+ clear_eci_state(s);
+ return true;
+}
+
+/*
+ * Branch, branch with link
+ */
+
+static bool trans_B(DisasContext *s, arg_i *a)
+{
+ gen_jmp(s, jmp_diff(s, a->imm));
+ return true;
+}
+
+static bool trans_B_cond_thumb(DisasContext *s, arg_ci *a)
+{
+ /* This has cond from encoding, required to be outside IT block. */
+ if (a->cond >= 0xe) {
+ return false;
+ }
+ if (s->condexec_mask) {
+ unallocated_encoding(s);
+ return true;
+ }
+ arm_skip_unless(s, a->cond);
+ gen_jmp(s, jmp_diff(s, a->imm));
+ return true;
+}
+
+static bool trans_BL(DisasContext *s, arg_i *a)
+{
+ gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb);
+ gen_jmp(s, jmp_diff(s, a->imm));
+ return true;
+}
+
+static bool trans_BLX_i(DisasContext *s, arg_BLX_i *a)
+{
+ /*
+ * BLX <imm> would be useless on M-profile; the encoding space
+ * is used for other insns from v8.1M onward, and UNDEFs before that.
+ */
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+
+ /* For A32, ARM_FEATURE_V5 is checked near the start of the uncond block. */
+ if (s->thumb && (a->imm & 2)) {
+ return false;
+ }
+ gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb);
+ store_cpu_field_constant(!s->thumb, thumb);
+ /* This jump is computed from an aligned PC: subtract off the low bits. */
+ gen_jmp(s, jmp_diff(s, a->imm - (s->pc_curr & 3)));
+ return true;
+}
+
+static bool trans_BL_BLX_prefix(DisasContext *s, arg_BL_BLX_prefix *a)
+{
+ assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2));
+ gen_pc_plus_diff(s, cpu_R[14], jmp_diff(s, a->imm << 12));
+ return true;
+}
+
+static bool trans_BL_suffix(DisasContext *s, arg_BL_suffix *a)
+{
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2));
+ tcg_gen_addi_i32(tmp, cpu_R[14], (a->imm << 1) | 1);
+ gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1);
+ gen_bx(s, tmp);
+ return true;
+}
+
+static bool trans_BLX_suffix(DisasContext *s, arg_BLX_suffix *a)
+{
+ TCGv_i32 tmp;
+
+ assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2));
+ if (!ENABLE_ARCH_5) {
+ return false;
+ }
+ tmp = tcg_temp_new_i32();
+ tcg_gen_addi_i32(tmp, cpu_R[14], a->imm << 1);
+ tcg_gen_andi_i32(tmp, tmp, 0xfffffffc);
+ gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1);
+ gen_bx(s, tmp);
+ return true;
+}
+
+static bool trans_BF(DisasContext *s, arg_BF *a)
+{
+ /*
+ * M-profile branch future insns. The architecture permits an
+ * implementation to implement these as NOPs (equivalent to
+ * discarding the LO_BRANCH_INFO cache immediately), and we
+ * take that IMPDEF option because for QEMU a "real" implementation
+ * would be complicated and wouldn't execute any faster.
+ */
+ if (!dc_isar_feature(aa32_lob, s)) {
+ return false;
+ }
+ if (a->boff == 0) {
+ /* SEE "Related encodings" (loop insns) */
+ return false;
+ }
+ /* Handle as NOP */
+ return true;
+}
+
+static bool trans_DLS(DisasContext *s, arg_DLS *a)
+{
+ /* M-profile low-overhead loop start */
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_lob, s)) {
+ return false;
+ }
+ if (a->rn == 13 || a->rn == 15) {
+ /*
+ * For DLSTP rn == 15 is a related encoding (LCTP); the
+ * other cases caught by this condition are all
+ * CONSTRAINED UNPREDICTABLE: we choose to UNDEF
+ */
+ return false;
+ }
+
+ if (a->size != 4) {
+ /* DLSTP */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+ }
+
+ /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */
+ tmp = load_reg(s, a->rn);
+ store_reg(s, 14, tmp);
+ if (a->size != 4) {
+ /* DLSTP: set FPSCR.LTPSIZE */
+ store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize);
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ }
+ return true;
+}
+
+static bool trans_WLS(DisasContext *s, arg_WLS *a)
+{
+ /* M-profile low-overhead while-loop start */
+ TCGv_i32 tmp;
+ DisasLabel nextlabel;
+
+ if (!dc_isar_feature(aa32_lob, s)) {
+ return false;
+ }
+ if (a->rn == 13 || a->rn == 15) {
+ /*
+ * For WLSTP rn == 15 is a related encoding (LE); the
+ * other cases caught by this condition are all
+ * CONSTRAINED UNPREDICTABLE: we choose to UNDEF
+ */
+ return false;
+ }
+ if (s->condexec_mask) {
+ /*
+ * WLS in an IT block is CONSTRAINED UNPREDICTABLE;
+ * we choose to UNDEF, because otherwise our use of
+ * gen_goto_tb(1) would clash with the use of TB exit 1
+ * in the dc->condjmp condition-failed codepath in
+ * arm_tr_tb_stop() and we'd get an assertion.
+ */
+ return false;
+ }
+ if (a->size != 4) {
+ /* WLSTP */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+ /*
+ * We need to check that the FPU is enabled here, but mustn't
+ * call vfp_access_check() to do that because we don't want to
+ * do the lazy state preservation in the "loop count is zero" case.
+ * Do the check-and-raise-exception by hand.
+ */
+ if (s->fp_excp_el) {
+ gen_exception_insn_el(s, 0, EXCP_NOCP,
+ syn_uncategorized(), s->fp_excp_el);
+ return true;
+ }
+ }
+
+ nextlabel = gen_disas_label(s);
+ tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel.label);
+ tmp = load_reg(s, a->rn);
+ store_reg(s, 14, tmp);
+ if (a->size != 4) {
+ /*
+ * WLSTP: set FPSCR.LTPSIZE. This requires that we do the
+ * lazy state preservation, new FP context creation, etc,
+ * that vfp_access_check() does. We know that the actual
+ * access check will succeed (ie it won't generate code that
+ * throws an exception) because we did that check by hand earlier.
+ */
+ bool ok = vfp_access_check(s);
+ assert(ok);
+ store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize);
+ /*
+ * LTPSIZE updated, but MVE_NO_PRED will always be the same thing (0)
+ * when we take this upcoming exit from this TB, so gen_jmp_tb() is OK.
+ */
+ }
+ gen_jmp_tb(s, curr_insn_len(s), 1);
+
+ set_disas_label(s, nextlabel);
+ gen_jmp(s, jmp_diff(s, a->imm));
+ return true;
+}
+
+static bool trans_LE(DisasContext *s, arg_LE *a)
+{
+ /*
+ * M-profile low-overhead loop end. The architecture permits an
+ * implementation to discard the LO_BRANCH_INFO cache at any time,
+ * and we take the IMPDEF option to never set it in the first place
+ * (equivalent to always discarding it immediately), because for QEMU
+ * a "real" implementation would be complicated and wouldn't execute
+ * any faster.
+ */
+ TCGv_i32 tmp;
+ DisasLabel loopend;
+ bool fpu_active;
+
+ if (!dc_isar_feature(aa32_lob, s)) {
+ return false;
+ }
+ if (a->f && a->tp) {
+ return false;
+ }
+ if (s->condexec_mask) {
+ /*
+ * LE in an IT block is CONSTRAINED UNPREDICTABLE;
+ * we choose to UNDEF, because otherwise our use of
+ * gen_goto_tb(1) would clash with the use of TB exit 1
+ * in the dc->condjmp condition-failed codepath in
+ * arm_tr_tb_stop() and we'd get an assertion.
+ */
+ return false;
+ }
+ if (a->tp) {
+ /* LETP */
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+ if (!vfp_access_check(s)) {
+ s->eci_handled = true;
+ return true;
+ }
+ }
+
+ /* LE/LETP is OK with ECI set and leaves it untouched */
+ s->eci_handled = true;
+
+ /*
+ * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE
+ * UsageFault exception for the LE insn in that case. Note that we
+ * are not directly checking FPSCR.LTPSIZE but instead check the
+ * pseudocode LTPSIZE() function, which returns 4 if the FPU is
+ * not currently active (ie ActiveFPState() returns false). We
+ * can identify not-active purely from our TB state flags, as the
+ * FPU is active only if:
+ * the FPU is enabled
+ * AND lazy state preservation is not active
+ * AND we do not need a new fp context (this is the ASPEN/FPCA check)
+ *
+ * Usually we don't need to care about this distinction between
+ * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check()
+ * will either take an exception or clear the conditions that make
+ * the FPU not active. But LE is an unusual case of a non-FP insn
+ * that looks at LTPSIZE.
+ */
+ fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed;
+
+ if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) {
+ /* Need to do a runtime check for LTPSIZE != 4 */
+ DisasLabel skipexc = gen_disas_label(s);
+ tmp = load_cpu_field(v7m.ltpsize);
+ tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc.label);
+ tcg_temp_free_i32(tmp);
+ gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized());
+ set_disas_label(s, skipexc);
+ }
+
+ if (a->f) {
+ /* Loop-forever: just jump back to the loop start */
+ gen_jmp(s, jmp_diff(s, -a->imm));
+ return true;
+ }
+
+ /*
+ * Not loop-forever. If LR <= loop-decrement-value this is the last loop.
+ * For LE, we know at this point that LTPSIZE must be 4 and the
+ * loop decrement value is 1. For LETP we need to calculate the decrement
+ * value from LTPSIZE.
+ */
+ loopend = gen_disas_label(s);
+ if (!a->tp) {
+ tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend.label);
+ tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1);
+ } else {
+ /*
+ * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local
+ * so that decr stays live after the brcondi.
+ */
+ TCGv_i32 decr = tcg_temp_local_new_i32();
+ TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize);
+ tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize);
+ tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr);
+ tcg_temp_free_i32(ltpsize);
+
+ tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend.label);
+
+ tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr);
+ tcg_temp_free_i32(decr);
+ }
+ /* Jump back to the loop start */
+ gen_jmp(s, jmp_diff(s, -a->imm));
+
+ set_disas_label(s, loopend);
+ if (a->tp) {
+ /* Exits from tail-pred loops must reset LTPSIZE to 4 */
+ store_cpu_field(tcg_constant_i32(4), v7m.ltpsize);
+ }
+ /* End TB, continuing to following insn */
+ gen_jmp_tb(s, curr_insn_len(s), 1);
+ return true;
+}
+
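For reference, the decrement discussed in trans_LE above is 1 << (4 - LTPSIZE): the number of elements a 128-bit MVE vector covers per iteration of a tail-predicated loop, or 1 when LTPSIZE is 4 (no tail predication, i.e. a plain LE). A tiny standalone illustration (function name invented):

    #include <assert.h>

    static int le_decrement(int ltpsize)
    {
        return 1 << (4 - ltpsize);
    }

    int main(void)
    {
        assert(le_decrement(0) == 16);  /* 16 byte elements per iteration */
        assert(le_decrement(2) == 4);   /*  4 word elements per iteration */
        assert(le_decrement(4) == 1);   /* ordinary LE */
        return 0;
    }
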
+static bool trans_LCTP(DisasContext *s, arg_LCTP *a)
+{
+ /*
+ * M-profile Loop Clear with Tail Predication. Since our implementation
+ * doesn't cache branch information, all we need to do is reset
+ * FPSCR.LTPSIZE to 4.
+ */
+
+ if (!dc_isar_feature(aa32_lob, s) ||
+ !dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ store_cpu_field_constant(4, v7m.ltpsize);
+ return true;
+}
+
+static bool trans_VCTP(DisasContext *s, arg_VCTP *a)
+{
+ /*
+ * M-profile Create Vector Tail Predicate. This insn is itself
+ * predicated and is subject to beatwise execution.
+ */
+ TCGv_i32 rn_shifted, masklen;
+
+ if (!dc_isar_feature(aa32_mve, s) || a->rn == 13 || a->rn == 15) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * We pre-calculate the mask length here to avoid having
+ * to have multiple helpers specialized for size.
+ * We pass the helper "rn <= (1 << (4 - size)) ? (rn << size) : 16".
+ */
+ rn_shifted = tcg_temp_new_i32();
+ masklen = load_reg(s, a->rn);
+ tcg_gen_shli_i32(rn_shifted, masklen, a->size);
+ tcg_gen_movcond_i32(TCG_COND_LEU, masklen,
+ masklen, tcg_constant_i32(1 << (4 - a->size)),
+ rn_shifted, tcg_constant_i32(16));
+ gen_helper_mve_vctp(cpu_env, masklen);
+ tcg_temp_free_i32(masklen);
+ tcg_temp_free_i32(rn_shifted);
+ /* This insn updates predication bits */
+ s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+ mve_update_eci(s);
+ return true;
+}
+
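A standalone version of the mask-length expression quoted in the trans_VCTP comment above, with a couple of concrete values (function name invented):

    #include <assert.h>
    #include <stdint.h>

    /* Byte length of the predicate mask set up by VCTP: min(rn, elements
     * per 128-bit vector) scaled up to bytes.  size: 0 = .8, 1 = .16, 2 = .32.
     */
    static uint32_t vctp_masklen(uint32_t rn, int size)
    {
        return rn <= (1u << (4 - size)) ? (rn << size) : 16;
    }

    int main(void)
    {
        assert(vctp_masklen(3, 2) == 12);  /* VCTP.32, count 3: first 3 words */
        assert(vctp_masklen(7, 2) == 16);  /* count >= 4 words: whole vector */
        assert(vctp_masklen(5, 0) == 5);   /* VCTP.8, count 5: first 5 bytes */
        return 0;
    }
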
+static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
+{
+ TCGv_i32 addr, tmp;
+
+ tmp = load_reg(s, a->rm);
+ if (half) {
+ tcg_gen_add_i32(tmp, tmp, tmp);
+ }
+ addr = load_reg(s, a->rn);
+ tcg_gen_add_i32(addr, addr, tmp);
+
+ gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), half ? MO_UW : MO_UB);
+
+ tcg_gen_add_i32(tmp, tmp, tmp);
+ gen_pc_plus_diff(s, addr, jmp_diff(s, 0));
+ tcg_gen_add_i32(tmp, tmp, addr);
+ tcg_temp_free_i32(addr);
+ store_reg(s, 15, tmp);
+ return true;
+}
+
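For reference, the branch target op_tbranch computes is PC + 2 * entry, where entry is the byte (TBB) or halfword (TBH) loaded from the table and the Thumb PC reads as the instruction address plus 4. A minimal standalone model (function name invented):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t tb_target(uint32_t insn_addr, uint32_t table_entry)
    {
        return (insn_addr + 4) + 2 * table_entry;
    }

    int main(void)
    {
        /* TBB at 0x1000 reading a table entry of 6 branches to 0x1010 */
        assert(tb_target(0x1000, 6) == 0x1010);
        return 0;
    }
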
+static bool trans_TBB(DisasContext *s, arg_tbranch *a)
+{
+ return op_tbranch(s, a, false);
+}
+
+static bool trans_TBH(DisasContext *s, arg_tbranch *a)
+{
+ return op_tbranch(s, a, true);
+}
+
+static bool trans_CBZ(DisasContext *s, arg_CBZ *a)
+{
+ TCGv_i32 tmp = load_reg(s, a->rn);
+
+ arm_gen_condlabel(s);
+ tcg_gen_brcondi_i32(a->nz ? TCG_COND_EQ : TCG_COND_NE,
+ tmp, 0, s->condlabel.label);
+ tcg_temp_free_i32(tmp);
+ gen_jmp(s, jmp_diff(s, a->imm));
+ return true;
+}
+
+/*
+ * Supervisor call - both T32 & A32 come here so we need to check
+ * which mode we are in when checking for semihosting.
+ */
+
+static bool trans_SVC(DisasContext *s, arg_SVC *a)
+{
+ const uint32_t semihost_imm = s->thumb ? 0xab : 0x123456;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M) &&
+ semihosting_enabled(s->current_el == 0) &&
+ (a->imm == semihost_imm)) {
+ gen_exception_internal_insn(s, EXCP_SEMIHOST);
+ } else {
+ if (s->fgt_svc) {
+ uint32_t syndrome = syn_aa32_svc(a->imm, s->thumb);
+ gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2);
+ } else {
+ gen_update_pc(s, curr_insn_len(s));
+ s->svc_imm = a->imm;
+ s->base.is_jmp = DISAS_SWI;
+ }
+ }
+ return true;
+}
+
+/*
+ * Unconditional system instructions
+ */
+
+static bool trans_RFE(DisasContext *s, arg_RFE *a)
+{
+ static const int8_t pre_offset[4] = {
+ /* DA */ -4, /* IA */ 0, /* DB */ -8, /* IB */ 4
+ };
+ static const int8_t post_offset[4] = {
+ /* DA */ -8, /* IA */ 4, /* DB */ -4, /* IB */ 0
+ };
+ TCGv_i32 addr, t1, t2;
+
+ if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ addr = load_reg(s, a->rn);
+ tcg_gen_addi_i32(addr, addr, pre_offset[a->pu]);
+
+ /* Load PC into tmp and CPSR into tmp2. */
+ t1 = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, t1, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+ tcg_gen_addi_i32(addr, addr, 4);
+ t2 = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, t2, addr, get_mem_index(s), MO_UL | MO_ALIGN);
+
+ if (a->w) {
+ /* Base writeback. */
+ tcg_gen_addi_i32(addr, addr, post_offset[a->pu]);
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+ gen_rfe(s, t1, t2);
+ return true;
+}
+
+static bool trans_SRS(DisasContext *s, arg_SRS *a)
+{
+ if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ gen_srs(s, a->mode, a->pu, a->w);
+ return true;
+}
+
+static bool trans_CPS(DisasContext *s, arg_CPS *a)
+{
+ uint32_t mask, val;
+
+ if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ /* Implemented as NOP in user mode. */
+ return true;
+ }
+ /* TODO: There are quite a lot of UNPREDICTABLE argument combinations. */
+
+ mask = val = 0;
+ if (a->imod & 2) {
+ if (a->A) {
+ mask |= CPSR_A;
+ }
+ if (a->I) {
+ mask |= CPSR_I;
+ }
+ if (a->F) {
+ mask |= CPSR_F;
+ }
+ if (a->imod & 1) {
+ val |= mask;
+ }
+ }
+ if (a->M) {
+ mask |= CPSR_M;
+ val |= a->mode;
+ }
+ if (mask) {
+ gen_set_psr_im(s, mask, 0, val);
+ }
+ return true;
+}
+
+static bool trans_CPS_v7m(DisasContext *s, arg_CPS_v7m *a)
+{
+ TCGv_i32 tmp, addr;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ if (IS_USER(s)) {
+ /* Implemented as NOP in user mode. */
+ return true;
+ }
+
+ tmp = tcg_constant_i32(a->im);
+ /* FAULTMASK */
+ if (a->F) {
+ addr = tcg_constant_i32(19);
+ gen_helper_v7m_msr(cpu_env, addr, tmp);
+ }
+ /* PRIMASK */
+ if (a->I) {
+ addr = tcg_constant_i32(16);
+ gen_helper_v7m_msr(cpu_env, addr, tmp);
+ }
+ gen_rebuild_hflags(s, false);
+ gen_lookup_tb(s);
+ return true;
+}
+
+/*
+ * Clear-Exclusive, Barriers
+ */
+
+static bool trans_CLREX(DisasContext *s, arg_CLREX *a)
+{
+ if (s->thumb
+ ? !ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)
+ : !ENABLE_ARCH_6K) {
+ return false;
+ }
+ gen_clrex(s);
+ return true;
+}
+
+static bool trans_DSB(DisasContext *s, arg_DSB *a)
+{
+ if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+ return true;
+}
+
+static bool trans_DMB(DisasContext *s, arg_DMB *a)
+{
+ return trans_DSB(s, NULL);
+}
+
+static bool trans_ISB(DisasContext *s, arg_ISB *a)
+{
+ if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) {
+ return false;
+ }
+ /*
+ * We need to break the TB after this insn to execute
+ * self-modifying code correctly and also to take
+ * any pending interrupts immediately.
+ */
+ s->base.is_jmp = DISAS_TOO_MANY;
+ return true;
+}
+
+static bool trans_SB(DisasContext *s, arg_SB *a)
+{
+ if (!dc_isar_feature(aa32_sb, s)) {
+ return false;
+ }
+ /*
+ * TODO: There is no speculation barrier opcode
+ * for TCG; MB and end the TB instead.
+ */
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+ s->base.is_jmp = DISAS_TOO_MANY;
+ return true;
+}
+
+static bool trans_SETEND(DisasContext *s, arg_SETEND *a)
+{
+ if (!ENABLE_ARCH_6) {
+ return false;
+ }
+ if (a->E != (s->be_data == MO_BE)) {
+ gen_helper_setend(cpu_env);
+ s->base.is_jmp = DISAS_UPDATE_EXIT;
+ }
+ return true;
+}
+
+/*
+ * Preload instructions
+ * All are nops, contingent on the appropriate arch level.
+ */
+
+static bool trans_PLD(DisasContext *s, arg_PLD *a)
+{
+ return ENABLE_ARCH_5TE;
+}
+
+static bool trans_PLDW(DisasContext *s, arg_PLD *a)
+{
+ return arm_dc_feature(s, ARM_FEATURE_V7MP);
+}
+
+static bool trans_PLI(DisasContext *s, arg_PLD *a)
+{
+ return ENABLE_ARCH_7;
+}
+
+/*
+ * If-then
+ */
+
+static bool trans_IT(DisasContext *s, arg_IT *a)
+{
+ int cond_mask = a->cond_mask;
+
+ /*
+ * No actual code generated for this insn, just setup state.
+ *
+     * Combinations of firstcond and mask which set up a 0b1111
+ * condition are UNPREDICTABLE; we take the CONSTRAINED
+ * UNPREDICTABLE choice to treat 0b1111 the same as 0b1110,
+ * i.e. both meaning "execute always".
+ */
+ s->condexec_cond = (cond_mask >> 4) & 0xe;
+ s->condexec_mask = cond_mask & 0x1f;
+ return true;
+}
+
+/* v8.1M CSEL/CSINC/CSNEG/CSINV */
+static bool trans_CSEL(DisasContext *s, arg_CSEL *a)
+{
+ TCGv_i32 rn, rm, zero;
+ DisasCompare c;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+
+ if (a->rm == 13) {
+ /* SEE "Related encodings" (MVE shifts) */
+ return false;
+ }
+
+ if (a->rd == 13 || a->rd == 15 || a->rn == 13 || a->fcond >= 14) {
+ /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+ return false;
+ }
+
+ /* In this insn input reg fields of 0b1111 mean "zero", not "PC" */
+ zero = tcg_constant_i32(0);
+ if (a->rn == 15) {
+ rn = zero;
+ } else {
+ rn = load_reg(s, a->rn);
+ }
+ if (a->rm == 15) {
+ rm = zero;
+ } else {
+ rm = load_reg(s, a->rm);
+ }
+
+ switch (a->op) {
+ case 0: /* CSEL */
+ break;
+ case 1: /* CSINC */
+ tcg_gen_addi_i32(rm, rm, 1);
+ break;
+ case 2: /* CSINV */
+ tcg_gen_not_i32(rm, rm);
+ break;
+ case 3: /* CSNEG */
+ tcg_gen_neg_i32(rm, rm);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ arm_test_cc(&c, a->fcond);
+ tcg_gen_movcond_i32(c.cond, rn, c.value, zero, rn, rm);
+ arm_free_cc(&c);
+
+ store_reg(s, a->rd, rn);
+ tcg_temp_free_i32(rm);
+
+ return true;
+}
+
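The combined effect of the movcond and the op-specific transform above can be summarised in a few lines. This is an illustrative model of the architectural result (function name invented), not the generated TCG; a source register field of 15 reads as zero for these encodings, as handled above.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* op: 0 = CSEL, 1 = CSINC, 2 = CSINV, 3 = CSNEG */
    static uint32_t csel_family(int op, bool cond_passed, uint32_t rn, uint32_t rm)
    {
        uint32_t alt = rm;
        switch (op) {
        case 1: alt = rm + 1; break;
        case 2: alt = ~rm;    break;
        case 3: alt = -rm;    break;
        default:              break;
        }
        return cond_passed ? rn : alt;
    }

    int main(void)
    {
        assert(csel_family(1, false, 7, 0) == 1);            /* CSINC, Rm zero */
        assert(csel_family(3, false, 7, 5) == (uint32_t)-5); /* CSNEG */
        assert(csel_family(2, true, 7, 5) == 7);             /* condition passed */
        return 0;
    }
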
+/*
+ * Legacy decoder.
+ */
+
+static void disas_arm_insn(DisasContext *s, unsigned int insn)
+{
+ unsigned int cond = insn >> 28;
+
+ /* M variants do not implement ARM mode; this must raise the INVSTATE
+ * UsageFault exception.
+ */
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized());
+ return;
+ }
+
+ if (s->pstate_il) {
+ /*
+ * Illegal execution state. This has priority over BTI
+ * exceptions, but comes after instruction abort exceptions.
+ */
+ gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate());
+ return;
+ }
+
+ if (cond == 0xf) {
+ /* In ARMv3 and v4 the NV condition is UNPREDICTABLE; we
+ * choose to UNDEF. In ARMv5 and above the space is used
+ * for miscellaneous unconditional instructions.
+ */
+ if (!arm_dc_feature(s, ARM_FEATURE_V5)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Unconditional instructions. */
+ /* TODO: Perhaps merge these into one decodetree output file. */
+ if (disas_a32_uncond(s, insn) ||
+ disas_vfp_uncond(s, insn) ||
+ disas_neon_dp(s, insn) ||
+ disas_neon_ls(s, insn) ||
+ disas_neon_shared(s, insn)) {
+ return;
+ }
+ /* fall back to legacy decoder */
+
+ if ((insn & 0x0e000f00) == 0x0c000100) {
+ if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) {
+ /* iWMMXt register transfer. */
+ if (extract32(s->c15_cpar, 1, 1)) {
+ if (!disas_iwmmxt_insn(s, insn)) {
+ return;
+ }
+ }
+ }
+ }
+ goto illegal_op;
+ }
+ if (cond != 0xe) {
+        /* If not always executed, generate a conditional jump to the
+           next instruction */
+ arm_skip_unless(s, cond);
+ }
+
+ /* TODO: Perhaps merge these into one decodetree output file. */
+ if (disas_a32(s, insn) ||
+ disas_vfp(s, insn)) {
+ return;
+ }
+ /* fall back to legacy decoder */
+ /* TODO: convert xscale/iwmmxt decoder to decodetree ?? */
+ if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) {
+ if (((insn & 0x0c000e00) == 0x0c000000)
+ && ((insn & 0x03000000) != 0x03000000)) {
+ /* Coprocessor insn, coprocessor 0 or 1 */
+ disas_xscale_insn(s, insn);
+ return;
+ }
+ }
+
+illegal_op:
+ unallocated_encoding(s);
+}
+
+static bool thumb_insn_is_16bit(DisasContext *s, uint32_t pc, uint32_t insn)
+{
+ /*
+ * Return true if this is a 16 bit instruction. We must be precise
+ * about this (matching the decode).
+ */
+ if ((insn >> 11) < 0x1d) {
+ /* Definitely a 16-bit instruction */
+ return true;
+ }
+
+ /* Top five bits 0b11101 / 0b11110 / 0b11111 : this is the
+ * first half of a 32-bit Thumb insn. Thumb-1 cores might
+ * end up actually treating this as two 16-bit insns, though,
+ * if it's half of a bl/blx pair that might span a page boundary.
+ */
+ if (arm_dc_feature(s, ARM_FEATURE_THUMB2) ||
+ arm_dc_feature(s, ARM_FEATURE_M)) {
+ /* Thumb2 cores (including all M profile ones) always treat
+ * 32-bit insns as 32-bit.
+ */
+ return false;
+ }
+
+ if ((insn >> 11) == 0x1e && pc - s->page_start < TARGET_PAGE_SIZE - 3) {
+ /* 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix, and the suffix
+ * is not on the next page; we merge this into a 32-bit
+ * insn.
+ */
+ return false;
+ }
+ /* 0b1110_1xxx_xxxx_xxxx : BLX suffix (or UNDEF);
+ * 0b1111_1xxx_xxxx_xxxx : BL suffix;
+ * 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix on the end of a page
+ * -- handle as single 16 bit insn
+ */
+ return true;
+}
+
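The classification above boils down to a single comparison on the first halfword: values with the top five bits 0b11101, 0b11110 or 0b11111 start a 32-bit instruction. A standalone sketch with a few well-known encodings (function name invented); it deliberately ignores the Thumb-1 BL/BLX page-crossing special case that the function handles for pre-Thumb-2 cores:

    #include <assert.h>
    #include <stdint.h>

    static int is_32bit_prefix(uint16_t hw1)
    {
        return (hw1 >> 11) >= 0x1d;
    }

    int main(void)
    {
        assert(!is_32bit_prefix(0x4770));  /* BX lr            : 16-bit     */
        assert(!is_32bit_prefix(0xb500));  /* PUSH {lr}        : 16-bit     */
        assert(is_32bit_prefix(0xf000));   /* BL/BLX prefix    : 32-bit hw1 */
        assert(is_32bit_prefix(0xe92d));   /* STMDB/PUSH.W hw1 : 32-bit hw1 */
        return 0;
    }
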
+/* Translate a 32-bit thumb instruction. */
+static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
+{
+ /*
+ * ARMv6-M supports a limited subset of Thumb2 instructions.
+ * Other Thumb1 architectures allow only 32-bit
+ * combined BL/BLX prefix and suffix.
+ */
+ if (arm_dc_feature(s, ARM_FEATURE_M) &&
+ !arm_dc_feature(s, ARM_FEATURE_V7)) {
+ int i;
+ bool found = false;
+ static const uint32_t armv6m_insn[] = {0xf3808000 /* msr */,
+ 0xf3b08040 /* dsb */,
+ 0xf3b08050 /* dmb */,
+ 0xf3b08060 /* isb */,
+ 0xf3e08000 /* mrs */,
+ 0xf000d000 /* bl */};
+ static const uint32_t armv6m_mask[] = {0xffe0d000,
+ 0xfff0d0f0,
+ 0xfff0d0f0,
+ 0xfff0d0f0,
+ 0xffe0d000,
+ 0xf800d000};
+
+ for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) {
+ if ((insn & armv6m_mask[i]) == armv6m_insn[i]) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ goto illegal_op;
+ }
+ } else if ((insn & 0xf800e800) != 0xf000e800) {
+ if (!arm_dc_feature(s, ARM_FEATURE_THUMB2)) {
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ /*
+ * NOCP takes precedence over any UNDEF for (almost) the
+ * entire wide range of coprocessor-space encodings, so check
+ * for it first before proceeding to actually decode eg VFP
+ * insns. This decode also handles the few insns which are
+ * in copro space but do not have NOCP checks (eg VLLDM, VLSTM).
+ */
+ if (disas_m_nocp(s, insn)) {
+ return;
+ }
+ }
+
+ if ((insn & 0xef000000) == 0xef000000) {
+ /*
+ * T32 encodings 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq
+ * transform into
+ * A32 encodings 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq
+ */
+ uint32_t a32_insn = (insn & 0xe2ffffff) |
+ ((insn & (1 << 28)) >> 4) | (1 << 28);
+
+ if (disas_neon_dp(s, a32_insn)) {
+ return;
+ }
+ }
+
+ if ((insn & 0xff100000) == 0xf9000000) {
+ /*
+ * T32 encodings 0b1111_1001_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq
+ * transform into
+ * A32 encodings 0b1111_0100_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq
+ */
+ uint32_t a32_insn = (insn & 0x00ffffff) | 0xf4000000;
+
+ if (disas_neon_ls(s, a32_insn)) {
+ return;
+ }
+ }
+
+ /*
+ * TODO: Perhaps merge these into one decodetree output file.
+ * Note disas_vfp is written for a32 with cond field in the
+ * top nibble. The t32 encoding requires 0xe in the top nibble.
+ */
+ if (disas_t32(s, insn) ||
+ disas_vfp_uncond(s, insn) ||
+ disas_neon_shared(s, insn) ||
+ disas_mve(s, insn) ||
+ ((insn >> 28) == 0xe && disas_vfp(s, insn))) {
+ return;
+ }
+
+illegal_op:
+ unallocated_encoding(s);
+}
+
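The bit rewrite used above for the Neon data-processing space can be checked in isolation: T32 words with a 0xef/0xff leading byte become the corresponding A32 0xf2/0xf3 encodings. A standalone sketch (function name invented; the sample words only exercise the bit manipulation and are not claimed to be valid instructions):

    #include <assert.h>
    #include <stdint.h>

    /* Keep the low 24 bits, move T32 bit 28 ("p") down to A32 bit 24,
     * and force the A32 0b1111_001p pattern into bits [31:24].
     */
    static uint32_t t32_neon_dp_to_a32(uint32_t insn)
    {
        return (insn & 0xe2ffffff) | ((insn & (1u << 28)) >> 4) | (1u << 28);
    }

    int main(void)
    {
        assert(t32_neon_dp_to_a32(0xef012345) == 0xf2012345);
        assert(t32_neon_dp_to_a32(0xff012345) == 0xf3012345);
        return 0;
    }
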
+static void disas_thumb_insn(DisasContext *s, uint32_t insn)
+{
+ if (!disas_t16(s, insn)) {
+ unallocated_encoding(s);
+ }
+}
+
+static bool insn_crosses_page(CPUARMState *env, DisasContext *s)
+{
+ /* Return true if the insn at dc->base.pc_next might cross a page boundary.
+ * (False positives are OK, false negatives are not.)
+ * We know this is a Thumb insn, and our caller ensures we are
+ * only called if dc->base.pc_next is less than 4 bytes from the page
+ * boundary, so we cross the page if the first 16 bits indicate
+ * that this is a 32 bit insn.
+ */
+ uint16_t insn = arm_lduw_code(env, &s->base, s->base.pc_next, s->sctlr_b);
+
+ return !thumb_insn_is_16bit(s, s->base.pc_next, insn);
+}
+
+static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ CPUARMState *env = cs->env_ptr;
+ ARMCPU *cpu = env_archcpu(env);
+ CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb);
+ uint32_t condexec, core_mmu_idx;
+
+ dc->isar = &cpu->isar;
+ dc->condjmp = 0;
+ dc->pc_save = dc->base.pc_first;
+ dc->aarch64 = false;
+ dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB);
+ dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? MO_BE : MO_LE;
+ condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC);
+ /*
+ * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this
+ * is always the IT bits. On M-profile, some of the reserved encodings
+ * of IT are used instead to indicate either ICI or ECI, which
+ * indicate partial progress of a restartable insn that was interrupted
+ * partway through by an exception:
+ * * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits
+ * * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits
+ * In all cases CONDEXEC == 0 means "not in IT block or restartable
+ * insn, behave normally".
+ */
+ dc->eci = dc->condexec_mask = dc->condexec_cond = 0;
+ dc->eci_handled = false;
+ if (condexec & 0xf) {
+ dc->condexec_mask = (condexec & 0xf) << 1;
+ dc->condexec_cond = condexec >> 4;
+ } else {
+ if (arm_feature(env, ARM_FEATURE_M)) {
+ dc->eci = condexec >> 4;
+ }
+ }
+
+ core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX);
+ dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx);
+ dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx);
+#if !defined(CONFIG_USER_ONLY)
+ dc->user = (dc->current_el == 0);
+#endif
+ dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL);
+ dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM);
+ dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL);
+ dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE);
+ dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC);
+
+ if (arm_feature(env, ARM_FEATURE_M)) {
+ dc->vfp_enabled = 1;
+ dc->be_data = MO_TE;
+ dc->v7m_handler_mode = EX_TBFLAG_M32(tb_flags, HANDLER);
+ dc->v8m_secure = EX_TBFLAG_M32(tb_flags, SECURE);
+ dc->v8m_stackcheck = EX_TBFLAG_M32(tb_flags, STACKCHECK);
+ dc->v8m_fpccr_s_wrong = EX_TBFLAG_M32(tb_flags, FPCCR_S_WRONG);
+ dc->v7m_new_fp_ctxt_needed =
+ EX_TBFLAG_M32(tb_flags, NEW_FP_CTXT_NEEDED);
+ dc->v7m_lspact = EX_TBFLAG_M32(tb_flags, LSPACT);
+ dc->mve_no_pred = EX_TBFLAG_M32(tb_flags, MVE_NO_PRED);
+ } else {
+ dc->sctlr_b = EX_TBFLAG_A32(tb_flags, SCTLR__B);
+ dc->hstr_active = EX_TBFLAG_A32(tb_flags, HSTR_ACTIVE);
+ dc->ns = EX_TBFLAG_A32(tb_flags, NS);
+ dc->vfp_enabled = EX_TBFLAG_A32(tb_flags, VFPEN);
+ if (arm_feature(env, ARM_FEATURE_XSCALE)) {
+ dc->c15_cpar = EX_TBFLAG_A32(tb_flags, XSCALE_CPAR);
+ } else {
+ dc->vec_len = EX_TBFLAG_A32(tb_flags, VECLEN);
+ dc->vec_stride = EX_TBFLAG_A32(tb_flags, VECSTRIDE);
+ }
+ dc->sme_trap_nonstreaming =
+ EX_TBFLAG_A32(tb_flags, SME_TRAP_NONSTREAMING);
+ }
+ dc->cp_regs = cpu->cp_regs;
+ dc->features = env->features;
+
+ /* Single step state. The code-generation logic here is:
+ * SS_ACTIVE == 0:
+ * generate code with no special handling for single-stepping (except
+ * that anything that can make us go to SS_ACTIVE == 1 must end the TB;
+ * this happens anyway because those changes are all system register or
+ * PSTATE writes).
+ * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending)
+ * emit code for one insn
+ * emit code to clear PSTATE.SS
+ * emit code to generate software step exception for completed step
+ * end TB (as usual for having generated an exception)
+ * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending)
+ * emit code to generate a software step exception
+ * end the TB
+ */
+ dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE);
+ dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS);
+ dc->is_ldex = false;
+
+ dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK;
+
+ /* If architectural single step active, limit to 1. */
+ if (dc->ss_active) {
+ dc->base.max_insns = 1;
+ }
+
+ /* ARM is a fixed-length ISA. Bound the number of insns to execute
+ to those left on the page. */
+ if (!dc->thumb) {
+ int bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4;
+ dc->base.max_insns = MIN(dc->base.max_insns, bound);
+ }
+
+ cpu_V0 = tcg_temp_new_i64();
+ cpu_V1 = tcg_temp_new_i64();
+ cpu_M0 = tcg_temp_new_i64();
+}
+
+static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+
+ /* A note on handling of the condexec (IT) bits:
+ *
+ * We want to avoid the overhead of having to write the updated condexec
+ * bits back to the CPUARMState for every instruction in an IT block. So:
+ * (1) if the condexec bits are not already zero then we write
+ * zero back into the CPUARMState now. This avoids complications trying
+ * to do it at the end of the block. (For example if we don't do this
+ * it's hard to identify whether we can safely skip writing condexec
+ * at the end of the TB, which we definitely want to do for the case
+ * where a TB doesn't do anything with the IT state at all.)
+ * (2) if we are going to leave the TB then we call gen_set_condexec()
+ * which will write the correct value into CPUARMState if zero is wrong.
+ * This is done both for leaving the TB at the end, and for leaving
+ * it because of an exception we know will happen, which is done in
+ * gen_exception_insn(). The latter is necessary because we need to
+ * leave the TB with the PC/IT state just prior to execution of the
+ * instruction which caused the exception.
+ * (3) if we leave the TB unexpectedly (eg a data abort on a load)
+ * then the CPUARMState will be wrong and we need to reset it.
+ * This is handled in the same way as restoration of the
+ * PC in these situations; we save the value of the condexec bits
+ * for each PC via tcg_gen_insn_start(), and restore_state_to_opc()
+ * then uses this to restore them after an exception.
+ *
+ * Note that there are no instructions which can read the condexec
+ * bits, and none which can write non-static values to them, so
+ * we don't need to care about whether CPUARMState is correct in the
+ * middle of a TB.
+ */
+
+ /* Reset the conditional execution bits immediately. This avoids
+ complications trying to do it at the end of the block. */
+ if (dc->condexec_mask || dc->condexec_cond) {
+ store_cpu_field_constant(0, condexec_bits);
+ }
+}
+
+static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ /*
+ * The ECI/ICI bits share PSR bits with the IT bits, so we
+ * need to reconstitute the bits from the split-out DisasContext
+ * fields here.
+ */
+ uint32_t condexec_bits;
+ target_ulong pc_arg = dc->base.pc_next;
+
+ if (TARGET_TB_PCREL) {
+ pc_arg &= ~TARGET_PAGE_MASK;
+ }
+ if (dc->eci) {
+ condexec_bits = dc->eci << 4;
+ } else {
+ condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1);
+ }
+ tcg_gen_insn_start(pc_arg, condexec_bits, 0);
+ dc->insn_start = tcg_last_op();
+}
+
+static bool arm_check_kernelpage(DisasContext *dc)
+{
+#ifdef CONFIG_USER_ONLY
+ /* Intercept jump to the magic kernel page. */
+ if (dc->base.pc_next >= 0xffff0000) {
+        /* We always get here via a jump, so we know we are not in a
+           conditional execution block. */
+ gen_exception_internal(EXCP_KERNEL_TRAP);
+ dc->base.is_jmp = DISAS_NORETURN;
+ return true;
+ }
+#endif
+ return false;
+}
+
+static bool arm_check_ss_active(DisasContext *dc)
+{
+ if (dc->ss_active && !dc->pstate_ss) {
+ /* Singlestep state is Active-pending.
+ * If we're in this state at the start of a TB then either
+ * a) we just took an exception to an EL which is being debugged
+ * and this is the first insn in the exception handler
+ * b) debug exceptions were masked and we just unmasked them
+ * without changing EL (eg by clearing PSTATE.D)
+ * In either case we're going to take a swstep exception in the
+ * "did not step an insn" case, and so the syndrome ISV and EX
+ * bits should be zero.
+ */
+ assert(dc->base.num_insns == 1);
+ gen_swstep_exception(dc, 0, 0);
+ dc->base.is_jmp = DISAS_NORETURN;
+ return true;
+ }
+
+ return false;
+}
+
+static void arm_post_translate_insn(DisasContext *dc)
+{
+ if (dc->condjmp && dc->base.is_jmp == DISAS_NEXT) {
+ if (dc->pc_save != dc->condlabel.pc_save) {
+ gen_update_pc(dc, dc->condlabel.pc_save - dc->pc_save);
+ }
+ gen_set_label(dc->condlabel.label);
+ dc->condjmp = 0;
+ }
+ translator_loop_temp_check(&dc->base);
+}
+
+static void arm_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ CPUARMState *env = cpu->env_ptr;
+ uint32_t pc = dc->base.pc_next;
+ unsigned int insn;
+
+ /* Singlestep exceptions have the highest priority. */
+ if (arm_check_ss_active(dc)) {
+ dc->base.pc_next = pc + 4;
+ return;
+ }
+
+ if (pc & 3) {
+ /*
+ * PC alignment fault. This has priority over the instruction abort
+ * that we would receive from a translation fault via arm_ldl_code
+ * (or the execution of the kernelpage entrypoint). This should only
+ * be possible after an indirect branch, at the start of the TB.
+ */
+ assert(dc->base.num_insns == 1);
+ gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc));
+ dc->base.is_jmp = DISAS_NORETURN;
+ dc->base.pc_next = QEMU_ALIGN_UP(pc, 4);
+ return;
+ }
+
+ if (arm_check_kernelpage(dc)) {
+ dc->base.pc_next = pc + 4;
+ return;
+ }
+
+ dc->pc_curr = pc;
+ insn = arm_ldl_code(env, &dc->base, pc, dc->sctlr_b);
+ dc->insn = insn;
+ dc->base.pc_next = pc + 4;
+ disas_arm_insn(dc, insn);
+
+ arm_post_translate_insn(dc);
+
+ /* ARM is a fixed-length ISA. We performed the cross-page check
+ in init_disas_context by adjusting max_insns. */
+}
+
+static bool thumb_insn_is_unconditional(DisasContext *s, uint32_t insn)
+{
+ /* Return true if this Thumb insn is always unconditional,
+ * even inside an IT block. This is true of only a very few
+ * instructions: BKPT, HLT, and SG.
+ *
+ * A larger class of instructions are UNPREDICTABLE if used
+ * inside an IT block; we do not need to detect those here, because
+ * what we do by default (perform the cc check and update the IT
+ * bits state machine) is a permitted CONSTRAINED UNPREDICTABLE
+ * choice for those situations.
+ *
+ * insn is either a 16-bit or a 32-bit instruction; the two are
+ * distinguishable because for the 16-bit case the top 16 bits
+ * are zeroes, and that isn't a valid 32-bit encoding.
+ */
+ if ((insn & 0xffffff00) == 0xbe00) {
+ /* BKPT */
+ return true;
+ }
+
+ if ((insn & 0xffffffc0) == 0xba80 && arm_dc_feature(s, ARM_FEATURE_V8) &&
+ !arm_dc_feature(s, ARM_FEATURE_M)) {
+ /* HLT: v8A only. This is unconditional even when it is going to
+ * UNDEF; see the v8A ARM ARM DDI0487B.a H3.3.
+ * For v7 cores this was a plain old undefined encoding and so
+ * honours its cc check. (We might be using the encoding as
+ * a semihosting trap, but we don't change the cc check behaviour
+ * on that account, because a debugger connected to a real v7A
+ * core and emulating semihosting traps by catching the UNDEF
+ * exception would also only see cases where the cc check passed.
+ * No guest code should be trying to do a HLT semihosting trap
+ * in an IT block anyway.
+ */
+ return true;
+ }
+
+ if (insn == 0xe97fe97f && arm_dc_feature(s, ARM_FEATURE_V8) &&
+ arm_dc_feature(s, ARM_FEATURE_M)) {
+ /* SG: v8M only */
+ return true;
+ }
+
+ return false;
+}
+
+static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+ CPUARMState *env = cpu->env_ptr;
+ uint32_t pc = dc->base.pc_next;
+ uint32_t insn;
+ bool is_16bit;
+ /* TCG op to rewind to if this turns out to be an invalid ECI state */
+ TCGOp *insn_eci_rewind = NULL;
+ target_ulong insn_eci_pc_save = -1;
+
+ /* Misaligned thumb PC is architecturally impossible. */
+ assert((dc->base.pc_next & 1) == 0);
+
+ if (arm_check_ss_active(dc) || arm_check_kernelpage(dc)) {
+ dc->base.pc_next = pc + 2;
+ return;
+ }
+
+ dc->pc_curr = pc;
+ insn = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b);
+ is_16bit = thumb_insn_is_16bit(dc, dc->base.pc_next, insn);
+ pc += 2;
+ if (!is_16bit) {
+ uint32_t insn2 = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b);
+ insn = insn << 16 | insn2;
+ pc += 2;
+ }
+ dc->base.pc_next = pc;
+ dc->insn = insn;
+
+ if (dc->pstate_il) {
+ /*
+ * Illegal execution state. This has priority over BTI
+ * exceptions, but comes after instruction abort exceptions.
+ */
+ gen_exception_insn(dc, 0, EXCP_UDEF, syn_illegalstate());
+ return;
+ }
+
+ if (dc->eci) {
+ /*
+ * For M-profile continuable instructions, ECI/ICI handling
+ * falls into these cases:
+ * - interrupt-continuable instructions
+ * These are the various load/store multiple insns (both
+ * integer and fp). The ICI bits indicate the register
+ * where the load/store can resume. We make the IMPDEF
+ * choice to always do "instruction restart", ie ignore
+ * the ICI value and always execute the ldm/stm from the
+ * start. So all we need to do is zero PSR.ICI if the
+ * insn executes.
+ * - MVE instructions subject to beat-wise execution
+ * Here the ECI bits indicate which beats have already been
+ * executed, and we must honour this. Each insn of this
+ * type will handle it correctly. We will update PSR.ECI
+ * in the helper function for the insn (some ECI values
+ * mean that the following insn also has been partially
+ * executed).
+ * - Special cases which don't advance ECI
+ * The insns LE, LETP and BKPT leave the ECI/ICI state
+ * bits untouched.
+ * - all other insns (the common case)
+ * Non-zero ECI/ICI means an INVSTATE UsageFault.
+ * We place a rewind-marker here. Insns in the previous
+ * three categories will set a flag in the DisasContext.
+ * If the flag isn't set after we call disas_thumb_insn()
+ * or disas_thumb2_insn() then we know we have a "some other
+ * insn" case. We will rewind to the marker (ie throwing away
+ * all the generated code) and instead emit "take exception".
+ */
+ insn_eci_rewind = tcg_last_op();
+ insn_eci_pc_save = dc->pc_save;
+ }
+
+ if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) {
+ uint32_t cond = dc->condexec_cond;
+
+ /*
+ * Conditionally skip the insn. Note that both 0xe and 0xf mean
+ * "always"; 0xf is not "never".
+ */
+ if (cond < 0x0e) {
+ arm_skip_unless(dc, cond);
+ }
+ }
+
+ if (is_16bit) {
+ disas_thumb_insn(dc, insn);
+ } else {
+ disas_thumb2_insn(dc, insn);
+ }
+
+ /* Advance the Thumb condexec condition. */
+ if (dc->condexec_mask) {
+ dc->condexec_cond = ((dc->condexec_cond & 0xe) |
+ ((dc->condexec_mask >> 4) & 1));
+ dc->condexec_mask = (dc->condexec_mask << 1) & 0x1f;
+ if (dc->condexec_mask == 0) {
+ dc->condexec_cond = 0;
+ }
+ }
+
+ if (dc->eci && !dc->eci_handled) {
+ /*
+ * Insn wasn't valid for ECI/ICI at all: undo what we
+ * just generated and instead emit an exception
+ */
+ tcg_remove_ops_after(insn_eci_rewind);
+ dc->pc_save = insn_eci_pc_save;
+ dc->condjmp = 0;
+ gen_exception_insn(dc, 0, EXCP_INVSTATE, syn_uncategorized());
+ }
+
+ arm_post_translate_insn(dc);
+
+ /* Thumb is a variable-length ISA. Stop translation when the next insn
+ * will touch a new page. This ensures that prefetch aborts occur at
+ * the right place.
+ *
+ * We want to stop the TB if the next insn starts in a new page,
+ * or if it spans between this page and the next. This means that
+ * if we're looking at the last halfword in the page we need to
+ * see if it's a 16-bit Thumb insn (which will fit in this TB)
+ * or a 32-bit Thumb insn (which won't).
+ * This is to avoid generating a silly TB with a single 16-bit insn
+ * in it at the end of this page (which would execute correctly
+ * but isn't very efficient).
+ */
+ if (dc->base.is_jmp == DISAS_NEXT
+ && (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE
+ || (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - 3
+ && insn_crosses_page(env, dc)))) {
+ dc->base.is_jmp = DISAS_TOO_MANY;
+ }
+}
+
+static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+
+ /* At this stage dc->condjmp will only be set when the skipped
+ instruction was a conditional branch or trap, and the PC has
+ already been written. */
+ gen_set_condexec(dc);
+ if (dc->base.is_jmp == DISAS_BX_EXCRET) {
+ /* Exception return branches need some special case code at the
+ * end of the TB, which is complex enough that it has to
+ * handle the single-step vs not and the condition-failed
+ * insn codepath itself.
+ */
+ gen_bx_excret_final_code(dc);
+ } else if (unlikely(dc->ss_active)) {
+ /* Unconditional and "condition passed" instruction codepath. */
+ switch (dc->base.is_jmp) {
+ case DISAS_SWI:
+ gen_ss_advance(dc);
+ gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb));
+ break;
+ case DISAS_HVC:
+ gen_ss_advance(dc);
+ gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2);
+ break;
+ case DISAS_SMC:
+ gen_ss_advance(dc);
+ gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3);
+ break;
+ case DISAS_NEXT:
+ case DISAS_TOO_MANY:
+ case DISAS_UPDATE_EXIT:
+ case DISAS_UPDATE_NOCHAIN:
+ gen_update_pc(dc, curr_insn_len(dc));
+ /* fall through */
+ default:
+ /* FIXME: Single stepping a WFI insn will not halt the CPU. */
+ gen_singlestep_exception(dc);
+ break;
+ case DISAS_NORETURN:
+ break;
+ }
+ } else {
+ /* While branches must always occur at the end of an IT block,
+ there are a few other things that can cause us to terminate
+ the TB in the middle of an IT block:
+ - Exception generating instructions (bkpt, swi, undefined).
+ - Page boundaries.
+ - Hardware watchpoints.
+ Hardware breakpoints have already been handled and skip this code.
+ */
+ switch (dc->base.is_jmp) {
+ case DISAS_NEXT:
+ case DISAS_TOO_MANY:
+ gen_goto_tb(dc, 1, curr_insn_len(dc));
+ break;
+ case DISAS_UPDATE_NOCHAIN:
+ gen_update_pc(dc, curr_insn_len(dc));
+ /* fall through */
+ case DISAS_JUMP:
+ gen_goto_ptr();
+ break;
+ case DISAS_UPDATE_EXIT:
+ gen_update_pc(dc, curr_insn_len(dc));
+ /* fall through */
+ default:
+ /* indicate that the hash table must be used to find the next TB */
+ tcg_gen_exit_tb(NULL, 0);
+ break;
+ case DISAS_NORETURN:
+ /* nothing more to generate */
+ break;
+ case DISAS_WFI:
+ gen_helper_wfi(cpu_env, tcg_constant_i32(curr_insn_len(dc)));
+ /*
+ * The helper doesn't necessarily throw an exception, but we
+ * must go back to the main loop to check for interrupts anyway.
+ */
+ tcg_gen_exit_tb(NULL, 0);
+ break;
+ case DISAS_WFE:
+ gen_helper_wfe(cpu_env);
+ break;
+ case DISAS_YIELD:
+ gen_helper_yield(cpu_env);
+ break;
+ case DISAS_SWI:
+ gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb));
+ break;
+ case DISAS_HVC:
+ gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2);
+ break;
+ case DISAS_SMC:
+ gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3);
+ break;
+ }
+ }
+
+ if (dc->condjmp) {
+ /* "Condition failed" instruction codepath for the branch/trap insn */
+ set_disas_label(dc, dc->condlabel);
+ gen_set_condexec(dc);
+ if (unlikely(dc->ss_active)) {
+ gen_update_pc(dc, curr_insn_len(dc));
+ gen_singlestep_exception(dc);
+ } else {
+ gen_goto_tb(dc, 1, curr_insn_len(dc));
+ }
+ }
+}
+
+static void arm_tr_disas_log(const DisasContextBase *dcbase,
+ CPUState *cpu, FILE *logfile)
+{
+ DisasContext *dc = container_of(dcbase, DisasContext, base);
+
+ fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first));
+ target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size);
+}
+
+static const TranslatorOps arm_translator_ops = {
+ .init_disas_context = arm_tr_init_disas_context,
+ .tb_start = arm_tr_tb_start,
+ .insn_start = arm_tr_insn_start,
+ .translate_insn = arm_tr_translate_insn,
+ .tb_stop = arm_tr_tb_stop,
+ .disas_log = arm_tr_disas_log,
+};
+
+static const TranslatorOps thumb_translator_ops = {
+ .init_disas_context = arm_tr_init_disas_context,
+ .tb_start = arm_tr_tb_start,
+ .insn_start = arm_tr_insn_start,
+ .translate_insn = thumb_tr_translate_insn,
+ .tb_stop = arm_tr_tb_stop,
+ .disas_log = arm_tr_disas_log,
+};
+
+/* generate intermediate code for basic block 'tb'. */
+void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns,
+ target_ulong pc, void *host_pc)
+{
+ DisasContext dc = { };
+ const TranslatorOps *ops = &arm_translator_ops;
+ CPUARMTBFlags tb_flags = arm_tbflags_from_tb(tb);
+
+ if (EX_TBFLAG_AM32(tb_flags, THUMB)) {
+ ops = &thumb_translator_ops;
+ }
+#ifdef TARGET_AARCH64
+ if (EX_TBFLAG_ANY(tb_flags, AARCH64_STATE)) {
+ ops = &aarch64_translator_ops;
+ }
+#endif
+
+ translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base);
+}
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
new file mode 100644
index 0000000..3717824
--- /dev/null
+++ b/target/arm/tcg/translate.h
@@ -0,0 +1,644 @@
+#ifndef TARGET_ARM_TRANSLATE_H
+#define TARGET_ARM_TRANSLATE_H
+
+#include "exec/translator.h"
+#include "internals.h"
+
+
+/* internal defines */
+
+/*
+ * Save pc_save across a branch, so that we may restore the value from
+ * before the branch at the point the label is emitted.
+ */
+typedef struct DisasLabel {
+ TCGLabel *label;
+ target_ulong pc_save;
+} DisasLabel;
+
+typedef struct DisasContext {
+ DisasContextBase base;
+ const ARMISARegisters *isar;
+
+ /* The address of the current instruction being translated. */
+ target_ulong pc_curr;
+ /*
+ * For TARGET_TB_PCREL, the full value of cpu_pc is not known
+ * (although the page offset is known). For convenience, the
+ * translation loop uses the full virtual address that triggered
+ * the translation, from base.pc_first through pc_curr.
+ * For efficiency, we do not update cpu_pc for every instruction.
+ * Instead, pc_save has the value of pc_curr at the time of the
+ * last update to cpu_pc, which allows us to compute the addend
+ * needed to bring cpu_pc current: pc_curr - pc_save.
+ * If cpu_pc now contains the destination of an indirect branch,
+ * pc_save contains -1 to indicate that relative updates are no
+ * longer possible.
+ */
+ target_ulong pc_save;
+ target_ulong page_start;
+ uint32_t insn;
+ /* Nonzero if this instruction has been conditionally skipped. */
+ int condjmp;
+ /* The label that will be jumped to when the instruction is skipped. */
+ DisasLabel condlabel;
+ /* Thumb-2 conditional execution bits. */
+ int condexec_mask;
+ int condexec_cond;
+ /* M-profile ECI/ICI exception-continuable instruction state */
+ int eci;
+ /*
+ * trans_ functions for insns which are continuable should set this true
+ * after decode (ie after any UNDEF checks)
+ */
+ bool eci_handled;
+ int sctlr_b;
+ MemOp be_data;
+#if !defined(CONFIG_USER_ONLY)
+ int user;
+#endif
+ ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */
+ uint8_t tbii; /* TBI1|TBI0 for insns */
+ uint8_t tbid; /* TBI1|TBI0 for data */
+ uint8_t tcma; /* TCMA1|TCMA0 for MTE */
+ bool ns; /* Use non-secure CPREG bank on access */
+ int fp_excp_el; /* FP exception EL or 0 if enabled */
+ int sve_excp_el; /* SVE exception EL or 0 if enabled */
+ int sme_excp_el; /* SME exception EL or 0 if enabled */
+ int vl; /* current vector length in bytes */
+ int svl; /* current streaming vector length in bytes */
+ bool vfp_enabled; /* FP enabled via FPSCR.EN */
+ int vec_len;
+ int vec_stride;
+ bool v7m_handler_mode;
+ bool v8m_secure; /* true if v8M and we're in Secure mode */
+ bool v8m_stackcheck; /* true if we need to perform v8M stack limit checks */
+ bool v8m_fpccr_s_wrong; /* true if v8M FPCCR.S != v8m_secure */
+ bool v7m_new_fp_ctxt_needed; /* ASPEN set but no active FP context */
+ bool v7m_lspact; /* FPCCR.LSPACT set */
+ /* Immediate value in AArch32 SVC insn; must be set if is_jmp == DISAS_SWI
+ * so that top level loop can generate correct syndrome information.
+ */
+ uint32_t svc_imm;
+ int current_el;
+ GHashTable *cp_regs;
+ uint64_t features; /* CPU features bits */
+ bool aarch64;
+ bool thumb;
+ /* Because unallocated encodings generate different exception syndrome
+ * information from traps due to FP being disabled, we can't do a single
+ * "is fp access disabled" check at a high level in the decode tree.
+ * To help in catching bugs where the access check was forgotten in some
+ * code path, we set this flag when the access check is done, and assert
+ * that it is set at the point where we actually touch the FP regs.
+ */
+ bool fp_access_checked;
+ bool sve_access_checked;
+ /* ARMv8 single-step state (this is distinct from the QEMU gdbstub
+ * single-step support).
+ */
+ bool ss_active;
+ bool pstate_ss;
+ /* True if the insn just emitted was a load-exclusive instruction
+ * (necessary for syndrome information for single step exceptions),
+ * ie A64 LDX*, LDAX*, A32/T32 LDREX*, LDAEX*.
+ */
+ bool is_ldex;
+ /* True if AccType_UNPRIV should be used for LDTR et al */
+ bool unpriv;
+ /* True if v8.3-PAuth is active. */
+ bool pauth_active;
+ /* True if v8.5-MTE access to tags is enabled. */
+ bool ata;
+ /* True if v8.5-MTE tag checks affect the PE; index with is_unpriv. */
+ bool mte_active[2];
+ /* True with v8.5-BTI and SCTLR_ELx.BT* set. */
+ bool bt;
+ /* True if any CP15 access is trapped by HSTR_EL2 */
+ bool hstr_active;
+ /* True if memory operations require alignment */
+ bool align_mem;
+ /* True if PSTATE.IL is set */
+ bool pstate_il;
+ /* True if PSTATE.SM is set. */
+ bool pstate_sm;
+ /* True if PSTATE.ZA is set. */
+ bool pstate_za;
+ /* True if non-streaming insns should raise an SME Streaming exception. */
+ bool sme_trap_nonstreaming;
+ /* True if the current instruction is non-streaming. */
+ bool is_nonstreaming;
+ /* True if MVE insns are definitely not predicated by VPR or LTPSIZE */
+ bool mve_no_pred;
+ /* True if fine-grained traps are active */
+ bool fgt_active;
+ /* True if fine-grained trap on ERET is enabled */
+ bool fgt_eret;
+ /* True if fine-grained trap on SVC is enabled */
+ bool fgt_svc;
+ /*
+ * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI.
+ * < 0, set by the current instruction.
+ */
+ int8_t btype;
+ /* A copy of cpu->dcz_blocksize. */
+ uint8_t dcz_blocksize;
+ /* True if this page is guarded. */
+ bool guarded_page;
+ /* Bottom two bits of XScale c15_cpar coprocessor access control reg */
+ int c15_cpar;
+ /* TCG op of the current insn_start. */
+ TCGOp *insn_start;
+#define TMP_A64_MAX 16
+ int tmp_a64_count;
+ TCGv_i64 tmp_a64[TMP_A64_MAX];
+} DisasContext;
+
+typedef struct DisasCompare {
+ TCGCond cond;
+ TCGv_i32 value;
+ bool value_global;
+} DisasCompare;
+
+/* Share the TCG temporaries common between 32 and 64 bit modes. */
+extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
+extern TCGv_i64 cpu_exclusive_addr;
+extern TCGv_i64 cpu_exclusive_val;
+
+/*
+ * Constant expanders for the decoders.
+ */
+
+static inline int negate(DisasContext *s, int x)
+{
+ return -x;
+}
+
+static inline int plus_1(DisasContext *s, int x)
+{
+ return x + 1;
+}
+
+static inline int plus_2(DisasContext *s, int x)
+{
+ return x + 2;
+}
+
+static inline int plus_12(DisasContext *s, int x)
+{
+ return x + 12;
+}
+
+static inline int times_2(DisasContext *s, int x)
+{
+ return x * 2;
+}
+
+static inline int times_4(DisasContext *s, int x)
+{
+ return x * 4;
+}
+
+static inline int times_2_plus_1(DisasContext *s, int x)
+{
+ return x * 2 + 1;
+}
+
+static inline int rsub_64(DisasContext *s, int x)
+{
+ return 64 - x;
+}
+
+static inline int rsub_32(DisasContext *s, int x)
+{
+ return 32 - x;
+}
+
+static inline int rsub_16(DisasContext *s, int x)
+{
+ return 16 - x;
+}
+
+static inline int rsub_8(DisasContext *s, int x)
+{
+ return 8 - x;
+}
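+
+/*
+ * The rsub_* expanders above exist because some shift-immediate encodings
+ * store the amount as (element size - shift); a decode field declared
+ * with, e.g., !function=rsub_64 therefore hands the trans_ function the
+ * architectural shift count rather than the raw field value.
+ */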
+
+static inline int neon_3same_fp_size(DisasContext *s, int x)
+{
+ /* Convert 0==fp32, 1==fp16 into a MO_* value */
+ return MO_32 - x;
+}
+
+static inline int arm_dc_feature(DisasContext *dc, int feature)
+{
+ return (dc->features & (1ULL << feature)) != 0;
+}
+
+static inline int get_mem_index(DisasContext *s)
+{
+ return arm_to_core_mmu_idx(s->mmu_idx);
+}
+
+static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn)
+{
+ /* We don't need to save all of the syndrome so we mask and shift
+ * out unneeded bits to help the sleb128 encoder do a better job.
+ */
+ syn &= ARM_INSN_START_WORD2_MASK;
+ syn >>= ARM_INSN_START_WORD2_SHIFT;
+
+ /* We check and clear insn_start to catch multiple updates. */
+ assert(s->insn_start != NULL);
+ tcg_set_insn_start_param(s->insn_start, 2, syn);
+ s->insn_start = NULL;
+}
+
+static inline int curr_insn_len(DisasContext *s)
+{
+ return s->base.pc_next - s->pc_curr;
+}
+
+/* is_jmp field values */
+#define DISAS_JUMP DISAS_TARGET_0 /* only pc was modified dynamically */
+/* CPU state was modified dynamically; exit to main loop for interrupts. */
+#define DISAS_UPDATE_EXIT DISAS_TARGET_1
+/* These instructions trap after executing, so the A32/T32 decoder must
+ * defer them until after the conditional execution state has been updated.
+ * WFI also needs special handling when single-stepping.
+ */
+#define DISAS_WFI DISAS_TARGET_2
+#define DISAS_SWI DISAS_TARGET_3
+/* WFE */
+#define DISAS_WFE DISAS_TARGET_4
+#define DISAS_HVC DISAS_TARGET_5
+#define DISAS_SMC DISAS_TARGET_6
+#define DISAS_YIELD DISAS_TARGET_7
+/* M profile branch which might be an exception return (and so needs
+ * custom end-of-TB code)
+ */
+#define DISAS_BX_EXCRET DISAS_TARGET_8
+/*
+ * For instructions which want an immediate exit to the main loop, as opposed
+ * to attempting to use lookup_and_goto_ptr. Unlike DISAS_UPDATE_EXIT, this
+ * doesn't write the PC on exiting the translation loop so you need to ensure
+ * something (gen_a64_update_pc or runtime helper) has done so before we reach
+ * return from cpu_tb_exec.
+ */
+#define DISAS_EXIT DISAS_TARGET_9
+/* CPU state was modified dynamically; no need to exit, but do not chain. */
+#define DISAS_UPDATE_NOCHAIN DISAS_TARGET_10
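+
+/*
+ * Sketch of the contract for the values above: a trans_ function that
+ * must trap only after executing (e.g. WFI) sets
+ * s->base.is_jmp = DISAS_WFI and returns, and the tb_stop hook emits the
+ * gen_helper_wfi() call once the condexec state has been written back.
+ */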
+
+#ifdef TARGET_AARCH64
+void a64_translate_init(void);
+void gen_a64_update_pc(DisasContext *s, target_long diff);
+extern const TranslatorOps aarch64_translator_ops;
+#else
+static inline void a64_translate_init(void)
+{
+}
+
+static inline void gen_a64_update_pc(DisasContext *s, target_long diff)
+{
+}
+#endif
+
+void arm_test_cc(DisasCompare *cmp, int cc);
+void arm_free_cc(DisasCompare *cmp);
+void arm_jump_cc(DisasCompare *cmp, TCGLabel *label);
+void arm_gen_test_cc(int cc, TCGLabel *label);
+MemOp pow2_align(unsigned i);
+void unallocated_encoding(DisasContext *s);
+void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp,
+ uint32_t syn, uint32_t target_el);
+void gen_exception_insn(DisasContext *s, target_long pc_diff,
+ int excp, uint32_t syn);
+
+/* Return state of Alternate Half-precision flag, caller frees result */
+static inline TCGv_i32 get_ahp_flag(void)
+{
+ TCGv_i32 ret = tcg_temp_new_i32();
+
+ tcg_gen_ld_i32(ret, cpu_env,
+ offsetof(CPUARMState, vfp.xregs[ARM_VFP_FPSCR]));
+ tcg_gen_extract_i32(ret, ret, 26, 1);
+
+ return ret;
+}
+
+/* Set bits within PSTATE. */
+static inline void set_pstate_bits(uint32_t bits)
+{
+ TCGv_i32 p = tcg_temp_new_i32();
+
+ tcg_debug_assert(!(bits & CACHED_PSTATE_BITS));
+
+ tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate));
+ tcg_gen_ori_i32(p, p, bits);
+ tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate));
+ tcg_temp_free_i32(p);
+}
+
+/* Clear bits within PSTATE. */
+static inline void clear_pstate_bits(uint32_t bits)
+{
+ TCGv_i32 p = tcg_temp_new_i32();
+
+ tcg_debug_assert(!(bits & CACHED_PSTATE_BITS));
+
+ tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate));
+ tcg_gen_andi_i32(p, p, ~bits);
+ tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate));
+ tcg_temp_free_i32(p);
+}
+
+/* If the singlestep state is Active-not-pending, advance to Active-pending. */
+static inline void gen_ss_advance(DisasContext *s)
+{
+ if (s->ss_active) {
+ s->pstate_ss = 0;
+ clear_pstate_bits(PSTATE_SS);
+ }
+}
+
+/* Generate an architectural singlestep exception */
+static inline void gen_swstep_exception(DisasContext *s, int isv, int ex)
+{
+ /* Fill in the same_el field of the syndrome in the helper. */
+ uint32_t syn = syn_swstep(false, isv, ex);
+ gen_helper_exception_swstep(cpu_env, tcg_constant_i32(syn));
+}
+
+/*
+ * Given a VFP floating point constant encoded into an 8 bit immediate in an
+ * instruction, expand it to the actual constant value of the specified
+ * size, as per the VFPExpandImm() pseudocode in the Arm ARM.
+ */
+uint64_t vfp_expand_imm(int size, uint8_t imm8);
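+
+/*
+ * As a rough worked example (derived from the VFPExpandImm pseudocode,
+ * not from code in this file): imm8 == 0x70 has sign 0, imm8<6> == 1,
+ * imm8<5:4> == 0b11 and a zero fraction, so it expands to 1.0 in the
+ * requested size, i.e. 0x3f800000 for single precision and
+ * 0x3ff0000000000000 for double precision.
+ */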
+
+/* Vector operations shared between ARM and AArch64. */
+void gen_gvec_ceq0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_clt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_cgt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_cle0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_cge0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void gen_ushl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
+/*
+ * Forward to the isar_feature_* tests given a DisasContext pointer.
+ */
+#define dc_isar_feature(name, ctx) \
+ ({ DisasContext *ctx_ = (ctx); isar_feature_##name(ctx_->isar); })
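+
+/*
+ * E.g. (sketch): dc_isar_feature(aa32_vsel, s) is the sort of predicate a
+ * trans_ function uses to gate an insn on the ID-register feature tests.
+ */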
+
+/* Note that the gvec expanders operate on offsets + sizes. */
+typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t,
+ uint32_t, uint32_t);
+typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
+ uint32_t, uint32_t, uint32_t);
+typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t,
+ uint32_t, uint32_t, uint32_t);
+
+/* Function prototype for gen_ functions for calling Neon helpers */
+typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32);
+typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32);
+typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
+typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
+ TCGv_i32, TCGv_i32);
+typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
+typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
+typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
+typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
+typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
+typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32);
+typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr);
+typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
+typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
+typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64);
+typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
+typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp);
+typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift);
+typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32);
+typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift);
+typedef void ShiftFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
+
+/**
+ * arm_tbflags_from_tb:
+ * @tb: the TranslationBlock
+ *
+ * Extract the flag values from @tb.
+ */
+static inline CPUARMTBFlags arm_tbflags_from_tb(const TranslationBlock *tb)
+{
+ return (CPUARMTBFlags){ tb->flags, tb->cs_base };
+}
+
+/*
+ * Enum for argument to fpstatus_ptr().
+ */
+typedef enum ARMFPStatusFlavour {
+ FPST_FPCR,
+ FPST_FPCR_F16,
+ FPST_STD,
+ FPST_STD_F16,
+} ARMFPStatusFlavour;
+
+/**
+ * fpstatus_ptr: return TCGv_ptr to the specified fp_status field
+ *
+ * We have multiple softfloat float_status fields in the Arm CPU state struct
+ * (see the comment in cpu.h for details). Return a TCGv_ptr which has
+ * been set up to point to the requested field in the CPU state struct.
+ * The options are:
+ *
+ * FPST_FPCR
+ * for non-FP16 operations controlled by the FPCR
+ * FPST_FPCR_F16
+ * for operations controlled by the FPCR where FPCR.FZ16 is to be used
+ * FPST_STD
+ * for A32/T32 Neon operations using the "standard FPSCR value"
+ * FPST_STD_F16
+ * as FPST_STD, but where FPCR.FZ16 is to be used
+ */
+static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour)
+{
+ TCGv_ptr statusptr = tcg_temp_new_ptr();
+ int offset;
+
+ switch (flavour) {
+ case FPST_FPCR:
+ offset = offsetof(CPUARMState, vfp.fp_status);
+ break;
+ case FPST_FPCR_F16:
+ offset = offsetof(CPUARMState, vfp.fp_status_f16);
+ break;
+ case FPST_STD:
+ offset = offsetof(CPUARMState, vfp.standard_fp_status);
+ break;
+ case FPST_STD_F16:
+ offset = offsetof(CPUARMState, vfp.standard_fp_status_f16);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_gen_addi_ptr(statusptr, cpu_env, offset);
+ return statusptr;
+}
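+
+/*
+ * Minimal usage sketch (helper name only for illustration): fetch the
+ * pointer for the required flavour and pass it as the float_status
+ * argument of a softfloat helper:
+ *
+ *     TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ *     gen_helper_vfp_adds(vd, vn, vm, fpst);
+ *     tcg_temp_free_ptr(fpst);
+ */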
+
+/**
+ * finalize_memop:
+ * @s: DisasContext
+ * @opc: size+sign+align of the memory operation
+ *
+ * Build the complete MemOp for a memory operation, including alignment
+ * and endianness.
+ *
+ * If (opc & MO_AMASK) then the operation already contains the required
+ * alignment, e.g. for AccType_ATOMIC. Otherwise, this is an optionally
+ * unaligned operation, e.g. for AccType_NORMAL.
+ *
+ * In the latter case, there are configuration bits that require alignment,
+ * and this is applied here. Note that there is no way to indicate that
+ * no alignment should ever be enforced; this must be handled manually.
+ */
+static inline MemOp finalize_memop(DisasContext *s, MemOp opc)
+{
+ if (s->align_mem && !(opc & MO_AMASK)) {
+ opc |= MO_ALIGN;
+ }
+ return opc | s->be_data;
+}
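+
+/*
+ * Illustrative only: a normal 32-bit load that is unaligned unless the
+ * configuration demands alignment could be issued as
+ *
+ *     MemOp mop = finalize_memop(s, MO_UL);
+ *     tcg_gen_qemu_ld_i32(dest, addr, get_mem_index(s), mop);
+ *
+ * while an access that architecturally requires alignment passes it in
+ * explicitly, e.g. finalize_memop(s, MO_UQ | MO_ALIGN).
+ */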
+
+/**
+ * asimd_imm_const: Expand an encoded SIMD constant value
+ *
+ * Expand a SIMD constant value. This is essentially the pseudocode
+ * AdvSIMDExpandImm, except that we also perform the boolean NOT needed for
+ * VMVN and VBIC (when cmode < 14 && op == 1).
+ *
+ * The combination cmode == 15 op == 1 is a reserved encoding for AArch32;
+ * callers must catch this; we return the 64-bit constant value defined
+ * for AArch64.
+ *
+ * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but
+ * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A;
+ * we produce an immediate constant value of 0 in these cases.
+ */
+uint64_t asimd_imm_const(uint32_t imm, int cmode, int op);
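+
+/*
+ * For example (following the AdvSIMDExpandImm pseudocode rather than code
+ * in this file): cmode == 14 with op == 0 replicates imm8 into every
+ * byte, so asimd_imm_const(0xab, 14, 0) == 0xabababababababab, while
+ * op == 1 with cmode < 14 returns the inverted expansion for VMVN/VBIC.
+ */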
+
+/*
+ * gen_disas_label:
+ * Create a label and cache a copy of pc_save.
+ */
+static inline DisasLabel gen_disas_label(DisasContext *s)
+{
+ return (DisasLabel){
+ .label = gen_new_label(),
+ .pc_save = s->pc_save,
+ };
+}
+
+/*
+ * set_disas_label:
+ * Emit a label and restore the cached copy of pc_save.
+ */
+static inline void set_disas_label(DisasContext *s, DisasLabel l)
+{
+ gen_set_label(l.label);
+ s->pc_save = l.pc_save;
+}
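+
+/*
+ * Typical pairing in the translators (sketch): open a label to skip a
+ * conditional insn, then close it so pc_save is restored for the
+ * fall-through path:
+ *
+ *     DisasLabel skip = gen_disas_label(s);
+ *     arm_gen_test_cc(cond ^ 1, skip.label);
+ *     ... emit the conditional insn ...
+ *     set_disas_label(s, skip);
+ */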
+
+static inline TCGv_ptr gen_lookup_cp_reg(uint32_t key)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ gen_helper_lookup_cp_reg(ret, cpu_env, tcg_constant_i32(key));
+ return ret;
+}
+
+/*
+ * Helpers for implementing sets of trans_* functions.
+ * Defer the implementation of NAME to FUNC, with optional extra arguments.
+ */
+#define TRANS(NAME, FUNC, ...) \
+ static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \
+ { return FUNC(s, __VA_ARGS__); }
+#define TRANS_FEAT(NAME, FEAT, FUNC, ...) \
+ static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \
+ { return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); }
+
+#define TRANS_FEAT_NONSTREAMING(NAME, FEAT, FUNC, ...) \
+ static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \
+ { \
+ s->is_nonstreaming = true; \
+ return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); \
+ }
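+
+/*
+ * For instance (hypothetical insn and helper names), the line
+ *     TRANS_FEAT(VFOO, aa32_foo, do_vfoo, a)
+ * expands to
+ *     static bool trans_VFOO(DisasContext *s, arg_VFOO *a)
+ *     { return dc_isar_feature(aa32_foo, s) && do_vfoo(s, a); }
+ * so the feature gate is folded into the decodetree-generated dispatch.
+ */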
+
+#endif /* TARGET_ARM_TRANSLATE_H */
diff --git a/target/arm/tcg/vfp-uncond.decode b/target/arm/tcg/vfp-uncond.decode
new file mode 100644
index 0000000..5c50447
--- /dev/null
+++ b/target/arm/tcg/vfp-uncond.decode
@@ -0,0 +1,82 @@
+# AArch32 VFP instruction descriptions (unconditional insns)
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# Encodings for the unconditional VFP instructions are here:
+# generally anything matching A32
+# 1111 1110 .... .... .... 101. ...0 ....
+# and T32
+# 1111 110. .... .... .... 101. .... ....
+# 1111 1110 .... .... .... 101. .... ....
+# (but those patterns might also cover some Neon instructions,
+# which do not live in this file.)
+
+# VFP registers have an odd encoding with a four-bit field
+# and a one-bit field which are assembled in different orders
+# depending on whether the register is double or single precision.
+# Each individual instruction function must do the checks for
+# "double register selected but CPU does not have double support"
+# and "double register number has bit 4 set but CPU does not
+# support D16-D31" (which should UNDEF).
+%vm_dp 5:1 0:4
+%vm_sp 0:4 5:1
+%vn_dp 7:1 16:4
+%vn_sp 16:4 7:1
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
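+
+# As an illustration of the two orderings (decodetree concatenates the
+# sub-fields with the first listed as most significant): %vm_dp yields
+# M:Vm<3:0>, numbering D0-D31, while %vm_sp yields Vm<3:0>:M, numbering
+# S0-S31.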
+
+@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp
+@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1
+VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2
+VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3
+
+VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
+VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s
+
+VMAXNM_sp 1111 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s
+VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s
+
+VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d
+VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
+VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=2
+VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \
+ vm=%vm_dp vd=%vd_dp sz=3
+
+# VCVT float to int with specified rounding mode; Vd is always single-precision
+VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
+VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=2
+VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \
+ vm=%vm_dp vd=%vd_sp sz=3
+
+VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... \
+ vd=%vd_sp vm=%vm_sp
+
+VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... \
+ vd=%vd_sp vm=%vm_sp
diff --git a/target/arm/tcg/vfp.decode b/target/arm/tcg/vfp.decode
new file mode 100644
index 0000000..5405e80
--- /dev/null
+++ b/target/arm/tcg/vfp.decode
@@ -0,0 +1,247 @@
+# AArch32 VFP instruction descriptions (conditional insns)
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# Encodings for the conditional VFP instructions are here:
+# generally anything matching A32
+# cccc 11.. .... .... .... 101. .... ....
+# and T32
+# 1110 110. .... .... .... 101. .... ....
+# 1110 1110 .... .... .... 101. .... ....
+# (but those patterns might also cover some Neon instructions,
+# which do not live in this file.)
+
+# VFP registers have an odd encoding with a four-bit field
+# and a one-bit field which are assembled in different orders
+# depending on whether the register is double or single precision.
+# Each individual instruction function must do the checks for
+# "double register selected but CPU does not have double support"
+# and "double register number has bit 4 set but CPU does not
+# support D16-D31" (which should UNDEF).
+%vm_dp 5:1 0:4
+%vm_sp 0:4 5:1
+%vn_dp 7:1 16:4
+%vn_sp 16:4 7:1
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
+
+%vmov_idx_b 21:1 5:2
+%vmov_idx_h 21:1 6:1
+
+%vmov_imm 16:4 0:4
+
+@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp
+@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+@vfp_dm_ss ................................ vm=%vm_sp vd=%vd_sp
+@vfp_dm_dd ................................ vm=%vm_dp vd=%vd_dp
+@vfp_dm_ds ................................ vm=%vm_sp vd=%vd_dp
+@vfp_dm_sd ................................ vm=%vm_dp vd=%vd_sp
+
+# VMOV scalar to general-purpose register; note that this does
+# include some Neon cases.
+VMOV_to_gp ---- 1110 u:1 1. 1 .... rt:4 1011 ... 1 0000 \
+ vn=%vn_dp size=0 index=%vmov_idx_b
+VMOV_to_gp ---- 1110 u:1 0. 1 .... rt:4 1011 ..1 1 0000 \
+ vn=%vn_dp size=1 index=%vmov_idx_h
+VMOV_to_gp ---- 1110 0 0 index:1 1 .... rt:4 1011 .00 1 0000 \
+ vn=%vn_dp size=2 u=0
+
+VMOV_from_gp ---- 1110 0 1. 0 .... rt:4 1011 ... 1 0000 \
+ vn=%vn_dp size=0 index=%vmov_idx_b
+VMOV_from_gp ---- 1110 0 0. 0 .... rt:4 1011 ..1 1 0000 \
+ vn=%vn_dp size=1 index=%vmov_idx_h
+VMOV_from_gp ---- 1110 0 0 index:1 0 .... rt:4 1011 .00 1 0000 \
+ vn=%vn_dp size=2
+
+VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \
+ vn=%vn_dp
+
+VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
+VMOV_half ---- 1110 000 l:1 .... rt:4 1001 . 001 0000 vn=%vn_sp
+VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp
+
+VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp
+VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp
+
+VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp
+VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp
+VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp
+
+# We split the load/store multiple up into two patterns to avoid
+# overlap with other insns in the "Advanced SIMD load/store and 64-bit move"
+# grouping:
+# P=0 U=0 W=0 is 64-bit VMOV
+# P=1 W=0 is VLDR/VSTR
+# P=U W=1 is UNDEF
+# leaving P=0 U=1 W=x and P=1 U=0 W=1 for load/store multiple.
+# These include FSTM/FLDM.
+VLDM_VSTM_sp ---- 1100 1 . w:1 l:1 rn:4 .... 1010 imm:8 \
+ vd=%vd_sp p=0 u=1
+VLDM_VSTM_dp ---- 1100 1 . w:1 l:1 rn:4 .... 1011 imm:8 \
+ vd=%vd_dp p=0 u=1
+
+VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \
+ vd=%vd_sp p=1 u=0 w=1
+VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \
+ vd=%vd_dp p=1 u=0 w=1
+
+# 3-register VFP data-processing; bits [23,21:20,6] identify the operation.
+VMLA_hp ---- 1110 0.00 .... .... 1001 .0.0 .... @vfp_dnm_s
+VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... @vfp_dnm_s
+VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... @vfp_dnm_d
+
+VMLS_hp ---- 1110 0.00 .... .... 1001 .1.0 .... @vfp_dnm_s
+VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... @vfp_dnm_s
+VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VNMLS_hp ---- 1110 0.01 .... .... 1001 .0.0 .... @vfp_dnm_s
+VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... @vfp_dnm_s
+VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... @vfp_dnm_d
+
+VNMLA_hp ---- 1110 0.01 .... .... 1001 .1.0 .... @vfp_dnm_s
+VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... @vfp_dnm_s
+VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VMUL_hp ---- 1110 0.10 .... .... 1001 .0.0 .... @vfp_dnm_s
+VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... @vfp_dnm_s
+VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... @vfp_dnm_d
+
+VNMUL_hp ---- 1110 0.10 .... .... 1001 .1.0 .... @vfp_dnm_s
+VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... @vfp_dnm_s
+VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VADD_hp ---- 1110 0.11 .... .... 1001 .0.0 .... @vfp_dnm_s
+VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... @vfp_dnm_s
+VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... @vfp_dnm_d
+
+VSUB_hp ---- 1110 0.11 .... .... 1001 .1.0 .... @vfp_dnm_s
+VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... @vfp_dnm_s
+VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
+VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s
+VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d
+
+VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s
+VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s
+
+VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s
+VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s
+VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s
+VFNMS_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s
+
+VFMA_dp ---- 1110 1.10 .... .... 1011 .0.0 .... @vfp_dnm_d
+VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d
+VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d
+VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d
+
+VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \
+ vd=%vd_sp imm=%vmov_imm
+VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \
+ vd=%vd_sp imm=%vmov_imm
+VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \
+ vd=%vd_dp imm=%vmov_imm
+
+VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... @vfp_dm_ss
+VMOV_reg_dp ---- 1110 1.11 0000 .... 1011 01.0 .... @vfp_dm_dd
+
+VABS_hp ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss
+VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss
+VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd
+
+VNEG_hp ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss
+VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss
+VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd
+
+VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss
+VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss
+VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd
+
+VCMP_hp ---- 1110 1.11 010 z:1 .... 1001 e:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \
+ vd=%vd_dp vm=%vm_dp
+
+# VCVTT and VCVTB from f16: Vd format depends on size bit; Vm is always vm_sp
+VCVT_f32_f16 ---- 1110 1.11 0010 .... 1010 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \
+ vd=%vd_dp vm=%vm_sp
+
+# VCVTB and VCVTT to f16: Vd format is always vd_sp;
+# Vm format depends on size bit
+VCVT_b16_f32 ---- 1110 1.11 0011 .... 1001 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_dp
+
+VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss
+VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss
+VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd
+
+VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss
+VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss
+VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd
+
+VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss
+VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss
+VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd
+
+# VCVT between single and double:
+# Vm precision depends on size; Vd is its reverse
+VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds
+VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd
+
+# VCVT from integer to floating point: Vm always single; Vd depends on size
+VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \
+ vd=%vd_dp vm=%vm_sp
+
+# VJCVT is always dp to sp
+VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd
+
+# VCVT between floating-point and fixed-point. The immediate value
+# is in the same format as a Vm single-precision register number.
+# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field
+# for the convenience of the trans_VCVT_fix functions.
+%vcvt_fix_op 18:1 16:1 7:1
+VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
+VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
+VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
+ vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op
+
+# VCVT float to integer (VCVT and VCVTR): Vd always single; Vm depends on size
+VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_dp