author     Fabiano Rosas <farosas@suse.de>           2023-02-17 17:11:29 -0300
committer  Peter Maydell <peter.maydell@linaro.org>  2023-02-27 13:27:04 +0000
commit     f0984d4040c328d1c021ae6680479cbbe13c485b (patch)
tree       96d3b38a11fe7dc6ba19c24e47f06f48eb0cb8e5 /target/arm/tcg
parent     2059ec754f9040a6a9f62a9abfeb76a9d8655e11 (diff)
target/arm: move translate modules to tcg/
Introduce the target/arm/tcg directory. Its purpose is to hold the TCG
code that is selected by CONFIG_TCG.
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
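
The new target/arm/tcg/meson.build (later in this diff) registers the moved sources with the arm_ss source set; for the build to pick them up, the parent target/arm/meson.build must also descend into the new subdirectory. That hunk falls outside the 'target/arm/tcg' limit of this view, so the following is only a minimal sketch of the usual QEMU Meson pattern, not the literal change:

    # Assumed shape of the hunk in target/arm/meson.build (not shown in this view):
    subdir('tcg')
    # Once the TCG split is complete, TCG-only sources can be gated on
    # CONFIG_TCG using the same source-set API that the new meson.build
    # below already uses for TARGET_AARCH64:
    arm_ss.add(when: 'CONFIG_TCG', if_true: files('example.c'))  # hypothetical file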
Diffstat (limited to 'target/arm/tcg')
25 files changed, 50203 insertions, 0 deletions
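
Every .decode file added below is input for scripts/decodetree.py (invoked by the meson.build rules later in this diff). The script generates a C decoder in which each named pattern becomes a call to a trans_<NAME>() function; that function receives the extracted bitfields in an argument struct and returns true if it accepted the instruction. A rough sketch of that shape, illustrative only and not code from this patch (the real handlers live in the moved translate*.c files):

    /* Sketch of a handler for the SETEND pattern in a32-uncond.decode. */
    static bool trans_SETEND(DisasContext *s, arg_SETEND *a)
    {
        /* a->E is the single "E" bit the pattern extracts from the insn. */
        if (!ENABLE_ARCH_6) {
            return false;  /* not recognized; the decoder falls through to UNDEF */
        }
        /*
         * ... switch the CPU data endianness when a->E differs from the
         * current setting ...
         */
        return true;
    }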
diff --git a/target/arm/tcg/a32-uncond.decode b/target/arm/tcg/a32-uncond.decode
new file mode 100644
index 0000000..2339de2
--- /dev/null
+++ b/target/arm/tcg/a32-uncond.decode
@@ -0,0 +1,74 @@
+# A32 unconditional instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# All insns that have 0xf in insn[31:28] are decoded here.
+# All of those that have a COND field in insn[31:28] are in a32.decode
+#
+
+&empty !extern
+&i !extern imm
+&setend E
+
+# Branch with Link and Exchange
+
+%imm24h 0:s24 24:1 !function=times_2
+
+BLX_i 1111 101 . ........................ &i imm=%imm24h
+
+# System Instructions
+
+&rfe rn w pu
+&srs mode w pu
+&cps mode imod M A I F
+
+RFE 1111 100 pu:2 0 w:1 1 rn:4 0000 1010 0000 0000 &rfe
+SRS 1111 100 pu:2 1 w:1 0 1101 0000 0101 000 mode:5 &srs
+CPS 1111 0001 0000 imod:2 M:1 0 0000 000 A:1 I:1 F:1 0 mode:5 \
+    &cps
+
+# Clear-Exclusive, Barriers
+
+# QEMU does not require the option field for the barriers.
+CLREX 1111 0101 0111 1111 1111 0000 0001 1111
+DSB 1111 0101 0111 1111 1111 0000 0100 ----
+DMB 1111 0101 0111 1111 1111 0000 0101 ----
+ISB 1111 0101 0111 1111 1111 0000 0110 ----
+SB 1111 0101 0111 1111 1111 0000 0111 0000
+
+# Set Endianness
+SETEND 1111 0001 0000 0001 0000 00 E:1 0 0000 0000 &setend
+
+# Preload instructions
+
+PLD 1111 0101 -101 ---- 1111 ---- ---- ---- # (imm, lit) 5te
+PLDW 1111 0101 -001 ---- 1111 ---- ---- ---- # (imm, lit) 7mp
+PLI 1111 0100 -101 ---- 1111 ---- ---- ---- # (imm, lit) 7
+
+PLD 1111 0111 -101 ---- 1111 ----- -- 0 ---- # (register) 5te
+PLDW 1111 0111 -001 ---- 1111 ----- -- 0 ---- # (register) 7mp
+PLI 1111 0110 -101 ---- 1111 ----- -- 0 ---- # (register) 7
+
+# Unallocated memory hints
+#
+# Since these are v7MP nops, and PLDW is v7MP and implemented as nop,
+# (ab)use the PLDW helper.
+
+PLDW 1111 0100 -001 ---- ---- ---- ---- ----
+PLDW 1111 0110 -001 ---- ---- ---- ---0 ----
diff --git a/target/arm/tcg/a32.decode b/target/arm/tcg/a32.decode
new file mode 100644
index 0000000..f2ca480
--- /dev/null
+++ b/target/arm/tcg/a32.decode
@@ -0,0 +1,557 @@
+# A32 conditional instructions
+#
+# Copyright (c) 2019 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ +# +# This file is processed by scripts/decodetree.py +# +# All of the insn that have a COND field in insn[31:28] are here. +# All insns that have 0xf in insn[31:28] are in a32-uncond.decode. +# + +&empty +&s_rrr_shi s rd rn rm shim shty +&s_rrr_shr s rn rd rm rs shty +&s_rri_rot s rn rd imm rot +&s_rrrr s rd rn rm ra +&rrrr rd rn rm ra +&rrr_rot rd rn rm rot +&rrr rd rn rm +&rr rd rm +&ri rd imm +&r rm +&i imm +&msr_reg rn r mask +&mrs_reg rd r +&msr_bank rn r sysm +&mrs_bank rd r sysm +&ldst_rr p w u rn rt rm shimm shtype +&ldst_ri p w u rn rt imm +&ldst_block rn i b u w list +&strex rn rd rt rt2 imm +&ldrex rn rt rt2 imm +&bfx rd rn lsb widthm1 +&bfi rd rn lsb msb +&sat rd rn satimm imm sh +&pkh rd rn rm imm tb +&mcr cp opc1 crn crm opc2 rt +&mcrr cp opc1 crm rt rt2 + +# Data-processing (register) + +@s_rrr_shi ---- ... .... s:1 rn:4 rd:4 shim:5 shty:2 . rm:4 \ + &s_rrr_shi +@s_rxr_shi ---- ... .... s:1 .... rd:4 shim:5 shty:2 . rm:4 \ + &s_rrr_shi rn=0 +@S_xrr_shi ---- ... .... . rn:4 .... shim:5 shty:2 . rm:4 \ + &s_rrr_shi s=1 rd=0 + +AND_rrri .... 000 0000 . .... .... ..... .. 0 .... @s_rrr_shi +EOR_rrri .... 000 0001 . .... .... ..... .. 0 .... @s_rrr_shi +SUB_rrri .... 000 0010 . .... .... ..... .. 0 .... @s_rrr_shi +RSB_rrri .... 000 0011 . .... .... ..... .. 0 .... @s_rrr_shi +ADD_rrri .... 000 0100 . .... .... ..... .. 0 .... @s_rrr_shi +ADC_rrri .... 000 0101 . .... .... ..... .. 0 .... @s_rrr_shi +SBC_rrri .... 000 0110 . .... .... ..... .. 0 .... @s_rrr_shi +RSC_rrri .... 000 0111 . .... .... ..... .. 0 .... @s_rrr_shi +TST_xrri .... 000 1000 1 .... 0000 ..... .. 0 .... @S_xrr_shi +TEQ_xrri .... 000 1001 1 .... 0000 ..... .. 0 .... @S_xrr_shi +CMP_xrri .... 000 1010 1 .... 0000 ..... .. 0 .... @S_xrr_shi +CMN_xrri .... 000 1011 1 .... 0000 ..... .. 0 .... @S_xrr_shi +ORR_rrri .... 000 1100 . .... .... ..... .. 0 .... @s_rrr_shi +MOV_rxri .... 000 1101 . 0000 .... ..... .. 0 .... @s_rxr_shi +BIC_rrri .... 000 1110 . .... .... ..... .. 0 .... @s_rrr_shi +MVN_rxri .... 000 1111 . 0000 .... ..... .. 0 .... @s_rxr_shi + +%imm16 16:4 0:12 +@mov16 ---- .... .... .... rd:4 ............ &ri imm=%imm16 + +MOVW .... 0011 0000 .... .... ............ @mov16 +MOVT .... 0011 0100 .... .... ............ @mov16 + +# Data-processing (register-shifted register) + +@s_rrr_shr ---- ... .... s:1 rn:4 rd:4 rs:4 . shty:2 . rm:4 \ + &s_rrr_shr +@s_rxr_shr ---- ... .... s:1 .... rd:4 rs:4 . shty:2 . rm:4 \ + &s_rrr_shr rn=0 +@S_xrr_shr ---- ... .... . rn:4 .... rs:4 . shty:2 . rm:4 \ + &s_rrr_shr rd=0 s=1 + +AND_rrrr .... 000 0000 . .... .... .... 0 .. 1 .... @s_rrr_shr +EOR_rrrr .... 000 0001 . .... .... .... 0 .. 1 .... @s_rrr_shr +SUB_rrrr .... 000 0010 . .... .... .... 0 .. 1 .... @s_rrr_shr +RSB_rrrr .... 000 0011 . .... .... .... 0 .. 1 .... @s_rrr_shr +ADD_rrrr .... 000 0100 . .... .... .... 0 .. 1 .... @s_rrr_shr +ADC_rrrr .... 000 0101 . .... .... .... 0 .. 1 .... @s_rrr_shr +SBC_rrrr .... 000 0110 . .... .... .... 0 .. 1 .... @s_rrr_shr +RSC_rrrr .... 000 0111 . .... .... .... 0 .. 1 .... @s_rrr_shr +TST_xrrr .... 000 1000 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +TEQ_xrrr .... 000 1001 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +CMP_xrrr .... 000 1010 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +CMN_xrrr .... 000 1011 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +ORR_rrrr .... 000 1100 . .... .... .... 0 .. 1 .... @s_rrr_shr +MOV_rxrr .... 000 1101 . 0000 .... .... 0 .. 1 .... @s_rxr_shr +BIC_rrrr .... 000 1110 . .... .... .... 0 .. 1 .... @s_rrr_shr +MVN_rxrr .... 000 1111 . 0000 .... .... 0 .. 
1 .... @s_rxr_shr + +# Data-processing (immediate) + +%a32extrot 8:4 !function=times_2 + +@s_rri_rot ---- ... .... s:1 rn:4 rd:4 .... imm:8 \ + &s_rri_rot rot=%a32extrot +@s_rxi_rot ---- ... .... s:1 .... rd:4 .... imm:8 \ + &s_rri_rot rot=%a32extrot rn=0 +@S_xri_rot ---- ... .... . rn:4 .... .... imm:8 \ + &s_rri_rot rot=%a32extrot rd=0 s=1 + +AND_rri .... 001 0000 . .... .... ............ @s_rri_rot +EOR_rri .... 001 0001 . .... .... ............ @s_rri_rot +SUB_rri .... 001 0010 . .... .... ............ @s_rri_rot +RSB_rri .... 001 0011 . .... .... ............ @s_rri_rot +ADD_rri .... 001 0100 . .... .... ............ @s_rri_rot +ADC_rri .... 001 0101 . .... .... ............ @s_rri_rot +SBC_rri .... 001 0110 . .... .... ............ @s_rri_rot +RSC_rri .... 001 0111 . .... .... ............ @s_rri_rot +TST_xri .... 001 1000 1 .... 0000 ............ @S_xri_rot +TEQ_xri .... 001 1001 1 .... 0000 ............ @S_xri_rot +CMP_xri .... 001 1010 1 .... 0000 ............ @S_xri_rot +CMN_xri .... 001 1011 1 .... 0000 ............ @S_xri_rot +ORR_rri .... 001 1100 . .... .... ............ @s_rri_rot +MOV_rxi .... 001 1101 . 0000 .... ............ @s_rxi_rot +BIC_rri .... 001 1110 . .... .... ............ @s_rri_rot +MVN_rxi .... 001 1111 . 0000 .... ............ @s_rxi_rot + +# Multiply and multiply accumulate + +@s_rdamn ---- .... ... s:1 rd:4 ra:4 rm:4 .... rn:4 &s_rrrr +@s_rd0mn ---- .... ... s:1 rd:4 .... rm:4 .... rn:4 &s_rrrr ra=0 +@rdamn ---- .... ... . rd:4 ra:4 rm:4 .... rn:4 &rrrr +@rd0mn ---- .... ... . rd:4 .... rm:4 .... rn:4 &rrrr ra=0 + +MUL .... 0000 000 . .... 0000 .... 1001 .... @s_rd0mn +MLA .... 0000 001 . .... .... .... 1001 .... @s_rdamn +UMAAL .... 0000 010 0 .... .... .... 1001 .... @rdamn +MLS .... 0000 011 0 .... .... .... 1001 .... @rdamn +UMULL .... 0000 100 . .... .... .... 1001 .... @s_rdamn +UMLAL .... 0000 101 . .... .... .... 1001 .... @s_rdamn +SMULL .... 0000 110 . .... .... .... 1001 .... @s_rdamn +SMLAL .... 0000 111 . .... .... .... 1001 .... @s_rdamn + +# Saturating addition and subtraction + +@rndm ---- .... .... rn:4 rd:4 .... .... rm:4 &rrr + +QADD .... 0001 0000 .... .... 0000 0101 .... @rndm +QSUB .... 0001 0010 .... .... 0000 0101 .... @rndm +QDADD .... 0001 0100 .... .... 0000 0101 .... @rndm +QDSUB .... 0001 0110 .... .... 0000 0101 .... @rndm + +# Halfword multiply and multiply accumulate + +SMLABB .... 0001 0000 .... .... .... 1000 .... @rdamn +SMLABT .... 0001 0000 .... .... .... 1100 .... @rdamn +SMLATB .... 0001 0000 .... .... .... 1010 .... @rdamn +SMLATT .... 0001 0000 .... .... .... 1110 .... @rdamn +SMLAWB .... 0001 0010 .... .... .... 1000 .... @rdamn +SMULWB .... 0001 0010 .... 0000 .... 1010 .... @rd0mn +SMLAWT .... 0001 0010 .... .... .... 1100 .... @rdamn +SMULWT .... 0001 0010 .... 0000 .... 1110 .... @rd0mn +SMLALBB .... 0001 0100 .... .... .... 1000 .... @rdamn +SMLALBT .... 0001 0100 .... .... .... 1100 .... @rdamn +SMLALTB .... 0001 0100 .... .... .... 1010 .... @rdamn +SMLALTT .... 0001 0100 .... .... .... 1110 .... @rdamn +SMULBB .... 0001 0110 .... 0000 .... 1000 .... @rd0mn +SMULBT .... 0001 0110 .... 0000 .... 1100 .... @rd0mn +SMULTB .... 0001 0110 .... 0000 .... 1010 .... @rd0mn +SMULTT .... 0001 0110 .... 0000 .... 1110 .... @rd0mn + +# MSR (immediate) and hints + +&msr_i r mask rot imm +@msr_i ---- .... .... mask:4 .... 
rot:4 imm:8 &msr_i + +{ + { + [ + YIELD ---- 0011 0010 0000 1111 ---- 0000 0001 + WFE ---- 0011 0010 0000 1111 ---- 0000 0010 + WFI ---- 0011 0010 0000 1111 ---- 0000 0011 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV ---- 0011 0010 0000 1111 ---- 0000 0100 + # SEVL ---- 0011 0010 0000 1111 ---- 0000 0101 + + ESB ---- 0011 0010 0000 1111 ---- 0001 0000 + ] + + # The canonical nop ends in 00000000, but the whole of the + # rest of the space executes as nop if otherwise unsupported. + NOP ---- 0011 0010 0000 1111 ---- ---- ---- + } + # Note mask = 0 is covered by NOP + MSR_imm .... 0011 0010 .... 1111 .... .... .... @msr_i r=0 +} +MSR_imm .... 0011 0110 .... 1111 .... .... .... @msr_i r=1 + +# Cyclic Redundancy Check + +CRC32B .... 0001 0000 .... .... 0000 0100 .... @rndm +CRC32H .... 0001 0010 .... .... 0000 0100 .... @rndm +CRC32W .... 0001 0100 .... .... 0000 0100 .... @rndm +CRC32CB .... 0001 0000 .... .... 0010 0100 .... @rndm +CRC32CH .... 0001 0010 .... .... 0010 0100 .... @rndm +CRC32CW .... 0001 0100 .... .... 0010 0100 .... @rndm + +# Miscellaneous instructions + +%sysm 8:1 16:4 +%imm16_8_0 8:12 0:4 + +@rm ---- .... .... .... .... .... .... rm:4 &r +@rdm ---- .... .... .... rd:4 .... .... rm:4 &rr +@i16 ---- .... .... .... .... .... .... .... &i imm=%imm16_8_0 + +MRS_bank ---- 0001 0 r:1 00 .... rd:4 001. 0000 0000 &mrs_bank %sysm +MSR_bank ---- 0001 0 r:1 10 .... 1111 001. 0000 rn:4 &msr_bank %sysm + +MRS_reg ---- 0001 0 r:1 00 1111 rd:4 0000 0000 0000 &mrs_reg +MSR_reg ---- 0001 0 r:1 10 mask:4 1111 0000 0000 rn:4 &msr_reg + +BX .... 0001 0010 1111 1111 1111 0001 .... @rm +BXJ .... 0001 0010 1111 1111 1111 0010 .... @rm +BLX_r .... 0001 0010 1111 1111 1111 0011 .... @rm + +CLZ .... 0001 0110 1111 .... 1111 0001 .... @rdm + +ERET ---- 0001 0110 0000 0000 0000 0110 1110 + +HLT .... 0001 0000 .... .... .... 0111 .... @i16 +BKPT .... 0001 0010 .... .... .... 0111 .... @i16 +HVC .... 0001 0100 .... .... .... 0111 .... @i16 +SMC ---- 0001 0110 0000 0000 0000 0111 imm:4 &i + +# Load/Store Dual, Half, Signed Byte (register) + +@ldst_rr_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=1 shimm=0 shtype=0 +@ldst_rr_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=0 w=0 shimm=0 shtype=0 + +STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_pw0 +STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p1w + +LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_pw0 +LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_p1w + +STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_pw0 +STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_p1w + +LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_pw0 +LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p1w + +LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_pw0 +LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p1w + +LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_pw0 +LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p1w + +# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, +# and act as normal post-indexed (P=0, W=0). +@ldst_rr_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=0 w=0 shimm=0 shtype=0 + +STRHT_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p0w1 +LDRHT_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p0w1 +LDRSBT_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p0w1 +LDRSHT_rr .... 000. .0.1 .... .... 0000 1111 .... 
@ldst_rr_p0w1 + +# Load/Store word and unsigned byte (register) + +@ldst_rs_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=1 +@ldst_rs_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=0 w=0 + +STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_pw0 +STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p1w +STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_pw0 +STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p1w + +LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_pw0 +LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p1w +LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_pw0 +LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p1w + +@ldst_rs_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=0 w=0 + +STRT_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p0w1 +STRBT_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p0w1 +LDRT_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p0w1 +LDRBT_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p0w1 + +# Load/Store Dual, Half, Signed Byte (immediate) + +%imm8s_8_0 8:4 0:4 +@ldst_ri8_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=1 +@ldst_ri8_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=0 w=0 + +STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_pw0 +STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p1w + +LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_pw0 +LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_p1w + +STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_pw0 +STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_p1w + +LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_pw0 +LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p1w + +LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_pw0 +LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p1w + +LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_pw0 +LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p1w + +# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, +# and act as normal post-indexed (P=0, W=0). +@ldst_ri8_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=0 w=0 + +STRHT_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p0w1 +LDRHT_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p0w1 +LDRSBT_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p0w1 +LDRSHT_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p0w1 + +# Load/Store word and unsigned byte (immediate) + +@ldst_ri12_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 imm:12 &ldst_ri p=1 +@ldst_ri12_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 + +STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p1w +STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_pw0 +STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p1w +STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_pw0 + +LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p1w +LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_pw0 +LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p1w +LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_pw0 + +@ldst_ri12_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 + +STRT_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p0w1 +STRBT_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p0w1 +LDRT_ri .... 010. .0.1 .... .... 
............ @ldst_ri12_p0w1 +LDRBT_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p0w1 + +# Synchronization primitives + +@swp ---- .... .... rn:4 rt:4 .... .... rt2:4 + +SWP .... 0001 0000 .... .... 0000 1001 .... @swp +SWPB .... 0001 0100 .... .... 0000 1001 .... @swp + +# Load/Store Exclusive and Load-Acquire/Store-Release +# +# Note rt2 for STREXD/LDREXD is set by the helper after checking rt is even. + +@strex ---- .... .... rn:4 rd:4 .... .... rt:4 \ + &strex imm=0 rt2=15 +@ldrex ---- .... .... rn:4 rt:4 .... .... .... \ + &ldrex imm=0 rt2=15 +@stl ---- .... .... rn:4 .... .... .... rt:4 \ + &ldrex imm=0 rt2=15 + +STREX .... 0001 1000 .... .... 1111 1001 .... @strex +STREXD_a32 .... 0001 1010 .... .... 1111 1001 .... @strex +STREXB .... 0001 1100 .... .... 1111 1001 .... @strex +STREXH .... 0001 1110 .... .... 1111 1001 .... @strex + +STLEX .... 0001 1000 .... .... 1110 1001 .... @strex +STLEXD_a32 .... 0001 1010 .... .... 1110 1001 .... @strex +STLEXB .... 0001 1100 .... .... 1110 1001 .... @strex +STLEXH .... 0001 1110 .... .... 1110 1001 .... @strex + +STL .... 0001 1000 .... 1111 1100 1001 .... @stl +STLB .... 0001 1100 .... 1111 1100 1001 .... @stl +STLH .... 0001 1110 .... 1111 1100 1001 .... @stl + +LDREX .... 0001 1001 .... .... 1111 1001 1111 @ldrex +LDREXD_a32 .... 0001 1011 .... .... 1111 1001 1111 @ldrex +LDREXB .... 0001 1101 .... .... 1111 1001 1111 @ldrex +LDREXH .... 0001 1111 .... .... 1111 1001 1111 @ldrex + +LDAEX .... 0001 1001 .... .... 1110 1001 1111 @ldrex +LDAEXD_a32 .... 0001 1011 .... .... 1110 1001 1111 @ldrex +LDAEXB .... 0001 1101 .... .... 1110 1001 1111 @ldrex +LDAEXH .... 0001 1111 .... .... 1110 1001 1111 @ldrex + +LDA .... 0001 1001 .... .... 1100 1001 1111 @ldrex +LDAB .... 0001 1101 .... .... 1100 1001 1111 @ldrex +LDAH .... 0001 1111 .... .... 1100 1001 1111 @ldrex + +# Media instructions + +# usad8 is usada8 w/ ra=15 +USADA8 ---- 0111 1000 rd:4 ra:4 rm:4 0001 rn:4 + +# ubfx and sbfx +@bfx ---- .... ... widthm1:5 rd:4 lsb:5 ... rn:4 &bfx + +SBFX .... 0111 101 ..... .... ..... 101 .... @bfx +UBFX .... 0111 111 ..... .... ..... 101 .... @bfx + +# bfc is bfi w/ rn=15 +BFCI ---- 0111 110 msb:5 rd:4 lsb:5 001 rn:4 &bfi + +# While we could get UDEF by not including this, add the pattern for +# documentation and to conflict with any other typos in this file. +UDF 1110 0111 1111 ---- ---- ---- 1111 ---- + +# Parallel addition and subtraction + +SADD16 .... 0110 0001 .... .... 1111 0001 .... @rndm +SASX .... 0110 0001 .... .... 1111 0011 .... @rndm +SSAX .... 0110 0001 .... .... 1111 0101 .... @rndm +SSUB16 .... 0110 0001 .... .... 1111 0111 .... @rndm +SADD8 .... 0110 0001 .... .... 1111 1001 .... @rndm +SSUB8 .... 0110 0001 .... .... 1111 1111 .... @rndm + +QADD16 .... 0110 0010 .... .... 1111 0001 .... @rndm +QASX .... 0110 0010 .... .... 1111 0011 .... @rndm +QSAX .... 0110 0010 .... .... 1111 0101 .... @rndm +QSUB16 .... 0110 0010 .... .... 1111 0111 .... @rndm +QADD8 .... 0110 0010 .... .... 1111 1001 .... @rndm +QSUB8 .... 0110 0010 .... .... 1111 1111 .... @rndm + +SHADD16 .... 0110 0011 .... .... 1111 0001 .... @rndm +SHASX .... 0110 0011 .... .... 1111 0011 .... @rndm +SHSAX .... 0110 0011 .... .... 1111 0101 .... @rndm +SHSUB16 .... 0110 0011 .... .... 1111 0111 .... @rndm +SHADD8 .... 0110 0011 .... .... 1111 1001 .... @rndm +SHSUB8 .... 0110 0011 .... .... 1111 1111 .... @rndm + +UADD16 .... 0110 0101 .... .... 1111 0001 .... @rndm +UASX .... 0110 0101 .... .... 1111 0011 .... @rndm +USAX .... 0110 0101 .... .... 1111 0101 .... 
@rndm +USUB16 .... 0110 0101 .... .... 1111 0111 .... @rndm +UADD8 .... 0110 0101 .... .... 1111 1001 .... @rndm +USUB8 .... 0110 0101 .... .... 1111 1111 .... @rndm + +UQADD16 .... 0110 0110 .... .... 1111 0001 .... @rndm +UQASX .... 0110 0110 .... .... 1111 0011 .... @rndm +UQSAX .... 0110 0110 .... .... 1111 0101 .... @rndm +UQSUB16 .... 0110 0110 .... .... 1111 0111 .... @rndm +UQADD8 .... 0110 0110 .... .... 1111 1001 .... @rndm +UQSUB8 .... 0110 0110 .... .... 1111 1111 .... @rndm + +UHADD16 .... 0110 0111 .... .... 1111 0001 .... @rndm +UHASX .... 0110 0111 .... .... 1111 0011 .... @rndm +UHSAX .... 0110 0111 .... .... 1111 0101 .... @rndm +UHSUB16 .... 0110 0111 .... .... 1111 0111 .... @rndm +UHADD8 .... 0110 0111 .... .... 1111 1001 .... @rndm +UHSUB8 .... 0110 0111 .... .... 1111 1111 .... @rndm + +# Packing, unpacking, saturation, and reversal + +PKH ---- 0110 1000 rn:4 rd:4 imm:5 tb:1 01 rm:4 &pkh + +@sat ---- .... ... satimm:5 rd:4 imm:5 sh:1 .. rn:4 &sat +@sat16 ---- .... .... satimm:4 rd:4 .... .... rn:4 \ + &sat imm=0 sh=0 + +SSAT .... 0110 101. .... .... .... ..01 .... @sat +USAT .... 0110 111. .... .... .... ..01 .... @sat + +SSAT16 .... 0110 1010 .... .... 1111 0011 .... @sat16 +USAT16 .... 0110 1110 .... .... 1111 0011 .... @sat16 + +@rrr_rot ---- .... .... rn:4 rd:4 rot:2 ...... rm:4 &rrr_rot + +SXTAB16 .... 0110 1000 .... .... ..00 0111 .... @rrr_rot +SXTAB .... 0110 1010 .... .... ..00 0111 .... @rrr_rot +SXTAH .... 0110 1011 .... .... ..00 0111 .... @rrr_rot +UXTAB16 .... 0110 1100 .... .... ..00 0111 .... @rrr_rot +UXTAB .... 0110 1110 .... .... ..00 0111 .... @rrr_rot +UXTAH .... 0110 1111 .... .... ..00 0111 .... @rrr_rot + +SEL .... 0110 1000 .... .... 1111 1011 .... @rndm +REV .... 0110 1011 1111 .... 1111 0011 .... @rdm +REV16 .... 0110 1011 1111 .... 1111 1011 .... @rdm +REVSH .... 0110 1111 1111 .... 1111 1011 .... @rdm +RBIT .... 0110 1111 1111 .... 1111 0011 .... @rdm + +# Signed multiply, signed and unsigned divide + +@rdmn ---- .... .... rd:4 .... rm:4 .... rn:4 &rrr + +SMLAD .... 0111 0000 .... .... .... 0001 .... @rdamn +SMLADX .... 0111 0000 .... .... .... 0011 .... @rdamn +SMLSD .... 0111 0000 .... .... .... 0101 .... @rdamn +SMLSDX .... 0111 0000 .... .... .... 0111 .... @rdamn + +SDIV .... 0111 0001 .... 1111 .... 0001 .... @rdmn +UDIV .... 0111 0011 .... 1111 .... 0001 .... @rdmn + +SMLALD .... 0111 0100 .... .... .... 0001 .... @rdamn +SMLALDX .... 0111 0100 .... .... .... 0011 .... @rdamn +SMLSLD .... 0111 0100 .... .... .... 0101 .... @rdamn +SMLSLDX .... 0111 0100 .... .... .... 0111 .... @rdamn + +SMMLA .... 0111 0101 .... .... .... 0001 .... @rdamn +SMMLAR .... 0111 0101 .... .... .... 0011 .... @rdamn +SMMLS .... 0111 0101 .... .... .... 1101 .... @rdamn +SMMLSR .... 0111 0101 .... .... .... 1111 .... @rdamn + +# Block data transfer + +STM ---- 100 b:1 i:1 u:1 w:1 0 rn:4 list:16 &ldst_block +LDM_a32 ---- 100 b:1 i:1 u:1 w:1 1 rn:4 list:16 &ldst_block + +# Branch, branch with link + +%imm26 0:s24 !function=times_4 +@branch ---- .... ........................ &i imm=%imm26 + +B .... 1010 ........................ @branch +BL .... 1011 ........................ @branch + +# Coprocessor instructions + +# We decode MCR, MCR, MRRC and MCRR only, because for QEMU the +# other coprocessor instructions always UNDEF. +# The trans_ functions for these will ignore cp values 8..13 for v7 or +# earlier, and 0..13 for v8 and later, because those areas of the +# encoding space may be used for other things, such as VFP or Neon. + +@mcr ---- .... opc1:3 . 
    crn:4 rt:4 cp:4 opc2:3 . crm:4 &mcr
+@mcrr ---- .... .... rt2:4 rt:4 cp:4 opc1:4 crm:4 &mcrr
+
+MCRR .... 1100 0100 .... .... .... .... .... @mcrr
+MRRC .... 1100 0101 .... .... .... .... .... @mcrr
+
+MCR .... 1110 ... 0 .... .... .... ... 1 .... @mcr
+MRC .... 1110 ... 1 .... .... .... ... 1 .... @mcr
+
+# Supervisor call
+
+SVC ---- 1111 imm:24 &i
diff --git a/target/arm/tcg/m-nocp.decode b/target/arm/tcg/m-nocp.decode
new file mode 100644
index 0000000..b65c801
--- /dev/null
+++ b/target/arm/tcg/m-nocp.decode
@@ -0,0 +1,72 @@
+# M-profile UserFault.NOCP exception handling
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+# For M-profile, the architecture specifies that NOCP UsageFaults
+# should take precedence over UNDEF faults over the whole wide
+# range of coprocessor-space encodings, with the exception of
+# VLLDM and VLSTM. (Compare v8.1M IsCPInstruction() pseudocode and
+# v8M Arm ARM rule R_QLGM.) This isn't mandatory for v8.0M but we choose
+# to behave the same as v8.1M.
+# This decode is handled before any others (and in particular before
+# decoding FP instructions which are in the coprocessor space).
+# If the coprocessor is not present or disabled then we will generate
+# the NOCP exception; otherwise we let the insn through to the main decode.
+
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
+
+&nocp cp
+
+# M-profile VLDR/VSTR to sysreg
+%vldr_sysreg 22:1 13:3
+%imm7_0x4 0:7 !function=times_4
+
+&vldr_sysreg rn reg imm a w p
+@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \
+             reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg
+
+{
+  # Special cases which do not take an early NOCP: VLLDM and VLSTM
+  VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000
+  # VSCCLRM (new in v8.1M) is similar:
+  VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3
+  VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2
+
+  # FP system register accesses: these are a special case because accesses
+  # to FPCXT_NS succeed even if the FPU is disabled. We therefore need
+  # to handle them before the big NOCP blocks. Note that within these
+  # insns NOCP still has higher priority than UNDEFs; this is implemented
+  # by their returning 'false' for UNDEF so as to fall through into the
+  # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding()
+  # for the UNDEFs there that must take precedence over NOCP.)
+
+  VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
+
+  # P=0 W=0 is SEE "Related encodings", so split into two patterns
+  VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1
+  VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+  VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1
+  VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+
+  NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp
+  NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp
+  # From v8.1M onwards this range will also NOCP:
+  NOCP_8_1 111- 1111 ---- ---- ---- ---- ---- ---- &nocp cp=10
+}
diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build
new file mode 100644
index 0000000..044561b
--- /dev/null
+++ b/target/arm/tcg/meson.build
@@ -0,0 +1,32 @@
+gen = [
+  decodetree.process('sve.decode', extra_args: '--decode=disas_sve'),
+  decodetree.process('sme.decode', extra_args: '--decode=disas_sme'),
+  decodetree.process('sme-fa64.decode', extra_args: '--static-decode=disas_sme_fa64'),
+  decodetree.process('neon-shared.decode', extra_args: '--decode=disas_neon_shared'),
+  decodetree.process('neon-dp.decode', extra_args: '--decode=disas_neon_dp'),
+  decodetree.process('neon-ls.decode', extra_args: '--decode=disas_neon_ls'),
+  decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'),
+  decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'),
+  decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'),
+  decodetree.process('mve.decode', extra_args: '--decode=disas_mve'),
+  decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'),
+  decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'),
+  decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'),
+  decodetree.process('t16.decode', extra_args: ['-w', '16', '--static-decode=disas_t16']),
+]
+
+arm_ss.add(gen)
+
+arm_ss.add(files(
+  'translate.c',
+  'translate-m-nocp.c',
+  'translate-mve.c',
+  'translate-neon.c',
+  'translate-vfp.c',
+))
+
+arm_ss.add(when: 'TARGET_AARCH64', if_true: files(
+  'translate-a64.c',
+  'translate-sve.c',
+  'translate-sme.c',
+))
diff --git a/target/arm/tcg/mve.decode b/target/arm/tcg/mve.decode
new file mode 100644
index 0000000..14a4f39
--- /dev/null
+++ b/target/arm/tcg/mve.decode
@@ -0,0 +1,832 @@
+# M-profile MVE instruction descriptions
+#
+# Copyright (c) 2021 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+%qd 22:1 13:3
+%qm 5:1 1:3
+%qn 7:1 17:3
+
+# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit
+%size_28 28:1 !function=plus_1
+
+# 2 operand fp insns have size in bit 20: 1 for 16 bit, 0 for 32 bit,
+# like Neon FP insns.
+%2op_fp_size 20:1 !function=neon_3same_fp_size +# VCADD is an exception, where bit 20 is 0 for 16 bit and 1 for 32 bit +%2op_fp_size_rev 20:1 !function=plus_1 +# FP scalars have size in bit 28, 1 for 16 bit, 0 for 32 bit +%2op_fp_scalar_size 28:1 !function=neon_3same_fp_size + +# 1imm format immediate +%imm_28_16_0 28:1 16:3 0:4 + +&vldr_vstr rn qd imm p a w size l u +&1op qd qm size +&2op qd qm qn size +&2scalar qd qn rm size +&1imm qd imm cmode op +&2shift qd qm shift size +&vidup qd rn size imm +&viwdup qd rn rm size imm +&vcmp qm qn size mask +&vcmp_scalar qn rm size mask +&shl_scalar qda rm size +&vmaxv qm rda size +&vabav qn qm rda size +&vldst_sg qd qm rn size msize os +&vldst_sg_imm qd qm a w imm +&vldst_il qd rn size pat w + +# scatter-gather memory size is in bits 6:4 +%sg_msize 6:1 4:1 + +@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 +# Note that both Rn and Qd are 3 bits only (no D bit) +@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr + +@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \ + qd=%qd qm=%qm msize=%sg_msize + +# Qm is in the fields usually labeled Qn +@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \ + qd=%qd qm=%qn + +# Deinterleaving load/interleaving store +@vldst_il .... .... .. w:1 . rn:4 .... ... size:2 pat:2 ..... &vldst_il \ + qd=%qd + +@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm +@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 +@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn +@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \ + size=%size_28 +@1imm .... .... .... .... .... cmode:4 .. op:1 . .... &1imm qd=%qd imm=%imm_28_16_0 + +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn. +# This gives us consistency with A64 and Neon. +@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm + +@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn +@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn + +@2_shl_b .... .... .. 001 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 +@2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 +@2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2 + +@2_shll_b .... .... ... 01 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 +@2_shll_h .... .... ... 1 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 +# VSHLL encoding T2 where shift == esize +@2_shll_esize_b .... .... .... 00 .. .... .... .... .... &2shift \ + qd=%qd qm=%qm size=0 shift=8 +@2_shll_esize_h .... .... .... 01 .. .... .... .... .... &2shift \ + qd=%qd qm=%qm size=1 shift=16 + +# Right shifts are encoded as N - shift, where N is the element size in bits. +%rshift_i5 16:5 !function=rsub_32 +%rshift_i4 16:4 !function=rsub_16 +%rshift_i3 16:3 !function=rsub_8 + +@2_shr_b .... .... .. 001 ... .... .... .... .... &2shift qd=%qd qm=%qm \ + size=0 shift=%rshift_i3 +@2_shr_h .... .... .. 01 .... .... .... .... 
.... &2shift qd=%qd qm=%qm \ + size=1 shift=%rshift_i4 +@2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \ + size=2 shift=%rshift_i5 + +@shl_scalar .... .... .... size:2 .. .... .... .... rm:4 &shl_scalar qda=%qd + +# Vector comparison; 4-bit Qm but 3-bit Qn +%mask_22_13 22:1 13:3 +@vcmp .... .... .. size:2 qn:3 . .... .... .... .... &vcmp qm=%qm mask=%mask_22_13 +@vcmp_scalar .... .... .. size:2 qn:3 . .... .... .... rm:4 &vcmp_scalar \ + mask=%mask_22_13 + +@vcmp_fp .... .... .... qn:3 . .... .... .... .... &vcmp \ + qm=%qm size=%2op_fp_scalar_size mask=%mask_22_13 + +# Bit 28 is a 2op_fp_scalar_size bit, but we do not decode it in this +# format to avoid complicated overlapping-instruction-groups +@vcmp_fp_scalar .... .... .... qn:3 . .... .... .... rm:4 &vcmp_scalar \ + mask=%mask_22_13 + +@vmaxv .... .... .... size:2 .. rda:4 .... .... .... &vmaxv qm=%qm + +@2op_fp .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qn qm=%qm size=%2op_fp_size + +@2op_fp_size_rev .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qn qm=%qm size=%2op_fp_size_rev + +# 2-operand, but Qd and Qn share a field. Size is in bit 28, but we +# don't decode it in this format +@vmaxnma .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qd qm=%qm + +# Here also we don't decode the bit 28 size in the format to avoid +# awkward nested overlap groups +@vmaxnmv .... .... .... .... rda:4 .... .... .... &vmaxv qm=%qm + +@2op_fp_scalar .... .... .... .... .... .... .... rm:4 &2scalar \ + qd=%qd qn=%qn size=%2op_fp_scalar_size + +# Vector loads and stores + +# Widening loads and narrowing stores: +# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding' +# This means we need to expand out to multiple patterns for P, W, SZ. +# For stores the U bit must be 0 but we catch that in the trans_ function. +# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from +# signed halfword element in register", etc. +VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=0 w=1 size=1 +VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=1 size=1 +VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 +VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 + +# Non-widening loads/stores (P=0 W=0 is 'related encoding') +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=0 w=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=1 + +# gather loads/scatter stores +VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg +VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg +VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg + +VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm +VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm +VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm +VSTRD_sg_imm 111 1 1101 ... 0 ... 
0 ... 1 1111 .... .... @vldst_sg_imm + +# deinterleaving loads/interleaving stores +VLD2 1111 1100 1 .. 1 .... ... 1 111 .. .. 00000 @vldst_il +VLD4 1111 1100 1 .. 1 .... ... 1 111 .. .. 00001 @vldst_il +VST2 1111 1100 1 .. 0 .... ... 1 111 .. .. 00000 @vldst_il +VST4 1111 1100 1 .. 0 .... ... 1 111 .. .. 00001 @vldst_il + +# Moves between 2 32-bit vector lanes and 2 general purpose registers +VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd +VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd + +# Vector 2-op +VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz + +VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op + +# The VSHLL T2 encoding is not a @2op pattern, but is here because it +# overlaps what would be size=0b11 VMULH/VRMULH +{ + VCVTB_SH 111 0 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz + + VMAXNMA 111 0 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=2 + + VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VQMOVUNB 111 0 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op + VQMOVN_BS 111 0 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op + + VMAXA 111 0 1110 0 . 11 .. 11 ... 0 1110 1 0 . 0 ... 1 @1op + + VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTB_HS 111 1 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz + + VMAXNMA 111 1 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=1 + + VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VMOVNB 111 1 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op + VQMOVN_BU 111 1 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op + + VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTT_SH 111 0 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz + + VMINNMA 111 0 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=2 + VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VQMOVUNT 111 0 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op + VQMOVN_TS 111 0 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op + + VMINA 111 0 1110 0 . 11 .. 11 ... 1 1110 1 0 . 0 ... 1 @1op + + VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTT_HS 111 1 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz + + VMINNMA 111 1 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=1 + VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VMOVNT 111 1 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op + VQMOVN_TU 111 1 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op + + VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +} + +VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op +VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 
1 ... 0 @2op + +VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op +VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op + +VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op +VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op + +{ + VMULLP_B 111 . 1110 0 . 11 ... 1 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 + VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op + VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +} +{ + VMULLP_T 111 . 1110 0 . 11 ... 1 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 + VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op + VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op +} + +VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op +VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op + +VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op + +VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev +VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev + +VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev +VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev + +VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev +VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev + +VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev +VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev + +{ + VCMUL0 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 + VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op + VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +} + +{ + VCMUL180 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 + VQDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op + VQDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +} + +{ + VCMUL90 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 1 @2op_sz28 + VQRDMLADH 111 0 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op + VQRDMLSDH 111 1 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCMUL270 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 1 @2op_sz28 + VQRDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + VQRDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op +} + +VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28 +VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 + +VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op +VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op + +{ + VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +{ + VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +# Vector miscellaneous + +VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op +VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op + +VREV16 1111 1111 1 . 11 .. 00 ... 
0 0001 01 . 0 ... 0 @1op +VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op +VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op + +VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz + +VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op +VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op +VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op +VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op + +VQABS 1111 1111 1 . 11 .. 00 ... 0 0111 01 . 0 ... 0 @1op +VQNEG 1111 1111 1 . 11 .. 00 ... 0 0111 11 . 0 ... 0 @1op + +&vdup qd rt size +# Qd is in the fields usually named Qn +@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup + +# B and E bits encode size, which we decode here to the usual size values +VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 + +# Incrementing and decrementing dup + +# VIDUP, VDDUP format immediate: 1 << (immh:imml) +%imm_vidup 7:1 0:1 !function=vidup_imm + +# VIDUP, VDDUP registers: Rm bits [3:1] from insn, bit 0 is 1; +# Rn bits [3:1] from insn, bit 0 is 0 +%vidup_rm 1:3 !function=times_2_plus_1 +%vidup_rn 17:3 !function=times_2 + +@vidup .... .... . . size:2 .... .... .... .... .... \ + qd=%qd imm=%imm_vidup rn=%vidup_rn &vidup +@viwdup .... .... . . size:2 .... .... .... .... .... \ + qd=%qd imm=%imm_vidup rm=%vidup_rm rn=%vidup_rn &viwdup +{ + VIDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 111 . @vidup + VIWDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 ... . @viwdup +} +{ + VCMPGT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=2 + VCMPLE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=2 + VDDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 111 . @vidup + VDWDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 ... . @viwdup +} + +# multiply-add long dual accumulate +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi 20:3 !function=times_2_plus_1 +%rdalo 13:3 !function=times_2 +# size bit is 0 for 16 bit, 1 for 32 bit +%size_16 16:1 !function=plus_1 + +&vmlaldav rdahi rdalo size qn qm x a +&vmladav rda size qn qm x a + +@vmlaldav .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav +@vmlaldav_nosz .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav +@vmladav .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ + qn=%qn rda=%rdalo size=%size_16 &vmladav +@vmladav_nosz .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ + qn=%qn rda=%rdalo size=0 &vmladav + +{ + VMLADAV_S 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav + VMLALDAV_S 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav +} +{ + VMLADAV_U 1111 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav + VMLALDAV_U 1111 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav +} + +{ + VMLSDAV 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 1 @vmladav + VMLSLDAV 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 1 @vmlaldav +} + +{ + VMLSDAV 1111 1110 1111 ... 0 ... . 1110 . 0 . 0 ... 1 @vmladav_nosz + VRMLSLDAVH 1111 1110 1 ... ... 0 ... . 1110 . 0 . 0 ... 1 @vmlaldav_nosz +} + +VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz +VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz + +{ + [ + VMAXNMAV 1110 1110 1110 11 00 .... 1111 0 0 . 0 ... 
0 @vmaxnmv size=2 + VMINNMAV 1110 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 + VMAXNMV 1110 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2 + VMINNMV 1110 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 + ] + [ + VMAXV_S 1110 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINV_S 1110 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv + VMAXAV 1110 1110 1110 .. 00 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINAV 1110 1110 1110 .. 00 .... 1111 1 0 . 0 ... 0 @vmaxv + ] + VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz + VRMLALDAVH_S 1110 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz +} + +{ + [ + VMAXNMAV 1111 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 + VMINNMAV 1111 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 + VMAXNMV 1111 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 + VMINNMV 1111 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 + ] + [ + VMAXV_U 1111 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINV_U 1111 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv + ] + VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz + VRMLALDAVH_U 1111 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz +} + +# Scalar operations + +{ + VCMPEQ_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=2 + VCMPNE_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=2 + VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar +} + +{ + VCMPLT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=2 + VCMPGE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=2 + VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar +} + +{ + VSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar + VRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar + VQSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar + VQRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar + VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +} + +{ + VSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar + VRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar + VQSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar + VQRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar + VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +} + +{ + VADD_fp_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 100 .... @2op_fp_scalar + VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar + VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +} + +{ + VSUB_fp_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 100 .... @2op_fp_scalar + VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar + VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar +} + +{ + VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VMUL_fp_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 110 .... @2op_fp_scalar + VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 
0 1110 . 110 .... @2scalar + VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar +} + +{ + VFMA_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 100 .... @2op_fp_scalar + # The U bit (28) is don't-care because it does not affect the result + VMLA 111 - 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar +} + +{ + VFMAS_scalar 111 . 1110 0 . 11 ... 1 ... 1 1110 . 100 .... @2op_fp_scalar + # The U bit (28) is don't-care because it does not affect the result + VMLAS 111 - 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar +} + +VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar +VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar +VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar +VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar + +# Vector add across vector +{ + VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo + VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \ + rdahi=%rdahi rdalo=%rdalo +} + +@vabav .... .... .. size:2 .... rda:4 .... .... .... &vabav qn=%qn qm=%qm + +VABAV_S 111 0 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav +VABAV_U 111 1 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav + +# Logical immediate operations (1 reg and modified-immediate) + +# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but +# not in a way we can conveniently represent in decodetree without +# a lot of repetition: +# VORR: op=0, (cmode & 1) && cmode < 12 +# VBIC: op=1, (cmode & 1) && cmode < 12 +# VMOV: everything else +# So we have a single decode line and check the cmode/op in the +# trans function. +Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm + +# Shifts by immediate + +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w + +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w + +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w + +VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b +VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h +VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_w + +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w + +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w + +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w + +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w + +# VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file +# Note that VMOVL is encoded as "VSHLL with a zero shift count"; we +# implement it that way rather than special-casing it in the decode. +VSHLL_BS 111 0 1110 1 . 1 .. ... ... 
0 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h + +# Shift-and-insert +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_b +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_h +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w + +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w + +# Narrowing shifts (which only support b and h sizes) +VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b +VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h +VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b +VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h + +VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b +VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h +VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b +VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h + +VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h + +VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b +VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h +VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b +VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h + +VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h +VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h +VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h +VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h + +VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b +VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h +VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b +VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h + +VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd + +# Comparisons. We expand out the conditions which are split across +# encodings T1, T2, T3 and the fc bits. These include VPT, which is +# effectively "VCMP then VPST". A plain "VCMP" has a mask field of zero. +{ + VCMPEQ_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp_fp + VCMPEQ 111 1 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 
0 @vcmp +} + +{ + VCMPNE_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp_fp + VCMPNE 111 1 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp +} + +{ + VCMPGE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp_fp + VCMPGE 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp +} + +{ + VCMPLT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp_fp + VCMPLT 111 1 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp +} + +{ + VCMPGT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp_fp + VCMPGT 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp +} + +{ + VCMPLE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp_fp + VCMPLE 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp +} + +{ + VPSEL 1111 1110 0 . 11 ... 1 ... 0 1111 . 0 . 0 ... 1 @2op_nosz + VCMPCS 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 1 @vcmp + VCMPHI 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 1 @vcmp +} + +{ + VPNOT 1111 1110 0 0 11 000 1 000 0 1111 0100 1101 + VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 + VCMPEQ_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=1 + VCMPEQ_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0100 .... @vcmp_scalar +} + +{ + VCMPNE_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=1 + VCMPNE_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1100 .... @vcmp_scalar +} + +{ + VCMPGT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=1 + VCMPGT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0110 .... @vcmp_scalar +} + +{ + VCMPLE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=1 + VCMPLE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1110 .... @vcmp_scalar +} + +{ + VCMPGE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=1 + VCMPGE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0100 .... @vcmp_scalar +} +{ + VCMPLT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=1 + VCMPLT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1100 .... @vcmp_scalar +} + +VCMPCS_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 1 0 .... @vcmp_scalar +VCMPHI_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 1 0 .... @vcmp_scalar + +# 2-operand FP +VADD_fp 1110 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp +VSUB_fp 1110 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp +VMUL_fp 1111 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 1 ... 0 @2op_fp +VABD_fp 1111 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp + +VMAXNM 1111 1111 0 . 0 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp +VMINNM 1111 1111 0 . 1 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp + +VCADD90_fp 1111 1100 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCADD270_fp 1111 1101 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev + +VFMA 1110 1111 0 . 0 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp +VFMS 1110 1111 0 . 1 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp + +VCMLA0 1111 110 00 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA90 1111 110 01 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA180 1111 110 10 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA270 1111 110 11 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev + +# floating-point <-> fixed-point conversions. Naming convention: +# VCVT_<from><to>, S = signed int, U = unsigned int, H = halfprec, F = singleprec +@vcvt .... .... .. 1 ..... .... .. 1 . .... .... &2shift \ + qd=%qd qm=%qm shift=%rshift_i5 size=2 +@vcvt_f16 .... .... .. 11 .... .... .. 0 . .... .... 
&2shift \ + qd=%qd qm=%qm shift=%rshift_i4 size=1 + +VCVT_SH_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 +VCVT_UH_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 + +VCVT_HS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 +VCVT_HU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 + +VCVT_SF_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt +VCVT_UF_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt + +VCVT_FS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt +VCVT_FU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt + +# VCVT between floating point and integer (halfprec and single); +# VCVT_<from><to>, S = signed int, U = unsigned int, F = float +VCVT_SF 1111 1111 1 . 11 .. 11 ... 0 011 00 1 . 0 ... 0 @1op +VCVT_UF 1111 1111 1 . 11 .. 11 ... 0 011 01 1 . 0 ... 0 @1op +VCVT_FS 1111 1111 1 . 11 .. 11 ... 0 011 10 1 . 0 ... 0 @1op +VCVT_FU 1111 1111 1 . 11 .. 11 ... 0 011 11 1 . 0 ... 0 @1op + +# VCVT from floating point to integer with specified rounding mode +VCVTAS 1111 1111 1 . 11 .. 11 ... 000 00 0 1 . 0 ... 0 @1op +VCVTAU 1111 1111 1 . 11 .. 11 ... 000 00 1 1 . 0 ... 0 @1op +VCVTNS 1111 1111 1 . 11 .. 11 ... 000 01 0 1 . 0 ... 0 @1op +VCVTNU 1111 1111 1 . 11 .. 11 ... 000 01 1 1 . 0 ... 0 @1op +VCVTPS 1111 1111 1 . 11 .. 11 ... 000 10 0 1 . 0 ... 0 @1op +VCVTPU 1111 1111 1 . 11 .. 11 ... 000 10 1 1 . 0 ... 0 @1op +VCVTMS 1111 1111 1 . 11 .. 11 ... 000 11 0 1 . 0 ... 0 @1op +VCVTMU 1111 1111 1 . 11 .. 11 ... 000 11 1 1 . 0 ... 0 @1op + +VRINTN 1111 1111 1 . 11 .. 10 ... 001 000 1 . 0 ... 0 @1op +VRINTX 1111 1111 1 . 11 .. 10 ... 001 001 1 . 0 ... 0 @1op +VRINTA 1111 1111 1 . 11 .. 10 ... 001 010 1 . 0 ... 0 @1op +VRINTZ 1111 1111 1 . 11 .. 10 ... 001 011 1 . 0 ... 0 @1op +VRINTM 1111 1111 1 . 11 .. 10 ... 001 101 1 . 0 ... 0 @1op +VRINTP 1111 1111 1 . 11 .. 10 ... 001 111 1 . 0 ... 0 @1op diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode new file mode 100644 index 0000000..fd3a01b --- /dev/null +++ b/target/arm/tcg/neon-dp.decode @@ -0,0 +1,646 @@ +# AArch32 Neon data-processing instruction descriptions +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# +# VFP/Neon register fields; same as vfp.decode +%vm_dp 5:1 0:4 +%vn_dp 7:1 16:4 +%vd_dp 22:1 12:4 + +# Encodings for Neon data processing instructions where the T32 encoding +# is a simple transformation of the A32 encoding. +# More specifically, this file covers instructions where the A32 encoding is +# 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq +# and the T32 encoding is +# 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq +# This file works on the A32 encoding only; calling code for T32 has to +# transform the insn into the A32 version first. 
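+#
+# As a sketch, that T32-to-A32 transformation only has to move the 'p'
+# bit from insn[28] down to insn[24] and force the top nibble to 0b1111.
+# Illustration only -- the helper name here is invented and this is not
+# necessarily the exact code the T32 caller uses:
+#
+#   static uint32_t neon_dp_t32_to_a32(uint32_t insn)
+#   {
+#       /* 0b111p_1111_qqqq... -> 0b1111_001p_qqqq...; insn[23:0] unchanged */
+#       return (insn & 0x00ffffff) | 0xf2000000 | ((insn >> 4) & (1 << 24));
+#   }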
+ +###################################################################### +# 3-reg-same grouping: +# 1111 001 U 0 D sz:2 Vn:4 Vd:4 opc:4 N Q M op Vm:4 +###################################################################### + +&3same vm vn vd q size + +@3same .... ... . . . size:2 .... .... .... . q:1 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp + +@3same_q0 .... ... . . . size:2 .... .... .... . 0 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 + +# For FP insns the high bit of 'size' is used as part of opcode decode, +# and the 'size' bit is 0 for 32-bit float and 1 for 16-bit float. +# This converts this encoding to the same MO_8/16/32/64 values that the +# integer neon insns use. +%3same_fp_size 20:1 !function=neon_3same_fp_size + +@3same_fp .... ... . . . . . .... .... .... . q:1 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%3same_fp_size +@3same_fp_q0 .... ... . . . . . .... .... .... . 0 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 size=%3same_fp_size + +VHADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 0 .... @3same +VHADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 0 .... @3same +VQADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 1 .... @3same +VQADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 1 .... @3same + +VRHADD_S_3s 1111 001 0 0 . .. .... .... 0001 . . . 0 .... @3same +VRHADD_U_3s 1111 001 1 0 . .. .... .... 0001 . . . 0 .... @3same + +@3same_logic .... ... . . . .. .... .... .... . q:1 .. .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 + +VAND_3s 1111 001 0 0 . 00 .... .... 0001 ... 1 .... @3same_logic +VBIC_3s 1111 001 0 0 . 01 .... .... 0001 ... 1 .... @3same_logic +VORR_3s 1111 001 0 0 . 10 .... .... 0001 ... 1 .... @3same_logic +VORN_3s 1111 001 0 0 . 11 .... .... 0001 ... 1 .... @3same_logic +VEOR_3s 1111 001 1 0 . 00 .... .... 0001 ... 1 .... @3same_logic +VBSL_3s 1111 001 1 0 . 01 .... .... 0001 ... 1 .... @3same_logic +VBIT_3s 1111 001 1 0 . 10 .... .... 0001 ... 1 .... @3same_logic +VBIF_3s 1111 001 1 0 . 11 .... .... 0001 ... 1 .... @3same_logic + +VHSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 0 .... @3same +VHSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 0 .... @3same + +VQSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 1 .... @3same +VQSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 1 .... @3same + +VCGT_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 0 .... @3same +VCGT_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 0 .... @3same +VCGE_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 1 .... @3same +VCGE_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 1 .... @3same + +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Dd, Dm, Dn where other insns use Dd, Dn, Dm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn, +# which allows us to avoid special-casing shifts in the trans_ +# function code. We would otherwise need to manually swap the operands +# over to call Neon helper functions that are shared with AArch64, +# which does not have this odd reversed-operand situation. +@3same_rev .... ... . . . size:2 .... .... .... . q:1 . . .... \ + &3same vn=%vm_dp vm=%vn_dp vd=%vd_dp + +VSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 0 .... @3same_rev +VSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 0 .... 
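@3same_rev
+
+# As a sketch, the %3same_fp_size conversion defined above for the
+# @3same_fp formats could look like this (an assumption about the
+# !function helper, not code taken from this patch):
+#
+#   static int neon_3same_fp_size(DisasContext *s, int x)
+#   {
+#       /* convert the size bit, 0 == fp32 and 1 == fp16, to MO_32 / MO_16 */
+#       return MO_32 - x;
+#   }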
+
+# Insns operating on 64-bit elements (size!=0b11 handled elsewhere)
+# The _rev suffix indicates that Vn and Vm are reversed (as explained
+# by the comment for the @3same_rev format).
+@3same_64_rev .... ... . . . 11 .... .... .... . q:1 . . .... \
+    &3same vm=%vn_dp vn=%vm_dp vd=%vd_dp size=3
+
+{
+  VQSHL_S64_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev
+  VQSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_rev
+}
+{
+  VQSHL_U64_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev
+  VQSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_rev
+}
+{
+  VRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev
+  VRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_rev
+}
+{
+  VRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev
+  VRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_rev
+}
+{
+  VQRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev
+  VQRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_rev
+}
+{
+  VQRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev
+  VQRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_rev
+}
+
+VMAX_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 0 .... @3same
+VMAX_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 0 .... @3same
+VMIN_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 1 .... @3same
+VMIN_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 1 .... @3same
+
+VABD_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 0 .... @3same
+VABD_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 0 .... @3same
+
+VABA_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 1 .... @3same
+VABA_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 1 .... @3same
+
+VADD_3s 1111 001 0 0 . .. .... .... 1000 . . . 0 .... @3same
+VSUB_3s 1111 001 1 0 . .. .... .... 1000 . . . 0 .... @3same
+
+VTST_3s 1111 001 0 0 . .. .... .... 1000 . . . 1 .... @3same
+VCEQ_3s 1111 001 1 0 . .. .... .... 1000 . . . 1 .... @3same
+
+VMLA_3s 1111 001 0 0 . .. .... .... 1001 . . . 0 .... @3same
+VMLS_3s 1111 001 1 0 . .. .... .... 1001 . . . 0 .... @3same
+
+VMUL_3s 1111 001 0 0 . .. .... .... 1001 . . . 1 .... @3same
+VMUL_p_3s 1111 001 1 0 . .. .... .... 1001 . . . 1 .... @3same
+
+VPMAX_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 0 .... @3same_q0
+VPMAX_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 0 .... @3same_q0
+
+VPMIN_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 1 .... @3same_q0
+VPMIN_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 1 .... @3same_q0
+
+VQDMULH_3s 1111 001 0 0 . .. .... .... 1011 . . . 0 .... @3same
+VQRDMULH_3s 1111 001 1 0 . .. .... .... 1011 . . . 0 .... @3same
+
+VPADD_3s 1111 001 0 0 . .. .... .... 1011 . . . 1 .... @3same_q0
+
+VQRDMLAH_3s 1111 001 1 0 . .. .... .... 1011 ... 1 .... @3same
+
+@3same_crypto .... .... .... .... .... .... .... .... \
+    &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 q=1
+
+SHA1C_3s 1111 001 0 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1P_3s 1111 001 0 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1M_3s 1111 001 0 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA1SU0_3s 1111 001 0 0 . 11 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256H_3s 1111 001 1 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256H2_3s 1111 001 1 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto
+SHA256SU1_3s 1111 001 1 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto
+
+VFMA_fp_3s 1111 001 0 0 . 0 . .... .... 1100 ... 1 .... @3same_fp
+VFMS_fp_3s 1111 001 0 0 . 1 . .... .... 1100 ... 1 ....
@3same_fp + +VQRDMLSH_3s 1111 001 1 0 . .. .... .... 1100 ... 1 .... @3same + +VADD_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp +VSUB_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp +VPADD_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0 +VABD_fp_3s 1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp +VMLA_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp +VMLS_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp +VMUL_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp +VCEQ_fp_3s 1111 001 0 0 . 0 . .... .... 1110 ... 0 .... @3same_fp +VCGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 0 .... @3same_fp +VACGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 1 .... @3same_fp +VCGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 0 .... @3same_fp +VACGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 1 .... @3same_fp +VMAX_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 0 .... @3same_fp +VMIN_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 0 .... @3same_fp +VPMAX_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0 +VPMIN_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0 +VRECPS_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 1 .... @3same_fp +VRSQRTS_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 1 .... @3same_fp +VMAXNM_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 1 .... @3same_fp +VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp + +###################################################################### +# 2-reg-and-shift grouping: +# 1111 001 U 1 D immH:3 immL:3 Vd:4 opc:4 L Q M 1 Vm:4 +###################################################################### +&2reg_shift vm vd q shift size + +# Right shifts are encoded as N - shift, where N is the element size in bits. +%neon_rshift_i6 16:6 !function=rsub_64 +%neon_rshift_i5 16:5 !function=rsub_32 +%neon_rshift_i4 16:4 !function=rsub_16 +%neon_rshift_i3 16:3 !function=rsub_8 + +@2reg_shr_d .... ... . . . ...... .... .... 1 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 shift=%neon_rshift_i6 +@2reg_shr_s .... ... . . . 1 ..... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 +@2reg_shr_h .... ... . . . 01 .... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 +@2reg_shr_b .... ... . . . 001 ... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i3 + +@2reg_shl_d .... ... . . . shift:6 .... .... 1 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 +@2reg_shl_s .... ... . . . 1 shift:5 .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 +@2reg_shl_h .... ... . . . 01 shift:4 .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 +@2reg_shl_b .... ... . . . 001 shift:3 .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 + +# Narrowing right shifts: here the Q bit is part of the opcode decode +@2reg_shrn_d .... ... . . . 1 ..... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 q=0 \ + shift=%neon_rshift_i5 +@2reg_shrn_s .... ... . . . 01 .... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 \ + shift=%neon_rshift_i4 +@2reg_shrn_h .... ... . . . 001 ... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \ + shift=%neon_rshift_i3 + +# Long left shifts: again Q is part of opcode decode +@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 +@2reg_shll_h .... ... . . . 
01 shift:4 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 +@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0 + +@2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 +@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 + +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b + +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b + +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b + +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b + +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b + +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b + +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b + +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b + +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_d +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_s +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_h +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_b + +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b + +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b + +VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 
1 .... @2reg_shl_d +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b + +VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b + +VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b + +VSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d +VSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s +VSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h + +VRSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d +VRSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s +VRSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h + +VQSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d +VQSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s +VQSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h + +VQRSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d +VQRSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s +VQRSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h + +# VQSHRN with signed input +VQSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d +VQSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s +VQSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h + +# VQRSHRN with signed input +VQRSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d +VQRSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s +VQRSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h + +# VQSHRN with unsigned input +VQSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d +VQSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s +VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h + +# VQRSHRN with unsigned input +VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d +VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s +VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h + +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b + +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b + +# VCVT fixed<->float conversions +VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 +VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 +VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 +VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 + +VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... 
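@2reg_vcvt
+VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
+VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
+VCVT_FU_2sh 1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
+
+# Since right shifts in this grouping are encoded as N - shift, the
+# rsub_* !functions referenced by the @2reg_shr and @2reg_vcvt formats
+# only need to subtract the immediate from the element width. A minimal
+# sketch, assuming the usual decodetree !function signature (illustrative,
+# not code taken from this patch):
+#
+#   static int rsub_64(DisasContext *s, int x) { return 64 - x; }
+#   static int rsub_32(DisasContext *s, int x) { return 32 - x; }
+#   static int rsub_16(DisasContext *s, int x) { return 16 - x; }
+#   static int rsub_8(DisasContext *s, int x)  { return 8 - x; }
+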
+######################################################################
+# 1-reg-and-modified-immediate grouping:
+# 1111 001 i 1 D 000 imm:3 Vd:4 cmode:4 0 Q op 1 Vm:4
+######################################################################
+
+&1reg_imm vd q imm cmode op
+
+%asimd_imm_value 24:1 16:3 0:4
+
+@1reg_imm .... ... . . . ... ... .... .... . q:1 . . .... \
+    &1reg_imm imm=%asimd_imm_value vd=%vd_dp
+
+# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but
+# not in a way we can conveniently represent in decodetree without
+# a lot of repetition:
+# VORR: op=0, (cmode & 1) && cmode < 12
+# VBIC: op=1, (cmode & 1) && cmode < 12
+# VMOV: everything else
+# So we have a single decode line and check the cmode/op in the
+# trans function.
+Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
+
+######################################################################
+# Within the "two registers, or three registers of different lengths"
+# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode
+# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar;
+# or they are a size field for the three-reg-different-lengths and
+# two-reg-and-scalar insn groups (where size cannot be 0b11). This
+# is slightly awkward for decodetree: we handle it with this
+# non-exclusive group which contains within it two exclusive groups:
+# one for the size=0b11 patterns, and one for the size-not-0b11
+# patterns. This allows us to check that none of the insns within
+# each subgroup accidentally overlap each other. Note that all the
+# trans functions for the size-not-0b11 patterns must check and
+# return false for size==3.
+######################################################################
+{
+  [
+    ##################################################################
+    # Miscellaneous size=0b11 insns
+    ##################################################################
+    VEXT 1111 001 0 1 . 11 .... .... imm:4 . q:1 . 0 .... \
+        vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+    VTBL 1111 001 1 1 . 11 .... .... 10 len:2 . op:1 . 0 .... \
+        vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+    VDUP_scalar 1111 001 1 1 . 11 index:3 1 .... 11 000 q:1 . 0 .... \
+        vm=%vm_dp vd=%vd_dp size=0
+    VDUP_scalar 1111 001 1 1 . 11 index:2 10 .... 11 000 q:1 . 0 .... \
+        vm=%vm_dp vd=%vd_dp size=1
+    VDUP_scalar 1111 001 1 1 . 11 index:1 100 .... 11 000 q:1 . 0 .... \
+        vm=%vm_dp vd=%vd_dp size=2
+
+    ##################################################################
+    # 2-reg-misc grouping:
+    # 1111 001 11 D 11 size:2 opc1:2 Vd:4 0 opc2:4 q:1 M 0 Vm:4
+    ##################################################################
+
+    &2misc vd vm q size
+
+    @2misc .... ... .. . .. size:2 .. .... . .... q:1 . . .... \
+        &2misc vm=%vm_dp vd=%vd_dp
+    @2misc_q0 .... ... .. . .. size:2 .. .... . .... . . . .... \
+        &2misc vm=%vm_dp vd=%vd_dp q=0
+    @2misc_q1 .... ... .. . .. size:2 .. .... . .... . . . .... \
+        &2misc vm=%vm_dp vd=%vd_dp q=1
+
+    VREV64 1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc
+    VREV32 1111 001 11 . 11 .. 00 .... 0 0001 . . 0 .... @2misc
+    VREV16 1111 001 11 . 11 .. 00 .... 0 0010 . . 0 .... @2misc
+
+    VPADDL_S 1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc
+    VPADDL_U 1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc
+
+    AESE 1111 001 11 . 11 .. 00 .... 0 0110 0 . 0 ....
@2misc_q1 + AESD 1111 001 11 . 11 .. 00 .... 0 0110 1 . 0 .... @2misc_q1 + AESMC 1111 001 11 . 11 .. 00 .... 0 0111 0 . 0 .... @2misc_q1 + AESIMC 1111 001 11 . 11 .. 00 .... 0 0111 1 . 0 .... @2misc_q1 + + VCLS 1111 001 11 . 11 .. 00 .... 0 1000 . . 0 .... @2misc + VCLZ 1111 001 11 . 11 .. 00 .... 0 1001 . . 0 .... @2misc + VCNT 1111 001 11 . 11 .. 00 .... 0 1010 . . 0 .... @2misc + + VMVN 1111 001 11 . 11 .. 00 .... 0 1011 . . 0 .... @2misc + + VPADAL_S 1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc + VPADAL_U 1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc + + VQABS 1111 001 11 . 11 .. 00 .... 0 1110 . . 0 .... @2misc + VQNEG 1111 001 11 . 11 .. 00 .... 0 1111 . . 0 .... @2misc + + VCGT0 1111 001 11 . 11 .. 01 .... 0 0000 . . 0 .... @2misc + VCGE0 1111 001 11 . 11 .. 01 .... 0 0001 . . 0 .... @2misc + VCEQ0 1111 001 11 . 11 .. 01 .... 0 0010 . . 0 .... @2misc + VCLE0 1111 001 11 . 11 .. 01 .... 0 0011 . . 0 .... @2misc + VCLT0 1111 001 11 . 11 .. 01 .... 0 0100 . . 0 .... @2misc + + SHA1H 1111 001 11 . 11 .. 01 .... 0 0101 1 . 0 .... @2misc_q1 + + VABS 1111 001 11 . 11 .. 01 .... 0 0110 . . 0 .... @2misc + VNEG 1111 001 11 . 11 .. 01 .... 0 0111 . . 0 .... @2misc + + VCGT0_F 1111 001 11 . 11 .. 01 .... 0 1000 . . 0 .... @2misc + VCGE0_F 1111 001 11 . 11 .. 01 .... 0 1001 . . 0 .... @2misc + VCEQ0_F 1111 001 11 . 11 .. 01 .... 0 1010 . . 0 .... @2misc + VCLE0_F 1111 001 11 . 11 .. 01 .... 0 1011 . . 0 .... @2misc + VCLT0_F 1111 001 11 . 11 .. 01 .... 0 1100 . . 0 .... @2misc + + VABS_F 1111 001 11 . 11 .. 01 .... 0 1110 . . 0 .... @2misc + VNEG_F 1111 001 11 . 11 .. 01 .... 0 1111 . . 0 .... @2misc + + VSWP 1111 001 11 . 11 .. 10 .... 0 0000 . . 0 .... @2misc + VTRN 1111 001 11 . 11 .. 10 .... 0 0001 . . 0 .... @2misc + VUZP 1111 001 11 . 11 .. 10 .... 0 0010 . . 0 .... @2misc + VZIP 1111 001 11 . 11 .. 10 .... 0 0011 . . 0 .... @2misc + + VMOVN 1111 001 11 . 11 .. 10 .... 0 0100 0 . 0 .... @2misc_q0 + # VQMOVUN: unsigned result (source is always signed) + VQMOVUN 1111 001 11 . 11 .. 10 .... 0 0100 1 . 0 .... @2misc_q0 + # VQMOVN: signed result, source may be signed (_S) or unsigned (_U) + VQMOVN_S 1111 001 11 . 11 .. 10 .... 0 0101 0 . 0 .... @2misc_q0 + VQMOVN_U 1111 001 11 . 11 .. 10 .... 0 0101 1 . 0 .... @2misc_q0 + + VSHLL 1111 001 11 . 11 .. 10 .... 0 0110 0 . 0 .... @2misc_q0 + + SHA1SU1 1111 001 11 . 11 .. 10 .... 0 0111 0 . 0 .... @2misc_q1 + SHA256SU0 1111 001 11 . 11 .. 10 .... 0 0111 1 . 0 .... @2misc_q1 + + VRINTN 1111 001 11 . 11 .. 10 .... 0 1000 . . 0 .... @2misc + VRINTX 1111 001 11 . 11 .. 10 .... 0 1001 . . 0 .... @2misc + VRINTA 1111 001 11 . 11 .. 10 .... 0 1010 . . 0 .... @2misc + VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc + + VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0 + VCVT_B16_F32 1111 001 11 . 11 .. 10 .... 0 1100 1 . 0 .... @2misc_q0 + + VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc + + VCVT_F32_F16 1111 001 11 . 11 .. 10 .... 0 1110 0 . 0 .... @2misc_q0 + + VRINTP 1111 001 11 . 11 .. 10 .... 0 1111 . . 0 .... @2misc + + VCVTAS 1111 001 11 . 11 .. 11 .... 0 0000 . . 0 .... @2misc + VCVTAU 1111 001 11 . 11 .. 11 .... 0 0001 . . 0 .... @2misc + VCVTNS 1111 001 11 . 11 .. 11 .... 0 0010 . . 0 .... @2misc + VCVTNU 1111 001 11 . 11 .. 11 .... 0 0011 . . 0 .... @2misc + VCVTPS 1111 001 11 . 11 .. 11 .... 0 0100 . . 0 .... @2misc + VCVTPU 1111 001 11 . 11 .. 11 .... 0 0101 . . 0 .... @2misc + VCVTMS 1111 001 11 . 11 .. 11 .... 0 0110 . . 0 .... @2misc + VCVTMU 1111 001 11 . 11 .. 11 .... 0 0111 . . 
0 .... @2misc + + VRECPE 1111 001 11 . 11 .. 11 .... 0 1000 . . 0 .... @2misc + VRSQRTE 1111 001 11 . 11 .. 11 .... 0 1001 . . 0 .... @2misc + VRECPE_F 1111 001 11 . 11 .. 11 .... 0 1010 . . 0 .... @2misc + VRSQRTE_F 1111 001 11 . 11 .. 11 .... 0 1011 . . 0 .... @2misc + VCVT_FS 1111 001 11 . 11 .. 11 .... 0 1100 . . 0 .... @2misc + VCVT_FU 1111 001 11 . 11 .. 11 .... 0 1101 . . 0 .... @2misc + VCVT_SF 1111 001 11 . 11 .. 11 .... 0 1110 . . 0 .... @2misc + VCVT_UF 1111 001 11 . 11 .. 11 .... 0 1111 . . 0 .... @2misc + ] + + # Subgroup for size != 0b11 + [ + ################################################################## + # 3-reg-different-length grouping: + # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4 + ################################################################## + + &3diff vm vn vd size + + @3diff .... ... . . . size:2 .... .... .... . . . . .... \ + &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp + + VADDL_S_3d 1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff + VADDL_U_3d 1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff + + VADDW_S_3d 1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff + VADDW_U_3d 1111 001 1 1 . .. .... .... 0001 . 0 . 0 .... @3diff + + VSUBL_S_3d 1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff + VSUBL_U_3d 1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff + + VSUBW_S_3d 1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff + VSUBW_U_3d 1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff + + VADDHN_3d 1111 001 0 1 . .. .... .... 0100 . 0 . 0 .... @3diff + VRADDHN_3d 1111 001 1 1 . .. .... .... 0100 . 0 . 0 .... @3diff + + VABAL_S_3d 1111 001 0 1 . .. .... .... 0101 . 0 . 0 .... @3diff + VABAL_U_3d 1111 001 1 1 . .. .... .... 0101 . 0 . 0 .... @3diff + + VSUBHN_3d 1111 001 0 1 . .. .... .... 0110 . 0 . 0 .... @3diff + VRSUBHN_3d 1111 001 1 1 . .. .... .... 0110 . 0 . 0 .... @3diff + + VABDL_S_3d 1111 001 0 1 . .. .... .... 0111 . 0 . 0 .... @3diff + VABDL_U_3d 1111 001 1 1 . .. .... .... 0111 . 0 . 0 .... @3diff + + VMLAL_S_3d 1111 001 0 1 . .. .... .... 1000 . 0 . 0 .... @3diff + VMLAL_U_3d 1111 001 1 1 . .. .... .... 1000 . 0 . 0 .... @3diff + + VQDMLAL_3d 1111 001 0 1 . .. .... .... 1001 . 0 . 0 .... @3diff + + VMLSL_S_3d 1111 001 0 1 . .. .... .... 1010 . 0 . 0 .... @3diff + VMLSL_U_3d 1111 001 1 1 . .. .... .... 1010 . 0 . 0 .... @3diff + + VQDMLSL_3d 1111 001 0 1 . .. .... .... 1011 . 0 . 0 .... @3diff + + VMULL_S_3d 1111 001 0 1 . .. .... .... 1100 . 0 . 0 .... @3diff + VMULL_U_3d 1111 001 1 1 . .. .... .... 1100 . 0 . 0 .... @3diff + + VQDMULL_3d 1111 001 0 1 . .. .... .... 1101 . 0 . 0 .... @3diff + + VMULL_P_3d 1111 001 0 1 . .. .... .... 1110 . 0 . 0 .... @3diff + + ################################################################## + # 2-regs-plus-scalar grouping: + # 1111 001 Q 1 D sz!=11 Vn:4 Vd:4 opc:4 N 1 M 0 Vm:4 + ################################################################## + &2scalar vm vn vd size q + + @2scalar .... ... q:1 . . size:2 .... .... .... . . . . .... \ + &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp + # For the 'long' ops the Q bit is part of insn decode + @2scalar_q0 .... ... . . . size:2 .... .... .... . . . . .... \ + &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 + + VMLA_2sc 1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar + VMLA_F_2sc 1111 001 . 1 . .. .... .... 0001 . 1 . 0 .... @2scalar + + VMLAL_S_2sc 1111 001 0 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 + VMLAL_U_2sc 1111 001 1 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 + + VQDMLAL_2sc 1111 001 0 1 . .. .... .... 0011 . 1 . 0 .... 
@2scalar_q0 + + VMLS_2sc 1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar + VMLS_F_2sc 1111 001 . 1 . .. .... .... 0101 . 1 . 0 .... @2scalar + + VMLSL_S_2sc 1111 001 0 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 + VMLSL_U_2sc 1111 001 1 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 + + VQDMLSL_2sc 1111 001 0 1 . .. .... .... 0111 . 1 . 0 .... @2scalar_q0 + + VMUL_2sc 1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar + VMUL_F_2sc 1111 001 . 1 . .. .... .... 1001 . 1 . 0 .... @2scalar + + VMULL_S_2sc 1111 001 0 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 + VMULL_U_2sc 1111 001 1 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 + + VQDMULL_2sc 1111 001 0 1 . .. .... .... 1011 . 1 . 0 .... @2scalar_q0 + + VQDMULH_2sc 1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar + VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar + + VQRDMLAH_2sc 1111 001 . 1 . .. .... .... 1110 . 1 . 0 .... @2scalar + VQRDMLSH_2sc 1111 001 . 1 . .. .... .... 1111 . 1 . 0 .... @2scalar + ] +} diff --git a/target/arm/tcg/neon-ls.decode b/target/arm/tcg/neon-ls.decode new file mode 100644 index 0000000..c5f364c --- /dev/null +++ b/target/arm/tcg/neon-ls.decode @@ -0,0 +1,52 @@ +# AArch32 Neon load/store instruction descriptions +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# + +# Encodings for Neon load/store instructions where the T32 encoding +# is a simple transformation of the A32 encoding. +# More specifically, this file covers instructions where the A32 encoding is +# 0b1111_0100_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx +# and the T32 encoding is +# 0b1111_1001_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx +# This file works on the A32 encoding only; calling code for T32 has to +# transform the insn into the A32 version first. + +%vd_dp 22:1 12:4 + +# Neon load/store multiple structures + +VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \ + vd=%vd_dp + +# Neon load single element to all lanes + +VLD_all_lanes 1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \ + vd=%vd_dp + +# Neon load/store single structure to one lane +%imm1_5_p1 5:1 !function=plus_1 +%imm1_6_p1 6:1 !function=plus_1 + +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 00 n:2 reg_idx:3 align:1 rm:4 \ + vd=%vd_dp size=0 stride=1 +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 01 n:2 reg_idx:2 . align:1 rm:4 \ + vd=%vd_dp size=1 stride=%imm1_5_p1 +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 10 n:2 reg_idx:1 . 
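align:2 rm:4 \
+ vd=%vd_dp size=2 stride=%imm1_6_p1
+
+# The %imm1_5_p1/%imm1_6_p1 fields above turn one encoding bit into a
+# register stride of 1 or 2 for the size=1 and size=2 forms. A minimal
+# sketch of the plus_1 helper they reference (an assumption about the
+# !function glue, not code taken from this patch; %vcadd_size in
+# neon-shared.decode below can reuse the same helper):
+#
+#   static int plus_1(DisasContext *s, int x)
+#   {
+#       return x + 1;
+#   }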
diff --git a/target/arm/tcg/neon-shared.decode b/target/arm/tcg/neon-shared.decode
new file mode 100644
index 0000000..8e6bd0b
--- /dev/null
+++ b/target/arm/tcg/neon-shared.decode
@@ -0,0 +1,99 @@
+# AArch32 Neon instruction descriptions
+#
+# Copyright (c) 2020 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+# Encodings for Neon instructions whose encoding is the same for
+# both A32 and T32.
+
+# More specifically, this covers:
+# 2reg scalar ext: 0b1111_1110_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx
+# 3same ext: 0b1111_110x_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx
+
+# VFP/Neon register fields; same as vfp.decode
+%vm_dp 5:1 0:4
+%vm_sp 0:4 5:1
+%vn_dp 7:1 16:4
+%vn_sp 16:4 7:1
+%vd_dp 22:1 12:4
+%vd_sp 12:4 22:1
+
+# For VCMLA/VCADD insns, convert the single-bit size field
+# which is 0 for fp16 and 1 for fp32 into a MO_* constant.
+# (Note that this is the reverse of the sense of the 1-bit size
+# field in the 3same_fp Neon insns.)
+%vcadd_size 20:1 !function=plus_1
+
+VCMLA 1111 110 rot:2 . 1 . .... .... 1000 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size
+
+VCADD 1111 110 rot:1 1 . 0 . .... .... 1000 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size
+
+VSDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 1 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUSDOT 1111 110 01 . 10 .... .... 1101 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VDOT_b16 1111 110 00 . 00 .... .... 1101 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+# VFM[AS]L
+VFML 1111 110 0 s:1 . 10 .... .... 1000 . 0 . 1 .... \
+    vm=%vm_sp vn=%vn_sp vd=%vd_dp q=0
+VFML 1111 110 0 s:1 . 10 .... .... 1000 . 1 . 1 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp q=1
+
+VSMMLA 1111 1100 0.10 .... .... 1100 .1.0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUMMLA 1111 1100 0.10 .... .... 1100 .1.1 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+VFMA_b16 1111 110 0 0.11 .... .... 1000 . q:1 . 1 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . q:1 index:1 0 vm:4 \
+    vn=%vn_dp vd=%vd_dp size=1
+VCMLA_scalar 1111 1110 1 . rot:2 .... .... 1000 . q:1 . 0 .... \
+    vm=%vm_dp vn=%vn_dp vd=%vd_dp size=2 index=0
+
+VSDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 0 vm:4 \
+    vn=%vn_dp vd=%vd_dp
+VUDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 1 vm:4 \
+    vn=%vn_dp vd=%vd_dp
+VUSDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \
+    vn=%vn_dp vd=%vd_dp
+VSUDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 1 vm:4 \
+    vn=%vn_dp vd=%vd_dp
+VDOT_b16_scal 1111 1110 0 . 00 .... .... 1101 .
q:1 index:1 0 vm:4 \ + vn=%vn_dp vd=%vd_dp + +%vfml_scalar_q0_rm 0:3 5:1 +%vfml_scalar_q1_index 5:1 3:1 +VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 0 . 1 index:1 ... \ + rm=%vfml_scalar_q0_rm vn=%vn_sp vd=%vd_dp q=0 +VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 1 . 1 . rm:3 \ + index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp q=1 +VFMA_b16_scal 1111 1110 0.11 .... .... 1000 . q:1 . 1 . vm:3 \ + index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp diff --git a/target/arm/tcg/sme-fa64.decode b/target/arm/tcg/sme-fa64.decode new file mode 100644 index 0000000..47708cc --- /dev/null +++ b/target/arm/tcg/sme-fa64.decode @@ -0,0 +1,60 @@ +# AArch64 SME allowed instruction decoding +# +# Copyright (c) 2022 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# + +# These patterns are taken from Appendix E1.1 of DDI0616 A.a, +# Arm Architecture Reference Manual Supplement, +# The Scalable Matrix Extension (SME), for Armv9-A + +{ + [ + OK 0-00 1110 0000 0001 0010 11-- ---- ---- # SMOV W|Xd,Vn.B[0] + OK 0-00 1110 0000 0010 0010 11-- ---- ---- # SMOV W|Xd,Vn.H[0] + OK 0100 1110 0000 0100 0010 11-- ---- ---- # SMOV Xd,Vn.S[0] + OK 0000 1110 0000 0001 0011 11-- ---- ---- # UMOV Wd,Vn.B[0] + OK 0000 1110 0000 0010 0011 11-- ---- ---- # UMOV Wd,Vn.H[0] + OK 0000 1110 0000 0100 0011 11-- ---- ---- # UMOV Wd,Vn.S[0] + OK 0100 1110 0000 1000 0011 11-- ---- ---- # UMOV Xd,Vn.D[0] + ] + FAIL 0--0 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD vector operations +} + +{ + [ + OK 0101 1110 --1- ---- 11-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar) + OK 0101 1110 -10- ---- 00-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar, FP16) + OK 01-1 1110 1-10 0001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar) + OK 01-1 1110 1111 1001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar, FP16) + ] + FAIL 01-1 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD single-element operations +} + +FAIL 0-00 110- ---- ---- ---- ---- ---- ---- # Advanced SIMD structure load/store +FAIL 1100 1110 ---- ---- ---- ---- ---- ---- # Advanced SIMD cryptography extensions +FAIL 0001 1110 0111 1110 0000 00-- ---- ---- # FJCVTZS + +# These are the "avoidance of doubt" final table of Illegal Advanced SIMD instructions +# We don't actually need to include these, as the default is OK. 
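+#
+# As a sketch, the trans_OK/trans_FAIL callbacks generated for the OK and
+# FAIL patterns above might simply record whether the insn is one of the
+# illegal-when-streaming cases (an assumption about the glue code, not
+# part of this patch):
+#
+#   static bool trans_OK(DisasContext *s, arg_OK *a) { return true; }
+#
+#   static bool trans_FAIL(DisasContext *s, arg_FAIL *a)
+#   {
+#       s->is_nonstreaming = true; /* assumed DisasContext flag */
+#       return true;
+#   }
+#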
+# -001 111- ---- ---- ---- ---- ---- ---- # Scalar floating-point operations +# --10 110- ---- ---- ---- ---- ---- ---- # Load/store pair of FP registers +# --01 1100 ---- ---- ---- ---- ---- ---- # Load FP register (PC-relative literal) +# --11 1100 --0- ---- ---- ---- ---- ---- # Load/store FP register (unscaled imm) +# --11 1100 --1- ---- ---- ---- ---- --10 # Load/store FP register (register offset) +# --11 1101 ---- ---- ---- ---- ---- ---- # Load/store FP register (scaled imm) diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode new file mode 100644 index 0000000..628804e --- /dev/null +++ b/target/arm/tcg/sme.decode @@ -0,0 +1,88 @@ +# AArch64 SME instruction descriptions +# +# Copyright (c) 2022 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# + +### SME Misc + +ZERO 11000000 00 001 00000000000 imm:8 + +### SME Move into/from Array + +%mova_rs 13:2 !function=plus_12 +&mova esz rs pg zr za_imm v:bool to_vec:bool + +MOVA 11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \ + &mova to_vec=0 rs=%mova_rs +MOVA 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \ + &mova to_vec=0 rs=%mova_rs esz=4 + +MOVA 11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \ + &mova to_vec=1 rs=%mova_rs +MOVA 11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \ + &mova to_vec=1 rs=%mova_rs esz=4 + +### SME Memory + +&ldst esz rs pg rn rm za_imm v:bool st:bool + +LDST1 1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ + &ldst rs=%mova_rs +LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ + &ldst esz=4 rs=%mova_rs + +&ldstr rv rn imm +@ldstr ....... ... . ...... .. ... rn:5 . imm:4 \ + &ldstr rv=%mova_rs + +LDR 1110000 100 0 000000 .. 000 ..... 0 .... @ldstr +STR 1110000 100 1 000000 .. 000 ..... 0 .... @ldstr + +### SME Add Vector to Array + +&adda zad zn pm pn +@adda_32 ........ .. ..... . pm:3 pn:3 zn:5 ... zad:2 &adda +@adda_64 ........ .. ..... . pm:3 pn:3 zn:5 .. zad:3 &adda + +ADDHA_s 11000000 10 01000 0 ... ... ..... 000 .. @adda_32 +ADDVA_s 11000000 10 01000 1 ... ... ..... 000 .. @adda_32 +ADDHA_d 11000000 11 01000 0 ... ... ..... 00 ... @adda_64 +ADDVA_d 11000000 11 01000 1 ... ... ..... 00 ... @adda_64 + +### SME Outer Product + +&op zad zn zm pm pn sub:bool +@op_32 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2 &op +@op_64 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3 &op + +FMOPA_s 10000000 100 ..... ... ... ..... . 00 .. @op_32 +FMOPA_d 10000000 110 ..... ... ... ..... . 0 ... @op_64 + +BFMOPA 10000001 100 ..... ... ... ..... . 00 .. @op_32 +FMOPA_h 10000001 101 ..... ... ... ..... . 00 .. @op_32 + +SMOPA_s 1010000 0 10 0 ..... ... ... ..... . 00 .. @op_32 +SUMOPA_s 1010000 0 10 1 ..... ... ... ..... . 00 .. @op_32 +USMOPA_s 1010000 1 10 0 ..... ... ... ..... . 00 .. @op_32 +UMOPA_s 1010000 1 10 1 ..... ... ... ..... . 00 .. @op_32 + +SMOPA_d 1010000 0 11 0 ..... ... ... ..... . 0 ... 
@op_64
+SUMOPA_d 1010000 0 11 1 ..... ... ... ..... . 0 ... @op_64
+USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64
+UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64
diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode
new file mode 100644
index 0000000..14b3a69
--- /dev/null
+++ b/target/arm/tcg/sve.decode
@@ -0,0 +1,1702 @@
+# AArch64 SVE instruction descriptions
+#
+# Copyright (c) 2017 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
+
+###########################################################################
+# Named fields. These are primarily for disjoint fields.
+
+%imm4_16_p1 16:4 !function=plus_1
+%imm6_22_5 22:1 5:5
+%imm7_22_16 22:2 16:5
+%imm8_16_10 16:5 10:3
+%imm9_16_10 16:s6 10:3
+%size_23 23:2
+%dtype_23_13 23:2 13:2
+%index3_22_19 22:1 19:2
+%index3_19_11 19:2 11:1
+%index2_20_11 20:1 11:1
+
+# A combination of tsz:imm3 -- extract esize.
+%tszimm_esz 22:2 5:5 !function=tszimm_esz
+# A combination of tsz:imm3 -- extract (2 * esize) - (tsz:imm3)
+%tszimm_shr 22:2 5:5 !function=tszimm_shr
+# A combination of tsz:imm3 -- extract (tsz:imm3) - esize
+%tszimm_shl 22:2 5:5 !function=tszimm_shl
+
+# Similarly for the tszh/tszl pair at 22/16 for zzi
+%tszimm16_esz 22:2 16:5 !function=tszimm_esz
+%tszimm16_shr 22:2 16:5 !function=tszimm_shr
+%tszimm16_shl 22:2 16:5 !function=tszimm_shl
+
+# Signed 8-bit immediate, optionally shifted left by 8.
+%sh8_i8s 5:9 !function=expand_imm_sh8s
+# Unsigned 8-bit immediate, optionally shifted left by 8.
+%sh8_i8u 5:9 !function=expand_imm_sh8u
+
+# Unsigned load of msz into esz=2, represented as a dtype.
+%msz_dtype 23:2 !function=msz_dtype
+
+# Either a copy of rd (at bit 0), or a different source
+# as propagated via the MOVPRFX instruction.
+%reg_movprfx 0:5
+
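+# As a sketch, the tszimm_* extraction helpers named above can be derived
+# from the position of the leading 1 bit of tsz; the bodies below are an
+# assumption about the !function helpers, not code from this patch
+# (clz32() as in qemu/host-utils.h):
+#
+#   static int tszimm_esz(DisasContext *s, int x)
+#   {
+#       x >>= 3;                /* discard imm3 */
+#       return 31 - clz32(x);   /* esz indexed by the leading tsz bit */
+#   }
+#
+#   static int tszimm_shr(DisasContext *s, int x)
+#   {
+#       return (16 << tszimm_esz(s, x)) - x;   /* (2 * esize) - tsz:imm3 */
+#   }
+#
+#   static int tszimm_shl(DisasContext *s, int x)
+#   {
+#       return x - (8 << tszimm_esz(s, x));    /* tsz:imm3 - esize */
+#   }
+#
+#   /* Signed 8-bit immediate, optionally shifted left by 8. */
+#   static int expand_imm_sh8s(DisasContext *s, int x)
+#   {
+#       return (int8_t)x << (x & 0x100 ? 8 : 0);
+#   }
+
+###########################################################################
+# Named attribute sets. These are used to make nice(er) names
+# when creating helpers common to those for the individual
+# instruction patterns.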
+ +&rr_esz rd rn esz +&rri rd rn imm +&rr_dbm rd rn dbm +&rrri rd rn rm imm +&rri_esz rd rn imm esz +&rrri_esz rd rn rm imm esz +&rrr_esz rd rn rm esz +&rrx_esz rd rn rm index esz +&rpr_esz rd pg rn esz +&rpr_s rd pg rn s +&rprr_s rd pg rn rm s +&rprr_esz rd pg rn rm esz +&rrrr_esz rd ra rn rm esz +&rrxr_esz rd rn rm ra index esz +&rprrr_esz rd pg rn rm ra esz +&rpri_esz rd pg rn imm esz +&ptrue rd esz pat s +&incdec_cnt rd pat esz imm d u +&incdec2_cnt rd rn pat esz imm d u +&incdec_pred rd pg esz d u +&incdec2_pred rd rn pg esz d u +&rprr_load rd pg rn rm dtype nreg +&rpri_load rd pg rn imm dtype nreg +&rprr_store rd pg rn rm msz esz nreg +&rpri_store rd pg rn imm msz esz nreg +&rprr_gather_load rd pg rn rm esz msz u ff xs scale +&rpri_gather_load rd pg rn imm esz msz u ff +&rprr_scatter_store rd pg rn rm esz msz xs scale +&rpri_scatter_store rd pg rn imm esz msz + +########################################################################### +# Named instruction formats. These are generally used to +# reduce the amount of duplication between instruction patterns. + +# Two operand with unused vector element size +@pd_pn_e0 ........ ........ ....... rn:4 . rd:4 &rr_esz esz=0 + +# Two operand +@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz +@rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz + +# Two operand with governing predicate, flags setting +@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s +@pd_pg_pn_s0 ........ . . ...... .. pg:4 . rn:4 . rd:4 &rpr_s s=0 + +# Three operand with unused vector element size +@rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 + +# Three predicate operand, with governing predicate, flag setting +@pd_pg_pn_pm_s ........ . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4 &rprr_s + +# Three operand, vector element size +@rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz +@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz +@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx +@rdn_rm_e0 ........ .. ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx esz=0 +@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \ + &rri_esz rn=%reg_movprfx imm=%sh8_i8u +@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \ + &rri_esz rn=%reg_movprfx +@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \ + &rri_esz rn=%reg_movprfx + +# Four operand, vector element size +@rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \ + &rrrr_esz ra=%reg_movprfx + +# Four operand with unused vector element size +@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \ + &rrrr_esz esz=0 ra=%reg_movprfx +@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \ + &rrrr_esz esz=0 rn=%reg_movprfx + +# Three operand with "memory" size, aka immediate left shift +@rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri + +# Two register operand, with governing predicate, vector element size +@rdn_pg_rm ........ esz:2 ... ... ... pg:3 rm:5 rd:5 \ + &rprr_esz rn=%reg_movprfx +@rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ + &rprr_esz rm=%reg_movprfx +@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz +@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz + +# Three register operand, with governing predicate, vector element size +@rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \ + &rprrr_esz ra=%reg_movprfx +@rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \ + &rprrr_esz rn=%reg_movprfx +@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \ + &rprrr_esz rn=%reg_movprfx +@rd_pg_rn_rm ........ esz:2 . rm:5 ... 
pg:3 rn:5 rd:5 &rprr_esz + +# One register operand, with governing predicate, vector element size +@rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz +@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz +@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz + +# One register operand, with governing predicate, no vector element size +@rd_pg_rn_e0 ........ .. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0 + +# Two register operands with a 6-bit signed immediate. +@rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri + +# Two register operand, one immediate operand, with predicate, +# element size encoded as TSZHL. +@rdn_pg_tszimm_shl ........ .. ... ... ... pg:3 ..... rd:5 \ + &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shl +@rdn_pg_tszimm_shr ........ .. ... ... ... pg:3 ..... rd:5 \ + &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shr + +# Similarly without predicate. +@rd_rn_tszimm_shl ........ .. ... ... ...... rn:5 rd:5 \ + &rri_esz esz=%tszimm16_esz imm=%tszimm16_shl +@rd_rn_tszimm_shr ........ .. ... ... ...... rn:5 rd:5 \ + &rri_esz esz=%tszimm16_esz imm=%tszimm16_shr + +# Two register operand, one immediate operand, with 4-bit predicate. +# User must fill in imm. +@rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \ + &rpri_esz rn=%reg_movprfx + +# Two register operand, one one-bit floating-point operand. +@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \ + &rpri_esz rn=%reg_movprfx + +# Two register operand, one encoded bitmask. +@rdn_dbm ........ .. .... dbm:13 rd:5 \ + &rr_dbm rn=%reg_movprfx + +# Predicate output, vector and immediate input, +# controlling predicate, element size. +@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz +@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz + +# Basic Load/Store with 9-bit immediate offset +@pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ + &rri imm=%imm9_16_10 +@rd_rn_i9 ........ ........ ...... rn:5 rd:5 \ + &rri imm=%imm9_16_10 + +# One register, pattern, and uint4+1. +# User must fill in U and D. +@incdec_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ + &incdec_cnt imm=%imm4_16_p1 +@incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ + &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx + +# One register, predicate. +# User must fill in U and D. +@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred +@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ + &incdec2_pred rn=%reg_movprfx + +# Loads; user must fill in NREG. +@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load +@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load + +@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_load dtype=%msz_dtype +@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \ + &rpri_load dtype=%msz_dtype + +# Gather Loads. +@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rpri_gather_load + +# Stores; user must fill in ESZ, MSZ, NREG as needed. 
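+#
+# For example, ST_zpri later in this file combines with
+# @rpri_store_msz: the fixed opcode bits and the esz:2 field come from
+# the pattern line itself, the format extracts msz from insn[24:23],
+# the signed immediate from insn[19:16], and pg/rn/rd from
+# insn[12:10], insn[9:5], insn[4:0], while the explicit nreg=0
+# argument supplies the remaining &rpri_store field.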
+@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store +@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store +@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_store nreg=0 +@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_scatter_store +@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \ + &rpri_scatter_store + +# Two registers and a scalar by N-bit index +@rrx_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrx_esz index=%index3_22_19 +@rrx_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 &rrx_esz +@rrx_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 &rrx_esz + +# Two registers and a scalar by N-bit index, alternate +@rrx_3a ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrx_esz index=%index3_19_11 +@rrx_2a ........ .. . . rm:4 ...... rn:5 rd:5 \ + &rrx_esz index=%index2_20_11 + +# Three registers and a scalar by N-bit index +@rrxr_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index3_22_19 +@rrxr_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx +@rrxr_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx + +# Three registers and a scalar by N-bit index, alternate +@rrxr_3a ........ .. ... rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index3_19_11 +@rrxr_2a ........ .. .. rm:4 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index2_20_11 + +########################################################################### +# Instruction patterns. Grouped according to the SVE encodingindex.xhtml. + +### SVE Integer Arithmetic - Binary Predicated Group + +# SVE bitwise logical vector operations (predicated) +ORR_zpzz 00000100 .. 011 000 000 ... ..... ..... @rdn_pg_rm +EOR_zpzz 00000100 .. 011 001 000 ... ..... ..... @rdn_pg_rm +AND_zpzz 00000100 .. 011 010 000 ... ..... ..... @rdn_pg_rm +BIC_zpzz 00000100 .. 011 011 000 ... ..... ..... @rdn_pg_rm + +# SVE integer add/subtract vectors (predicated) +ADD_zpzz 00000100 .. 000 000 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 001 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 011 000 ... ..... ..... @rdm_pg_rn # SUBR + +# SVE integer min/max/difference (predicated) +SMAX_zpzz 00000100 .. 001 000 000 ... ..... ..... @rdn_pg_rm +UMAX_zpzz 00000100 .. 001 001 000 ... ..... ..... @rdn_pg_rm +SMIN_zpzz 00000100 .. 001 010 000 ... ..... ..... @rdn_pg_rm +UMIN_zpzz 00000100 .. 001 011 000 ... ..... ..... @rdn_pg_rm +SABD_zpzz 00000100 .. 001 100 000 ... ..... ..... @rdn_pg_rm +UABD_zpzz 00000100 .. 001 101 000 ... ..... ..... @rdn_pg_rm + +# SVE integer multiply/divide (predicated) +MUL_zpzz 00000100 .. 010 000 000 ... ..... ..... @rdn_pg_rm +SMULH_zpzz 00000100 .. 010 010 000 ... ..... ..... @rdn_pg_rm +UMULH_zpzz 00000100 .. 010 011 000 ... ..... ..... @rdn_pg_rm +# Note that divide requires size >= 2; below 2 is unallocated. +SDIV_zpzz 00000100 .. 010 100 000 ... ..... ..... @rdn_pg_rm +UDIV_zpzz 00000100 .. 010 101 000 ... ..... ..... @rdn_pg_rm +SDIV_zpzz 00000100 .. 010 110 000 ... ..... ..... @rdm_pg_rn # SDIVR +UDIV_zpzz 00000100 .. 010 111 000 ... ..... ..... @rdm_pg_rn # UDIVR + +### SVE Integer Reduction Group + +# SVE bitwise logical reduction (predicated) +ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn +EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn +ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn + +# SVE constructive prefix (predicated) +MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn +MOVPRFX_m 00000100 .. 010 001 001 ... 
..... ..... @rd_pg_rn + +# SVE integer add reduction (predicated) +# Note that saddv requires size != 3. +UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn +SADDV 00000100 .. 000 000 001 ... ..... ..... @rd_pg_rn + +# SVE integer min/max reduction (predicated) +SMAXV 00000100 .. 001 000 001 ... ..... ..... @rd_pg_rn +UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn +SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn +UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn + +### SVE Shift by Immediate - Predicated Group + +# SVE bitwise shift by immediate (predicated) +ASR_zpzi 00000100 .. 000 000 100 ... .. ... ..... @rdn_pg_tszimm_shr +LSR_zpzi 00000100 .. 000 001 100 ... .. ... ..... @rdn_pg_tszimm_shr +LSL_zpzi 00000100 .. 000 011 100 ... .. ... ..... @rdn_pg_tszimm_shl +ASRD 00000100 .. 000 100 100 ... .. ... ..... @rdn_pg_tszimm_shr +SQSHL_zpzi 00000100 .. 000 110 100 ... .. ... ..... @rdn_pg_tszimm_shl +UQSHL_zpzi 00000100 .. 000 111 100 ... .. ... ..... @rdn_pg_tszimm_shl +SRSHR 00000100 .. 001 100 100 ... .. ... ..... @rdn_pg_tszimm_shr +URSHR 00000100 .. 001 101 100 ... .. ... ..... @rdn_pg_tszimm_shr +SQSHLU 00000100 .. 001 111 100 ... .. ... ..... @rdn_pg_tszimm_shl + +# SVE bitwise shift by vector (predicated) +ASR_zpzz 00000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm +LSR_zpzz 00000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm +LSL_zpzz 00000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm +ASR_zpzz 00000100 .. 010 100 100 ... ..... ..... @rdm_pg_rn # ASRR +LSR_zpzz 00000100 .. 010 101 100 ... ..... ..... @rdm_pg_rn # LSRR +LSL_zpzz 00000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # LSLR + +# SVE bitwise shift by wide elements (predicated) +# Note these require size != 3. +ASR_zpzw 00000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm +LSR_zpzw 00000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm +LSL_zpzw 00000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm + +### SVE Integer Arithmetic - Unary Predicated Group + +# SVE unary bit operations (predicated) +# Note esz != 0 for FABS and FNEG. +CLS 00000100 .. 011 000 101 ... ..... ..... @rd_pg_rn +CLZ 00000100 .. 011 001 101 ... ..... ..... @rd_pg_rn +CNT_zpz 00000100 .. 011 010 101 ... ..... ..... @rd_pg_rn +CNOT 00000100 .. 011 011 101 ... ..... ..... @rd_pg_rn +NOT_zpz 00000100 .. 011 110 101 ... ..... ..... @rd_pg_rn +FABS 00000100 .. 011 100 101 ... ..... ..... @rd_pg_rn +FNEG 00000100 .. 011 101 101 ... ..... ..... @rd_pg_rn + +# SVE integer unary operations (predicated) +# Note esz > original size for extensions. +ABS 00000100 .. 010 110 101 ... ..... ..... @rd_pg_rn +NEG 00000100 .. 010 111 101 ... ..... ..... @rd_pg_rn +SXTB 00000100 .. 010 000 101 ... ..... ..... @rd_pg_rn +UXTB 00000100 .. 010 001 101 ... ..... ..... @rd_pg_rn +SXTH 00000100 .. 010 010 101 ... ..... ..... @rd_pg_rn +UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn +SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn +UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn + +### SVE Floating Point Compare - Vectors Group + +# SVE floating-point compare vectors +FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... 
@pd_pg_rn_rm + +### SVE Integer Multiply-Add Group + +# SVE integer multiply-add writing addend (predicated) +MLA 00000100 .. 0 ..... 010 ... ..... ..... @rda_pg_rn_rm +MLS 00000100 .. 0 ..... 011 ... ..... ..... @rda_pg_rn_rm + +# SVE integer multiply-add writing multiplicand (predicated) +MLA 00000100 .. 0 ..... 110 ... ..... ..... @rdn_pg_ra_rm # MAD +MLS 00000100 .. 0 ..... 111 ... ..... ..... @rdn_pg_ra_rm # MSB + +### SVE Integer Arithmetic - Unpredicated Group + +# SVE integer add/subtract vectors (unpredicated) +ADD_zzz 00000100 .. 1 ..... 000 000 ..... ..... @rd_rn_rm +SUB_zzz 00000100 .. 1 ..... 000 001 ..... ..... @rd_rn_rm +SQADD_zzz 00000100 .. 1 ..... 000 100 ..... ..... @rd_rn_rm +UQADD_zzz 00000100 .. 1 ..... 000 101 ..... ..... @rd_rn_rm +SQSUB_zzz 00000100 .. 1 ..... 000 110 ..... ..... @rd_rn_rm +UQSUB_zzz 00000100 .. 1 ..... 000 111 ..... ..... @rd_rn_rm + +### SVE Logical - Unpredicated Group + +# SVE bitwise logical operations (unpredicated) +AND_zzz 00000100 00 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +ORR_zzz 00000100 01 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +EOR_zzz 00000100 10 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +BIC_zzz 00000100 11 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 + +XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \ + rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr + +# SVE2 bitwise ternary operations +EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 +BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 +BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 + +### SVE Index Generation Group + +# SVE index generation (immediate start, immediate increment) +INDEX_ii 00000100 esz:2 1 imm2:s5 010000 imm1:s5 rd:5 + +# SVE index generation (immediate start, register increment) +INDEX_ir 00000100 esz:2 1 rm:5 010010 imm:s5 rd:5 + +# SVE index generation (register start, immediate increment) +INDEX_ri 00000100 esz:2 1 imm:s5 010001 rn:5 rd:5 + +# SVE index generation (register start, register increment) +INDEX_rr 00000100 .. 1 ..... 010011 ..... ..... @rd_rn_rm + +### SVE / Streaming SVE Stack Allocation Group + +# SVE stack frame adjustment +ADDVL 00000100 001 ..... 01010 ...... ..... @rd_rn_i6 +ADDSVL 00000100 001 ..... 01011 ...... ..... @rd_rn_i6 +ADDPL 00000100 011 ..... 01010 ...... ..... @rd_rn_i6 +ADDSPL 00000100 011 ..... 01011 ...... ..... @rd_rn_i6 + +# SVE stack frame size +RDVL 00000100 101 11111 01010 imm:s6 rd:5 +RDSVL 00000100 101 11111 01011 imm:s6 rd:5 + +### SVE Bitwise Shift - Unpredicated Group + +# SVE bitwise shift by immediate (unpredicated) +ASR_zzi 00000100 .. 1 ..... 1001 00 ..... ..... @rd_rn_tszimm_shr +LSR_zzi 00000100 .. 1 ..... 1001 01 ..... ..... @rd_rn_tszimm_shr +LSL_zzi 00000100 .. 1 ..... 1001 11 ..... ..... @rd_rn_tszimm_shl + +# SVE bitwise shift by wide elements (unpredicated) +# Note esz != 3 +ASR_zzw 00000100 .. 1 ..... 1000 00 ..... ..... @rd_rn_rm +LSR_zzw 00000100 .. 1 ..... 1000 01 ..... ..... @rd_rn_rm +LSL_zzw 00000100 .. 1 ..... 1000 11 ..... ..... @rd_rn_rm + +### SVE Compute Vector Address Group + +# SVE vector address generation +ADR_s32 00000100 00 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_u32 00000100 01 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_p32 00000100 10 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... 
@rd_rn_msz_rm + +### SVE Integer Misc - Unpredicated Group + +# SVE constructive prefix (unpredicated) +MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5 + +# SVE floating-point exponential accelerator +# Note esz != 0 +FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn + +# SVE floating-point trig select coefficient +# Note esz != 0 +FTSSEL 00000100 .. 1 ..... 101100 ..... ..... @rd_rn_rm + +### SVE Element Count Group + +# SVE element count +CNT_r 00000100 .. 10 .... 1110 0 0 ..... ..... @incdec_cnt d=0 u=1 + +# SVE inc/dec register by element count +INCDEC_r 00000100 .. 11 .... 1110 0 d:1 ..... ..... @incdec_cnt u=1 + +# SVE saturating inc/dec register by element count +SINCDEC_r_32 00000100 .. 10 .... 1111 d:1 u:1 ..... ..... @incdec_cnt +SINCDEC_r_64 00000100 .. 11 .... 1111 d:1 u:1 ..... ..... @incdec_cnt + +# SVE inc/dec vector by element count +# Note this requires esz != 0. +INCDEC_v 00000100 .. 1 1 .... 1100 0 d:1 ..... ..... @incdec2_cnt u=1 + +# SVE saturating inc/dec vector by element count +# Note these require esz != 0. +SINCDEC_v 00000100 .. 1 0 .... 1100 d:1 u:1 ..... ..... @incdec2_cnt + +### SVE Bitwise Immediate Group + +# SVE bitwise logical with immediate (unpredicated) +ORR_zzi 00000101 00 0000 ............. ..... @rdn_dbm +EOR_zzi 00000101 01 0000 ............. ..... @rdn_dbm +AND_zzi 00000101 10 0000 ............. ..... @rdn_dbm + +# SVE broadcast bitmask immediate +DUPM 00000101 11 0000 dbm:13 rd:5 + +### SVE Integer Wide Immediate - Predicated Group + +# SVE copy floating-point immediate (predicated) +FCPY 00000101 .. 01 .... 110 imm:8 ..... @rdn_pg4 + +# SVE copy integer immediate (predicated) +{ + INVALID 00000101 00 01 ---- 01 1 -------- ----- + CPY_m_i 00000101 .. 01 .... 01 . ........ ..... @rdn_pg4 imm=%sh8_i8s +} +{ + INVALID 00000101 00 01 ---- 00 1 -------- ----- + CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s +} + +### SVE Permute - Extract Group + +# SVE extract vector (destructive) +EXT 00000101 001 ..... 000 ... rm:5 rd:5 \ + &rrri rn=%reg_movprfx imm=%imm8_16_10 + +# SVE2 extract vector (constructive) +EXT_sve2 00000101 011 ..... 000 ... rn:5 rd:5 \ + &rri imm=%imm8_16_10 + +### SVE Permute - Unpredicated Group + +# SVE broadcast general register +DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn + +# SVE broadcast indexed element +DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ + &rri imm=%imm7_22_16 + +# SVE insert SIMD&FP scalar register +INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm + +# SVE insert general register +INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm + +# SVE reverse vector elements +REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn + +# SVE vector table lookup +TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm + +# SVE unpack vector elements +UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5 + +# SVE2 Table Lookup (three sources) + +TBL_sve2 00000101 .. 1 ..... 001010 ..... ..... @rd_rn_rm +TBX 00000101 .. 1 ..... 001011 ..... ..... @rd_rn_rm + +### SVE Permute - Predicates Group + +# SVE permute predicate elements +ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm +ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm +UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm +UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm +TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm +TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm + +# SVE reverse predicate elements +REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... 
@pd_pn
+
+# SVE unpack predicate elements
+PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0
+PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0
+
+### SVE Permute - Interleaving Group
+
+# SVE permute vector elements
+ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm
+ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm
+UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm
+UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm
+TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm
+TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm
+
+# SVE2 permute vector segments
+ZIP1_q 00000101 10 1 ..... 000 000 ..... ..... @rd_rn_rm_e0
+ZIP2_q 00000101 10 1 ..... 000 001 ..... ..... @rd_rn_rm_e0
+UZP1_q 00000101 10 1 ..... 000 010 ..... ..... @rd_rn_rm_e0
+UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0
+TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0
+TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0
+
+### SVE Permute - Predicated Group
+
+# SVE compress active elements
+# Note esz >= 2
+COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn
+
+# SVE conditionally broadcast element to vector
+CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm
+CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm
+
+# SVE conditionally copy element to SIMD&FP scalar
+CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn
+CLASTB_v 00000101 .. 10101 1 100 ... ..... ..... @rd_pg_rn
+
+# SVE conditionally copy element to general register
+CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn
+CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... @rd_pg_rn
+
+# SVE copy element to SIMD&FP scalar register
+LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn
+LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn
+
+# SVE copy element to general register
+LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn
+LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn
+
+# SVE copy element from SIMD&FP scalar register
+CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn
+
+# SVE copy element from general register to vector (predicated)
+CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn
+
+# SVE reverse within elements
+# Note esz >= operation size
+REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn
+REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn
+REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn
+RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn
+REVD 00000101 00 1011 10 100 ... ..... ..... @rd_pg_rn_e0
+
+# SVE vector splice (predicated, destructive)
+SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm
+
+# SVE2 vector splice (predicated, constructive)
+SPLICE_sve2 00000101 .. 101 101 100 ... ..... ..... @rd_pg_rn
+
+### SVE Select Vectors Group
+
+# SVE select vector elements (predicated)
+SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm
+
+### SVE Integer Compare - Vectors Group
+
+# SVE integer compare vectors
+CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm
+CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm
+CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
+CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
+CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm
+CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm
+
+# SVE integer compare with wide elements
+# Note these require esz != 3.
+CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 ....
@pd_pg_rn_rm +CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm +CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm + +### SVE Integer Compare - Unsigned Immediate Group + +# SVE integer compare with unsigned immediate +CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7 +CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7 +CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7 +CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7 + +### SVE Integer Compare - Signed Immediate Group + +# SVE integer compare with signed immediate +CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5 +CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5 +CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5 +CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5 +CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5 +CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5 + +### SVE Predicate Logical Operations Group + +# SVE predicate logical operations +AND_pppp 00100101 0. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s +BIC_pppp 00100101 0. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s +EOR_pppp 00100101 0. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s +SEL_pppp 00100101 0. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s +ORR_pppp 00100101 1. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s +ORN_pppp 00100101 1. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s +NOR_pppp 00100101 1. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s +NAND_pppp 00100101 1. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s + +### SVE Predicate Misc Group + +# SVE predicate test +PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000 + +# SVE predicate initialize +PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4 + +# SVE initialize FFR +SETFFR 00100101 0010 1100 1001 0000 0000 0000 + +# SVE zero predicate register +PFALSE 00100101 0001 1000 1110 0100 0000 rd:4 + +# SVE predicate read from FFR (predicated) +RDFFR_p 00100101 0 s:1 0110001111000 pg:4 0 rd:4 + +# SVE predicate read from FFR (unpredicated) +RDFFR 00100101 0001 1001 1111 0000 0000 rd:4 + +# SVE FFR write from predicate (WRFFR) +WRFFR 00100101 0010 1000 1001 000 rn:4 00000 + +# SVE predicate first active +PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 + +# SVE predicate next active +PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn + +### SVE Partition Break Group + +# SVE propagate break from previous partition +BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... @pd_pg_pn_pm_s +BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s + +# SVE partition break condition +BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKA_m 00100101 00 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 +BRKB_m 00100101 10 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 + +# SVE propagate break to next partition +BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s + +### SVE Predicate Count Group + +# SVE predicate count +CNTP 00100101 .. 
100 000 10 .... 0 .... ..... @rd_pg4_pn + +# SVE inc/dec register by predicate count +INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 + +# SVE inc/dec vector by predicate count +INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1 + +# SVE saturating inc/dec register by predicate count +SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred +SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred + +# SVE saturating inc/dec vector by predicate count +SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred + +### SVE Integer Compare - Scalars Group + +# SVE conditionally terminate scalars +CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 + +# SVE integer compare scalar count and limit +WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4 + +# SVE2 pointer conflict compare +WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4 + +### SVE Integer Wide Immediate - Unpredicated Group + +# SVE broadcast floating-point immediate (unpredicated) +FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5 + +# SVE broadcast integer immediate (unpredicated) +{ + INVALID 00100101 00 111 00 011 1 -------- ----- + DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s +} + +# SVE integer add/subtract immediate (unpredicated) +{ + INVALID 00100101 00 100 000 11 1 -------- ----- + ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 001 11 1 -------- ----- + SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 011 11 1 -------- ----- + SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 100 11 1 -------- ----- + SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 101 11 1 -------- ----- + UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 110 11 1 -------- ----- + SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 111 11 1 -------- ----- + UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u +} + +# SVE integer min/max immediate (unpredicated) +SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s +UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u +SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s +UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u + +# SVE integer multiply immediate (unpredicated) +MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s + +# SVE integer dot product (unpredicated) +DOT_zzzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex dot product (vectors) +CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx + +#### SVE Multiply - Indexed + +# SVE integer dot product (indexed) +SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 +SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 +UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 + +# SVE2 integer multiply-add (indexed) +MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1 +MLA_zzxz_s 01000100 10 1 ..... 000010 ..... ..... @rrxr_2 esz=2 +MLA_zzxz_d 01000100 11 1 ..... 000010 ..... ..... @rrxr_1 esz=3 +MLS_zzxz_h 01000100 0. 1 ..... 000011 ..... ..... @rrxr_3 esz=1 +MLS_zzxz_s 01000100 10 1 ..... 000011 ..... ..... @rrxr_2 esz=2 +MLS_zzxz_d 01000100 11 1 ..... 000011 ..... ..... 
@rrxr_1 esz=3 + +# SVE2 saturating multiply-add high (indexed) +SQRDMLAH_zzxz_h 01000100 0. 1 ..... 000100 ..... ..... @rrxr_3 esz=1 +SQRDMLAH_zzxz_s 01000100 10 1 ..... 000100 ..... ..... @rrxr_2 esz=2 +SQRDMLAH_zzxz_d 01000100 11 1 ..... 000100 ..... ..... @rrxr_1 esz=3 +SQRDMLSH_zzxz_h 01000100 0. 1 ..... 000101 ..... ..... @rrxr_3 esz=1 +SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2 +SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3 + +# SVE mixed sign dot product (indexed) +USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2 +SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2 + +# SVE2 saturating multiply-add (indexed) +SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2 +SQDMLALB_zzxw_d 01000100 11 1 ..... 0010.0 ..... ..... @rrxr_2a esz=3 +SQDMLALT_zzxw_s 01000100 10 1 ..... 0010.1 ..... ..... @rrxr_3a esz=2 +SQDMLALT_zzxw_d 01000100 11 1 ..... 0010.1 ..... ..... @rrxr_2a esz=3 +SQDMLSLB_zzxw_s 01000100 10 1 ..... 0011.0 ..... ..... @rrxr_3a esz=2 +SQDMLSLB_zzxw_d 01000100 11 1 ..... 0011.0 ..... ..... @rrxr_2a esz=3 +SQDMLSLT_zzxw_s 01000100 10 1 ..... 0011.1 ..... ..... @rrxr_3a esz=2 +SQDMLSLT_zzxw_d 01000100 11 1 ..... 0011.1 ..... ..... @rrxr_2a esz=3 + +# SVE2 complex integer dot product (indexed) +CDOT_zzxw_s 01000100 10 1 index:2 rm:3 0100 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +CDOT_zzxw_d 01000100 11 1 index:1 rm:4 0100 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex integer multiply-add (indexed) +CMLA_zzxz_h 01000100 10 1 index:2 rm:3 0110 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +CMLA_zzxz_s 01000100 11 1 index:1 rm:4 0110 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex saturating integer multiply-add (indexed) +SQRDCMLAH_zzxz_h 01000100 10 1 index:2 rm:3 0111 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +SQRDCMLAH_zzxz_s 01000100 11 1 index:1 rm:4 0111 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 multiply-add long (indexed) +SMLALB_zzxw_s 01000100 10 1 ..... 1000.0 ..... ..... @rrxr_3a esz=2 +SMLALB_zzxw_d 01000100 11 1 ..... 1000.0 ..... ..... @rrxr_2a esz=3 +SMLALT_zzxw_s 01000100 10 1 ..... 1000.1 ..... ..... @rrxr_3a esz=2 +SMLALT_zzxw_d 01000100 11 1 ..... 1000.1 ..... ..... @rrxr_2a esz=3 +UMLALB_zzxw_s 01000100 10 1 ..... 1001.0 ..... ..... @rrxr_3a esz=2 +UMLALB_zzxw_d 01000100 11 1 ..... 1001.0 ..... ..... @rrxr_2a esz=3 +UMLALT_zzxw_s 01000100 10 1 ..... 1001.1 ..... ..... @rrxr_3a esz=2 +UMLALT_zzxw_d 01000100 11 1 ..... 1001.1 ..... ..... @rrxr_2a esz=3 +SMLSLB_zzxw_s 01000100 10 1 ..... 1010.0 ..... ..... @rrxr_3a esz=2 +SMLSLB_zzxw_d 01000100 11 1 ..... 1010.0 ..... ..... @rrxr_2a esz=3 +SMLSLT_zzxw_s 01000100 10 1 ..... 1010.1 ..... ..... @rrxr_3a esz=2 +SMLSLT_zzxw_d 01000100 11 1 ..... 1010.1 ..... ..... @rrxr_2a esz=3 +UMLSLB_zzxw_s 01000100 10 1 ..... 1011.0 ..... ..... @rrxr_3a esz=2 +UMLSLB_zzxw_d 01000100 11 1 ..... 1011.0 ..... ..... @rrxr_2a esz=3 +UMLSLT_zzxw_s 01000100 10 1 ..... 1011.1 ..... ..... @rrxr_3a esz=2 +UMLSLT_zzxw_d 01000100 11 1 ..... 1011.1 ..... ..... @rrxr_2a esz=3 + +# SVE2 integer multiply long (indexed) +SMULLB_zzx_s 01000100 10 1 ..... 1100.0 ..... ..... @rrx_3a esz=2 +SMULLB_zzx_d 01000100 11 1 ..... 1100.0 ..... ..... @rrx_2a esz=3 +SMULLT_zzx_s 01000100 10 1 ..... 1100.1 ..... ..... @rrx_3a esz=2 +SMULLT_zzx_d 01000100 11 1 ..... 1100.1 ..... ..... @rrx_2a esz=3 +UMULLB_zzx_s 01000100 10 1 ..... 1101.0 ..... ..... @rrx_3a esz=2 +UMULLB_zzx_d 01000100 11 1 ..... 1101.0 ..... ..... @rrx_2a esz=3 +UMULLT_zzx_s 01000100 10 1 ..... 
1101.1 ..... ..... @rrx_3a esz=2 +UMULLT_zzx_d 01000100 11 1 ..... 1101.1 ..... ..... @rrx_2a esz=3 + +# SVE2 saturating multiply (indexed) +SQDMULLB_zzx_s 01000100 10 1 ..... 1110.0 ..... ..... @rrx_3a esz=2 +SQDMULLB_zzx_d 01000100 11 1 ..... 1110.0 ..... ..... @rrx_2a esz=3 +SQDMULLT_zzx_s 01000100 10 1 ..... 1110.1 ..... ..... @rrx_3a esz=2 +SQDMULLT_zzx_d 01000100 11 1 ..... 1110.1 ..... ..... @rrx_2a esz=3 + +# SVE2 saturating multiply high (indexed) +SQDMULH_zzx_h 01000100 0. 1 ..... 111100 ..... ..... @rrx_3 esz=1 +SQDMULH_zzx_s 01000100 10 1 ..... 111100 ..... ..... @rrx_2 esz=2 +SQDMULH_zzx_d 01000100 11 1 ..... 111100 ..... ..... @rrx_1 esz=3 +SQRDMULH_zzx_h 01000100 0. 1 ..... 111101 ..... ..... @rrx_3 esz=1 +SQRDMULH_zzx_s 01000100 10 1 ..... 111101 ..... ..... @rrx_2 esz=2 +SQRDMULH_zzx_d 01000100 11 1 ..... 111101 ..... ..... @rrx_1 esz=3 + +# SVE2 integer multiply (indexed) +MUL_zzx_h 01000100 0. 1 ..... 111110 ..... ..... @rrx_3 esz=1 +MUL_zzx_s 01000100 10 1 ..... 111110 ..... ..... @rrx_2 esz=2 +MUL_zzx_d 01000100 11 1 ..... 111110 ..... ..... @rrx_1 esz=3 + +# SVE floating-point complex add (predicated) +FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \ + rn=%reg_movprfx + +# SVE floating-point complex multiply-add (predicated) +FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE floating-point complex multiply-add (indexed) +FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=1 +FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=2 + +### SVE FP Multiply-Add Indexed Group + +# SVE floating-point multiply-add (indexed) +FMLA_zzxz 01100100 0. 1 ..... 000000 ..... ..... @rrxr_3 esz=1 +FMLA_zzxz 01100100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 +FMLA_zzxz 01100100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +FMLS_zzxz 01100100 0. 1 ..... 000001 ..... ..... @rrxr_3 esz=1 +FMLS_zzxz 01100100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 +FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 + +### SVE FP Multiply Indexed Group + +# SVE floating-point multiply (indexed) +FMUL_zzx 01100100 0. 1 ..... 001000 ..... ..... @rrx_3 esz=1 +FMUL_zzx 01100100 10 1 ..... 001000 ..... ..... @rrx_2 esz=2 +FMUL_zzx 01100100 11 1 ..... 001000 ..... ..... @rrx_1 esz=3 + +### SVE FP Fast Reduction Group + +FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn +FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn +FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn +FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn +FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn + +## SVE Floating Point Unary Operations - Unpredicated Group + +FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn +FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn + +### SVE FP Compare with Zero Group + +FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn +FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn +FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn +FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn +FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn +FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn + +### SVE FP Accumulating Reduction Group + +# SVE floating-point serial reduction (predicated) +FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm + +### SVE Floating Point Arithmetic - Unpredicated Group + +# SVE floating-point arithmetic (unpredicated) +FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... 
@rd_rn_rm
+FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm
+FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm
+FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm
+FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm
+FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm
+
+### SVE FP Arithmetic Predicated Group
+
+# SVE floating-point arithmetic (predicated)
+FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm
+FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR
+FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm
+FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm
+FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm
+FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm
+FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm
+FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm
+FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm
+FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR
+FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm
+
+# SVE floating-point arithmetic with immediate (predicated)
+FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1
+FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1
+FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1
+FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1
+FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1
+FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1
+FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1
+FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1
+
+# SVE floating-point trig multiply-add coefficient
+FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx
+
+### SVE FP Multiply-Add Group
+
+# SVE floating-point multiply-accumulate writing addend
+FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm
+FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm
+FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm
+FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm
+
+# SVE floating-point multiply-accumulate writing multiplicand
+# Alter the operand extraction order and reuse the helpers from above.
+# FMAD, FMSB, FNMAD, FNMSB
+FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra
+FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra
+FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra
+FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra
+
+### SVE FP Unary Operations Predicated Group
+
+# SVE floating-point convert precision
+FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+BFCVT 01100101 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
+
+# SVE floating-point convert to integer
+FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hd 01100101 01 011 11 0 101 ... ..... .....
@rd_pg_rn_e0 +FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 + +# SVE floating-point round to integral value +FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn +FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn +FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn +FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn +FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn +FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn +FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn + +# SVE floating-point unary operations +FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn +FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn + +# SVE integer convert to floating-point +SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 + +UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 + +### SVE Memory - 32-bit Gather and Unsized Contiguous Group + +# SVE load predicate register +LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9 + +# SVE load vector register +LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 + +# SVE load and broadcast element +LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \ + &rpri_load dtype=%dtype_23_13 nreg=0 + +# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets) +# SVE 32-bit gather load (scalar plus 32-bit scaled offsets) +LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=2 msz=0 scale=0 +LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=2 msz=1 +LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=2 msz=2 u=1 + +# SVE 32-bit gather load (vector plus immediate) +LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=2 + +### SVE Memory Contiguous Load Group + +# SVE contiguous load (scalar plus scalar) +LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous first-fault load (scalar plus scalar) +LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous load (scalar plus immediate) +LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 + +# SVE contiguous non-fault load (scalar plus immediate) +LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... 
@rpri_load_dt nreg=0 + +# SVE contiguous non-temporal load (scalar plus scalar) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus scalar) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz + +# SVE contiguous non-temporal load (scalar plus immediate) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus immediate) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz + +# SVE load and broadcast quadword (scalar plus scalar) +LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \ + @rprr_load_msz nreg=0 +LD1RO_zprr 1010010 .. 01 ..... 000 ... ..... ..... \ + @rprr_load_msz nreg=0 + +# SVE load and broadcast quadword (scalar plus immediate) +# LD1RQB, LD1RQH, LD1RQS, LD1RQD +LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \ + @rpri_load_msz nreg=0 +LD1RO_zpri 1010010 .. 01 0.... 001 ... ..... ..... \ + @rpri_load_msz nreg=0 + +# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets) +PRF_ns 1000010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 32-bit gather prefetch (vector plus immediate) +PRF_ns 1000010 -- 00 ----- 111 --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus immediate) +PRF 1000010 11 1- ----- 0-- --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus scalar) +PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ---- + +### SVE Memory 64-bit Gather Group + +# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets) +# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) +LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets) +# SVE 64-bit gather load (scalar plus 64-bit scaled offsets) +LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \ + @rprr_g_load_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \ + @rprr_g_load_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (vector plus immediate) +LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=3 + +# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets) +PRF_ns 1100010 00 11 ----- 1-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets) +PRF_ns 1100010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (vector plus immediate) +PRF_ns 1100010 -- 00 ----- 111 --- ----- 0 ---- + +### SVE Memory Store Group + +# SVE store predicate register +STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9 + +# SVE store vector register +STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9 + +# SVE contiguous store (scalar plus immediate) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \ + @rpri_store_msz nreg=0 + +# SVE contiguous store (scalar plus scalar) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +# Enumerate msz lest we conflict with STR_zri. +ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=0 +ST_zprr 1110010 01 .. ..... 010 ... ..... ..... 
\ + @rprr_store_esz_n0 msz=1 +ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=2 +ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \ + @rprr_store msz=3 esz=3 nreg=0 + +# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0) +# SVE store multiple structures (scalar plus immediate) (nreg != 0) +ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \ + @rpri_store_msz esz=%size_23 + +# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0) +# SVE store multiple structures (scalar plus scalar) (nreg != 0) +ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \ + @rprr_store esz=%size_23 + +# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets) +# Require msz > 0 && msz <= esz. +ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=1 +ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=1 + +# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets) +# Require msz <= esz. +ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=0 +ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=0 + +# SVE 64-bit scatter store (scalar plus 64-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=0 + +# SVE 64-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=3 + +# SVE 32-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=2 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=1 +ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=0 +ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=0 + +#### SVE2 Support + +### SVE2 Integer Multiply - Unpredicated + +# SVE2 integer multiply vectors (unpredicated) +MUL_zzz 00000100 .. 1 ..... 0110 00 ..... ..... @rd_rn_rm +SMULH_zzz 00000100 .. 1 ..... 0110 10 ..... ..... @rd_rn_rm +UMULH_zzz 00000100 .. 1 ..... 0110 11 ..... ..... @rd_rn_rm +PMUL_zzz 00000100 00 1 ..... 0110 01 ..... ..... @rd_rn_rm_e0 + +# SVE2 signed saturating doubling multiply high (unpredicated) +SQDMULH_zzz 00000100 .. 1 ..... 0111 00 ..... ..... @rd_rn_rm +SQRDMULH_zzz 00000100 .. 1 ..... 0111 01 ..... ..... @rd_rn_rm + +### SVE2 Integer - Predicated + +SADALP_zpzz 01000100 .. 000 100 101 ... ..... ..... @rdm_pg_rn +UADALP_zpzz 01000100 .. 000 101 101 ... ..... ..... @rdm_pg_rn + +### SVE2 integer unary operations (predicated) + +URECPE 01000100 .. 000 000 101 ... ..... ..... @rd_pg_rn +URSQRTE 01000100 .. 000 001 101 ... ..... ..... @rd_pg_rn +SQABS 01000100 .. 001 000 101 ... ..... ..... @rd_pg_rn +SQNEG 01000100 .. 001 001 101 ... ..... ..... @rd_pg_rn + +### SVE2 saturating/rounding bitwise shift left (predicated) + +SRSHL 01000100 .. 000 010 100 ... ..... ..... @rdn_pg_rm +URSHL 01000100 .. 000 011 100 ... ..... ..... @rdn_pg_rm +SRSHL 01000100 .. 
000 110 100 ... ..... ..... @rdm_pg_rn # SRSHLR +URSHL 01000100 .. 000 111 100 ... ..... ..... @rdm_pg_rn # URSHLR + +SQSHL 01000100 .. 001 000 100 ... ..... ..... @rdn_pg_rm +UQSHL 01000100 .. 001 001 100 ... ..... ..... @rdn_pg_rm +SQSHL 01000100 .. 001 100 100 ... ..... ..... @rdm_pg_rn # SQSHLR +UQSHL 01000100 .. 001 101 100 ... ..... ..... @rdm_pg_rn # UQSHLR + +SQRSHL 01000100 .. 001 010 100 ... ..... ..... @rdn_pg_rm +UQRSHL 01000100 .. 001 011 100 ... ..... ..... @rdn_pg_rm +SQRSHL 01000100 .. 001 110 100 ... ..... ..... @rdm_pg_rn # SQRSHLR +UQRSHL 01000100 .. 001 111 100 ... ..... ..... @rdm_pg_rn # UQRSHLR + +### SVE2 integer halving add/subtract (predicated) + +SHADD 01000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm +UHADD 01000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm +SHSUB 01000100 .. 010 010 100 ... ..... ..... @rdn_pg_rm +UHSUB 01000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm +SRHADD 01000100 .. 010 100 100 ... ..... ..... @rdn_pg_rm +URHADD 01000100 .. 010 101 100 ... ..... ..... @rdn_pg_rm +SHSUB 01000100 .. 010 110 100 ... ..... ..... @rdm_pg_rn # SHSUBR +UHSUB 01000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # UHSUBR + +### SVE2 integer pairwise arithmetic + +ADDP 01000100 .. 010 001 101 ... ..... ..... @rdn_pg_rm +SMAXP 01000100 .. 010 100 101 ... ..... ..... @rdn_pg_rm +UMAXP 01000100 .. 010 101 101 ... ..... ..... @rdn_pg_rm +SMINP 01000100 .. 010 110 101 ... ..... ..... @rdn_pg_rm +UMINP 01000100 .. 010 111 101 ... ..... ..... @rdn_pg_rm + +### SVE2 saturating add/subtract (predicated) + +SQADD_zpzz 01000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm +UQADD_zpzz 01000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm +SQSUB_zpzz 01000100 .. 011 010 100 ... ..... ..... @rdn_pg_rm +UQSUB_zpzz 01000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm +SUQADD 01000100 .. 011 100 100 ... ..... ..... @rdn_pg_rm +USQADD 01000100 .. 011 101 100 ... ..... ..... @rdn_pg_rm +SQSUB_zpzz 01000100 .. 011 110 100 ... ..... ..... @rdm_pg_rn # SQSUBR +UQSUB_zpzz 01000100 .. 011 111 100 ... ..... ..... @rdm_pg_rn # UQSUBR + +#### SVE2 Widening Integer Arithmetic + +## SVE2 integer add/subtract long + +SADDLB 01000101 .. 0 ..... 00 0000 ..... ..... @rd_rn_rm +SADDLT 01000101 .. 0 ..... 00 0001 ..... ..... @rd_rn_rm +UADDLB 01000101 .. 0 ..... 00 0010 ..... ..... @rd_rn_rm +UADDLT 01000101 .. 0 ..... 00 0011 ..... ..... @rd_rn_rm + +SSUBLB 01000101 .. 0 ..... 00 0100 ..... ..... @rd_rn_rm +SSUBLT 01000101 .. 0 ..... 00 0101 ..... ..... @rd_rn_rm +USUBLB 01000101 .. 0 ..... 00 0110 ..... ..... @rd_rn_rm +USUBLT 01000101 .. 0 ..... 00 0111 ..... ..... @rd_rn_rm + +SABDLB 01000101 .. 0 ..... 00 1100 ..... ..... @rd_rn_rm +SABDLT 01000101 .. 0 ..... 00 1101 ..... ..... @rd_rn_rm +UABDLB 01000101 .. 0 ..... 00 1110 ..... ..... @rd_rn_rm +UABDLT 01000101 .. 0 ..... 00 1111 ..... ..... @rd_rn_rm + +## SVE2 integer add/subtract interleaved long + +SADDLBT 01000101 .. 0 ..... 1000 00 ..... ..... @rd_rn_rm +SSUBLBT 01000101 .. 0 ..... 1000 10 ..... ..... @rd_rn_rm +SSUBLTB 01000101 .. 0 ..... 1000 11 ..... ..... @rd_rn_rm + +## SVE2 integer add/subtract wide + +SADDWB 01000101 .. 0 ..... 010 000 ..... ..... @rd_rn_rm +SADDWT 01000101 .. 0 ..... 010 001 ..... ..... @rd_rn_rm +UADDWB 01000101 .. 0 ..... 010 010 ..... ..... @rd_rn_rm +UADDWT 01000101 .. 0 ..... 010 011 ..... ..... @rd_rn_rm + +SSUBWB 01000101 .. 0 ..... 010 100 ..... ..... @rd_rn_rm +SSUBWT 01000101 .. 0 ..... 010 101 ..... ..... @rd_rn_rm +USUBWB 01000101 .. 0 ..... 010 110 ..... ..... @rd_rn_rm +USUBWT 01000101 .. 0 ..... 
010 111 ..... ..... @rd_rn_rm + +## SVE2 integer multiply long + +SQDMULLB_zzz 01000101 .. 0 ..... 011 000 ..... ..... @rd_rn_rm +SQDMULLT_zzz 01000101 .. 0 ..... 011 001 ..... ..... @rd_rn_rm +PMULLB 01000101 .. 0 ..... 011 010 ..... ..... @rd_rn_rm +PMULLT 01000101 .. 0 ..... 011 011 ..... ..... @rd_rn_rm +SMULLB_zzz 01000101 .. 0 ..... 011 100 ..... ..... @rd_rn_rm +SMULLT_zzz 01000101 .. 0 ..... 011 101 ..... ..... @rd_rn_rm +UMULLB_zzz 01000101 .. 0 ..... 011 110 ..... ..... @rd_rn_rm +UMULLT_zzz 01000101 .. 0 ..... 011 111 ..... ..... @rd_rn_rm + +## SVE2 bitwise shift left long + +# Note bit23 == 0 is handled by esz > 0 in do_sve2_shll_tb. +SSHLLB 01000101 .. 0 ..... 1010 00 ..... ..... @rd_rn_tszimm_shl +SSHLLT 01000101 .. 0 ..... 1010 01 ..... ..... @rd_rn_tszimm_shl +USHLLB 01000101 .. 0 ..... 1010 10 ..... ..... @rd_rn_tszimm_shl +USHLLT 01000101 .. 0 ..... 1010 11 ..... ..... @rd_rn_tszimm_shl + +## SVE2 bitwise exclusive-or interleaved + +EORBT 01000101 .. 0 ..... 10010 0 ..... ..... @rd_rn_rm +EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm + +## SVE integer matrix multiply accumulate + +SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 +USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 +UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 + +## SVE2 bitwise permute + +BEXT 01000101 .. 0 ..... 1011 00 ..... ..... @rd_rn_rm +BDEP 01000101 .. 0 ..... 1011 01 ..... ..... @rd_rn_rm +BGRP 01000101 .. 0 ..... 1011 10 ..... ..... @rd_rn_rm + +#### SVE2 Accumulate + +## SVE2 complex integer add + +CADD_rot90 01000101 .. 00000 0 11011 0 ..... ..... @rdn_rm +CADD_rot270 01000101 .. 00000 0 11011 1 ..... ..... @rdn_rm +SQCADD_rot90 01000101 .. 00000 1 11011 0 ..... ..... @rdn_rm +SQCADD_rot270 01000101 .. 00000 1 11011 1 ..... ..... @rdn_rm + +## SVE2 integer absolute difference and accumulate long + +SABALB 01000101 .. 0 ..... 1100 00 ..... ..... @rda_rn_rm +SABALT 01000101 .. 0 ..... 1100 01 ..... ..... @rda_rn_rm +UABALB 01000101 .. 0 ..... 1100 10 ..... ..... @rda_rn_rm +UABALT 01000101 .. 0 ..... 1100 11 ..... ..... @rda_rn_rm + +## SVE2 integer add/subtract long with carry + +# ADC and SBC decoded via size in helper dispatch. +ADCLB 01000101 .. 0 ..... 11010 0 ..... ..... @rda_rn_rm +ADCLT 01000101 .. 0 ..... 11010 1 ..... ..... @rda_rn_rm + +## SVE2 bitwise shift right and accumulate + +# TODO: Use @rda and %reg_movprfx here. +SSRA 01000101 .. 0 ..... 1110 00 ..... ..... @rd_rn_tszimm_shr +USRA 01000101 .. 0 ..... 1110 01 ..... ..... @rd_rn_tszimm_shr +SRSRA 01000101 .. 0 ..... 1110 10 ..... ..... @rd_rn_tszimm_shr +URSRA 01000101 .. 0 ..... 1110 11 ..... ..... @rd_rn_tszimm_shr + +## SVE2 bitwise shift and insert + +SRI 01000101 .. 0 ..... 11110 0 ..... ..... @rd_rn_tszimm_shr +SLI 01000101 .. 0 ..... 11110 1 ..... ..... @rd_rn_tszimm_shl + +## SVE2 integer absolute difference and accumulate + +# TODO: Use @rda and %reg_movprfx here. +SABA 01000101 .. 0 ..... 11111 0 ..... ..... @rd_rn_rm +UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm + +#### SVE2 Narrowing + +## SVE2 saturating extract narrow + +# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0. +SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl +SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl +UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl +UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl +SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl +SQXTUNT 01000101 .. 1 ..... 
010 101 ..... ..... @rd_rn_tszimm_shl + +## SVE2 bitwise shift right narrow + +# Bit 23 == 0 is handled by esz > 0 in the translator. +SQSHRUNB 01000101 .. 1 ..... 00 0000 ..... ..... @rd_rn_tszimm_shr +SQSHRUNT 01000101 .. 1 ..... 00 0001 ..... ..... @rd_rn_tszimm_shr +SQRSHRUNB 01000101 .. 1 ..... 00 0010 ..... ..... @rd_rn_tszimm_shr +SQRSHRUNT 01000101 .. 1 ..... 00 0011 ..... ..... @rd_rn_tszimm_shr +SHRNB 01000101 .. 1 ..... 00 0100 ..... ..... @rd_rn_tszimm_shr +SHRNT 01000101 .. 1 ..... 00 0101 ..... ..... @rd_rn_tszimm_shr +RSHRNB 01000101 .. 1 ..... 00 0110 ..... ..... @rd_rn_tszimm_shr +RSHRNT 01000101 .. 1 ..... 00 0111 ..... ..... @rd_rn_tszimm_shr +SQSHRNB 01000101 .. 1 ..... 00 1000 ..... ..... @rd_rn_tszimm_shr +SQSHRNT 01000101 .. 1 ..... 00 1001 ..... ..... @rd_rn_tszimm_shr +SQRSHRNB 01000101 .. 1 ..... 00 1010 ..... ..... @rd_rn_tszimm_shr +SQRSHRNT 01000101 .. 1 ..... 00 1011 ..... ..... @rd_rn_tszimm_shr +UQSHRNB 01000101 .. 1 ..... 00 1100 ..... ..... @rd_rn_tszimm_shr +UQSHRNT 01000101 .. 1 ..... 00 1101 ..... ..... @rd_rn_tszimm_shr +UQRSHRNB 01000101 .. 1 ..... 00 1110 ..... ..... @rd_rn_tszimm_shr +UQRSHRNT 01000101 .. 1 ..... 00 1111 ..... ..... @rd_rn_tszimm_shr + +## SVE2 integer add/subtract narrow high part + +ADDHNB 01000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm +ADDHNT 01000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm +RADDHNB 01000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm +RADDHNT 01000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm +SUBHNB 01000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm +SUBHNT 01000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm +RSUBHNB 01000101 .. 1 ..... 011 110 ..... ..... @rd_rn_rm +RSUBHNT 01000101 .. 1 ..... 011 111 ..... ..... @rd_rn_rm + +### SVE2 Character Match + +MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm +NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm + +### SVE2 Histogram Computation + +HISTCNT 01000101 .. 1 ..... 110 ... ..... ..... @rd_pg_rn_rm +HISTSEG 01000101 .. 1 ..... 101 000 ..... ..... @rd_rn_rm + +## SVE2 floating-point pairwise operations + +FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm +FMAXNMP 01100100 .. 010 10 0 100 ... ..... ..... @rdn_pg_rm +FMINNMP 01100100 .. 010 10 1 100 ... ..... ..... @rdn_pg_rm +FMAXP 01100100 .. 010 11 0 100 ... ..... ..... @rdn_pg_rm +FMINP 01100100 .. 010 11 1 100 ... ..... ..... @rdn_pg_rm + +#### SVE Integer Multiply-Add (unpredicated) + +## SVE2 saturating multiply-add long + +SQDMLALB_zzzw 01000100 .. 0 ..... 0110 00 ..... ..... @rda_rn_rm +SQDMLALT_zzzw 01000100 .. 0 ..... 0110 01 ..... ..... @rda_rn_rm +SQDMLSLB_zzzw 01000100 .. 0 ..... 0110 10 ..... ..... @rda_rn_rm +SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm + +## SVE2 saturating multiply-add interleaved long + +SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm +SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm + +## SVE2 saturating multiply-add high + +SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm +SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm + +## SVE2 integer multiply-add long + +SMLALB_zzzw 01000100 .. 0 ..... 010 000 ..... ..... @rda_rn_rm +SMLALT_zzzw 01000100 .. 0 ..... 010 001 ..... ..... @rda_rn_rm +UMLALB_zzzw 01000100 .. 0 ..... 010 010 ..... ..... @rda_rn_rm +UMLALT_zzzw 01000100 .. 0 ..... 010 011 ..... ..... @rda_rn_rm +SMLSLB_zzzw 01000100 .. 0 ..... 010 100 ..... ..... @rda_rn_rm +SMLSLT_zzzw 01000100 .. 0 ..... 010 101 ..... ..... @rda_rn_rm +UMLSLB_zzzw 01000100 .. 0 ..... 
010 110 ..... ..... @rda_rn_rm +UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm + +## SVE2 complex integer multiply-add + +CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx +SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx + +## SVE mixed sign dot product + +USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm + +### SVE2 floating point matrix multiply accumulate +BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 +FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 +FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 + +### SVE2 Memory Gather Load Group + +# SVE2 64-bit gather non-temporal load (scalar plus 64-bit unscaled offsets) +LDNT1_zprz 1100010 msz:2 00 rm:5 1 u:1 0 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 esz=3 scale=0 ff=0 + +# SVE2 32-bit gather non-temporal load (scalar plus 32-bit unscaled offsets) +LDNT1_zprz 1000010 msz:2 00 rm:5 10 u:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=0 esz=2 scale=0 ff=0 + +### SVE2 Memory Store Group + +# SVE2 64-bit scatter non-temporal store (vector plus scalar) +STNT1_zprz 1110010 .. 00 ..... 001 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=0 + +# SVE2 32-bit scatter non-temporal store (vector plus scalar) +STNT1_zprz 1110010 .. 10 ..... 001 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=0 + +### SVE2 Crypto Extensions + +# SVE2 crypto unary operations +# AESMC and AESIMC +AESMC 01000101 00 10000011100 decrypt:1 00000 rd:5 + +# SVE2 crypto destructive binary operations +AESE 01000101 00 10001 0 11100 0 ..... ..... @rdn_rm_e0 +AESD 01000101 00 10001 0 11100 1 ..... ..... @rdn_rm_e0 +SM4E 01000101 00 10001 1 11100 0 ..... ..... @rdn_rm_e0 + +# SVE2 crypto constructive binary operations +SM4EKEY 01000101 00 1 ..... 11110 0 ..... ..... @rd_rn_rm_e0 +RAX1 01000101 00 1 ..... 11110 1 ..... ..... @rd_rn_rm_e0 + +### SVE2 floating-point convert precision odd elements +FCVTXNT_ds 01100100 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTX_ds 01100101 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTNT_sh 01100100 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +BFCVTNT 01100100 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTLT_hs 01100100 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +FCVTNT_ds 01100100 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 + +### SVE2 floating-point convert to integer +FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz + +### SVE2 floating-point multiply-add long (vectors) +FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 +FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 +FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0 +FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0 + +BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 +BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 + +### SVE2 floating-point bfloat16 dot-product +BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 + +### SVE2 floating-point multiply-add long (indexed) +FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 +FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2 +FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2 +FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2 +BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 +BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... 
@rrxr_3a esz=2 + +### SVE2 floating-point bfloat16 dot-product (indexed) +BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2 + +### SVE broadcast predicate element + +&psel esz pd pn pm rv imm +%psel_rv 16:2 !function=plus_12 +%psel_imm_b 22:2 19:2 +%psel_imm_h 22:2 20:1 +%psel_imm_s 22:2 +%psel_imm_d 23:1 +@psel ........ .. . ... .. .. pn:4 . pm:4 . pd:4 \ + &psel rv=%psel_rv + +PSEL 00100101 .. 1 ..1 .. 01 .... 0 .... 0 .... \ + @psel esz=0 imm=%psel_imm_b +PSEL 00100101 .. 1 .10 .. 01 .... 0 .... 0 .... \ + @psel esz=1 imm=%psel_imm_h +PSEL 00100101 .. 1 100 .. 01 .... 0 .... 0 .... \ + @psel esz=2 imm=%psel_imm_s +PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \ + @psel esz=3 imm=%psel_imm_d + +### SVE clamp + +SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm +UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm diff --git a/target/arm/tcg/t16.decode b/target/arm/tcg/t16.decode new file mode 100644 index 0000000..646c749 --- /dev/null +++ b/target/arm/tcg/t16.decode @@ -0,0 +1,281 @@ +# Thumb1 instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# + +&empty !extern +&s_rrr_shi !extern s rd rn rm shim shty +&s_rrr_shr !extern s rn rd rm rs shty +&s_rri_rot !extern s rn rd imm rot +&s_rrrr !extern s rd rn rm ra +&rrr_rot !extern rd rn rm rot +&rr !extern rd rm +&ri !extern rd imm +&r !extern rm +&i !extern imm +&ldst_rr !extern p w u rn rt rm shimm shtype +&ldst_ri !extern p w u rn rt imm +&ldst_block !extern rn i b u w list +&setend !extern E +&cps !extern mode imod M A I F +&ci !extern cond imm + +# Set S if the instruction is outside of an IT block. +%s !function=t16_setflags + +# Data-processing (two low registers) + +%reg_0 0:3 + +@lll_noshr ...... .... rm:3 rd:3 \ + &s_rrr_shi %s rn=%reg_0 shim=0 shty=0 +@xll_noshr ...... .... rm:3 rn:3 \ + &s_rrr_shi s=1 rd=0 shim=0 shty=0 +@lxl_shr ...... .... rs:3 rd:3 \ + &s_rrr_shr %s rm=%reg_0 rn=0 + +AND_rrri 010000 0000 ... ... @lll_noshr +EOR_rrri 010000 0001 ... ... @lll_noshr +MOV_rxrr 010000 0010 ... ... @lxl_shr shty=0 # LSL +MOV_rxrr 010000 0011 ... ... @lxl_shr shty=1 # LSR +MOV_rxrr 010000 0100 ... ... @lxl_shr shty=2 # ASR +ADC_rrri 010000 0101 ... ... @lll_noshr +SBC_rrri 010000 0110 ... ... @lll_noshr +MOV_rxrr 010000 0111 ... ... @lxl_shr shty=3 # ROR +TST_xrri 010000 1000 ... ... @xll_noshr +RSB_rri 010000 1001 rn:3 rd:3 &s_rri_rot %s imm=0 rot=0 +CMP_xrri 010000 1010 ... ... @xll_noshr +CMN_xrri 010000 1011 ... ... @xll_noshr +ORR_rrri 010000 1100 ... ... @lll_noshr +MUL 010000 1101 rn:3 rd:3 &s_rrrr %s rm=%reg_0 ra=0 +BIC_rrri 010000 1110 ... ... @lll_noshr +MVN_rxri 010000 1111 ... ... @lll_noshr + +# Load/store (register offset) + +@ldst_rr ....... rm:3 rn:3 rt:3 \ + &ldst_rr p=1 w=0 u=1 shimm=0 shtype=0 + +STR_rr 0101 000 ... ... ... @ldst_rr +STRH_rr 0101 001 ... ... ... 
@ldst_rr +STRB_rr 0101 010 ... ... ... @ldst_rr +LDRSB_rr 0101 011 ... ... ... @ldst_rr +LDR_rr 0101 100 ... ... ... @ldst_rr +LDRH_rr 0101 101 ... ... ... @ldst_rr +LDRB_rr 0101 110 ... ... ... @ldst_rr +LDRSH_rr 0101 111 ... ... ... @ldst_rr + +# Load/store word/byte (immediate offset) + +%imm5_6x4 6:5 !function=times_4 + +@ldst_ri_1 ..... imm:5 rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_4 ..... ..... rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 imm=%imm5_6x4 + +STR_ri 01100 ..... ... ... @ldst_ri_4 +LDR_ri 01101 ..... ... ... @ldst_ri_4 +STRB_ri 01110 ..... ... ... @ldst_ri_1 +LDRB_ri 01111 ..... ... ... @ldst_ri_1 + +# Load/store halfword (immediate offset) + +%imm5_6x2 6:5 !function=times_2 +@ldst_ri_2 ..... ..... rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 imm=%imm5_6x2 + +STRH_ri 10000 ..... ... ... @ldst_ri_2 +LDRH_ri 10001 ..... ... ... @ldst_ri_2 + +# Load/store (SP-relative) + +%imm8_0x4 0:8 !function=times_4 +@ldst_spec_i ..... rt:3 ........ \ + &ldst_ri p=1 w=0 u=1 imm=%imm8_0x4 + +STR_ri 10010 ... ........ @ldst_spec_i rn=13 +LDR_ri 10011 ... ........ @ldst_spec_i rn=13 + +# Load (PC-relative) + +LDR_ri 01001 ... ........ @ldst_spec_i rn=15 + +# Add PC/SP (immediate) + +ADR 10100 rd:3 ........ imm=%imm8_0x4 +ADD_rri 10101 rd:3 ........ \ + &s_rri_rot rn=13 s=0 rot=0 imm=%imm8_0x4 # SP + +# Load/store multiple + +@ldstm ..... rn:3 list:8 &ldst_block i=1 b=0 u=0 w=1 + +STM 11000 ... ........ @ldstm +LDM_t16 11001 ... ........ @ldstm + +# Shift (immediate) + +@shift_i ..... shim:5 rm:3 rd:3 &s_rrr_shi %s rn=%reg_0 + +MOV_rxri 000 00 ..... ... ... @shift_i shty=0 # LSL +MOV_rxri 000 01 ..... ... ... @shift_i shty=1 # LSR +MOV_rxri 000 10 ..... ... ... @shift_i shty=2 # ASR + +# Add/subtract (three low registers) + +@addsub_3 ....... rm:3 rn:3 rd:3 \ + &s_rrr_shi %s shim=0 shty=0 + +ADD_rrri 0001100 ... ... ... @addsub_3 +SUB_rrri 0001101 ... ... ... @addsub_3 + +# Add/subtract (two low registers and immediate) + +@addsub_2i ....... imm:3 rn:3 rd:3 \ + &s_rri_rot %s rot=0 + +ADD_rri 0001 110 ... ... ... @addsub_2i +SUB_rri 0001 111 ... ... ... @addsub_2i + +# Add, subtract, compare, move (one low register and immediate) + +%reg_8 8:3 +@arith_1i ..... rd:3 imm:8 \ + &s_rri_rot rot=0 rn=%reg_8 + +MOV_rxi 00100 ... ........ @arith_1i %s +CMP_xri 00101 ... ........ @arith_1i s=1 +ADD_rri 00110 ... ........ @arith_1i %s +SUB_rri 00111 ... ........ @arith_1i %s + +# Add, compare, move (two high registers) + +%reg_0_7 7:1 0:3 +@addsub_2h .... .... . rm:4 ... \ + &s_rrr_shi rd=%reg_0_7 rn=%reg_0_7 shim=0 shty=0 + +ADD_rrri 0100 0100 . .... ... @addsub_2h s=0 +CMP_xrri 0100 0101 . .... ... @addsub_2h s=1 +MOV_rxri 0100 0110 . .... ... @addsub_2h s=0 + +# Adjust SP (immediate) + +%imm7_0x4 0:7 !function=times_4 +@addsub_sp_i .... .... . ....... \ + &s_rri_rot s=0 rd=13 rn=13 rot=0 imm=%imm7_0x4 + +ADD_rri 1011 0000 0 ....... @addsub_sp_i +SUB_rri 1011 0000 1 ....... @addsub_sp_i + +# Branch and exchange + +@branchr .... .... . rm:4 ... &r + +BX 0100 0111 0 .... 000 @branchr +BLX_r 0100 0111 1 .... 000 @branchr +BXNS 0100 0111 0 .... 100 @branchr +BLXNS 0100 0111 1 .... 100 @branchr + +# Extend + +@extend .... .... .. rm:3 rd:3 &rrr_rot rn=15 rot=0 + +SXTAH 1011 0010 00 ... ... @extend +SXTAB 1011 0010 01 ... ... @extend +UXTAH 1011 0010 10 ... ... @extend +UXTAB 1011 0010 11 ... ... @extend + +# Change processor state + +%imod 4:1 !function=plus_2 + +SETEND 1011 0110 010 1 E:1 000 &setend +{ + CPS 1011 0110 011 . 
0 A:1 I:1 F:1 &cps mode=0 M=0 %imod + CPS_v7m 1011 0110 011 im:1 00 I:1 F:1 +} + +# Reverse bytes + +@rdm .... .... .. rm:3 rd:3 &rr + +REV 1011 1010 00 ... ... @rdm +REV16 1011 1010 01 ... ... @rdm +REVSH 1011 1010 11 ... ... @rdm + +# Hints + +{ + { + YIELD 1011 1111 0001 0000 + WFE 1011 1111 0010 0000 + WFI 1011 1111 0011 0000 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV 1011 1111 0100 0000 + # SEVL 1011 1111 0101 0000 + + # The canonical nop has the second nibble as 0000, but the whole of the + # rest of the space is a reserved hint, behaves as nop. + NOP 1011 1111 ---- 0000 + } + IT 1011 1111 cond_mask:8 +} + +# Miscellaneous 16-bit instructions + +%imm6_9_3 9:1 3:5 !function=times_2 + +HLT 1011 1010 10 imm:6 &i +BKPT 1011 1110 imm:8 &i +CBZ 1011 nz:1 0.1 ..... rn:3 imm=%imm6_9_3 + +# Push and Pop + +%push_list 0:9 !function=t16_push_list +%pop_list 0:9 !function=t16_pop_list + +STM 1011 010 ......... \ + &ldst_block i=0 b=1 u=0 w=1 rn=13 list=%push_list +LDM_t16 1011 110 ......... \ + &ldst_block i=1 b=0 u=0 w=1 rn=13 list=%pop_list + +# Conditional branches, Supervisor call + +%imm8_0x2 0:s8 !function=times_2 + +{ + UDF 1101 1110 ---- ---- + SVC 1101 1111 imm:8 &i + B_cond_thumb 1101 cond:4 ........ &ci imm=%imm8_0x2 +} + +# Unconditional Branch + +%imm11_0x2 0:s11 !function=times_2 + +B 11100 ........... &i imm=%imm11_0x2 + +# thumb_insn_is_16bit() ensures we won't be decoding these as +# T16 instructions for a Thumb2 CPU, so these patterns must be +# a Thumb1 split BL/BLX. +BLX_suffix 11101 imm:11 &i +BL_BLX_prefix 11110 imm:s11 &i +BL_suffix 11111 imm:11 &i diff --git a/target/arm/tcg/t32.decode b/target/arm/tcg/t32.decode new file mode 100644 index 0000000..f21ad01 --- /dev/null +++ b/target/arm/tcg/t32.decode @@ -0,0 +1,753 @@ +# Thumb2 instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. 
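+
+# A note on the decodetree pattern syntax used in this file (FOO below
+# is a made-up mnemonic, shown purely as an illustration): a line such as
+#
+#   FOO           1111 0101 0000 rn:4 rd:4 .... 0000 rm:4
+#
+# matches a word with exactly those fixed bits, treats '.' bits as
+# don't-care for the match, and extracts the named fields; &name and
+# @name references attach shared argument sets and field layouts.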
+ +# +# This file is processed by scripts/decodetree.py +# + +&empty !extern +&s_rrr_shi !extern s rd rn rm shim shty +&s_rrr_shr !extern s rn rd rm rs shty +&s_rri_rot !extern s rn rd imm rot +&s_rrrr !extern s rd rn rm ra +&rrrr !extern rd rn rm ra +&rrr_rot !extern rd rn rm rot +&rrr !extern rd rn rm +&rr !extern rd rm +&ri !extern rd imm +&r !extern rm +&i !extern imm +&msr_reg !extern rn r mask +&mrs_reg !extern rd r +&msr_bank !extern rn r sysm +&mrs_bank !extern rd r sysm +&ldst_rr !extern p w u rn rt rm shimm shtype +&ldst_ri !extern p w u rn rt imm +&ldst_block !extern rn i b u w list +&strex !extern rn rd rt rt2 imm +&ldrex !extern rn rt rt2 imm +&bfx !extern rd rn lsb widthm1 +&bfi !extern rd rn lsb msb +&sat !extern rd rn satimm imm sh +&pkh !extern rd rn rm imm tb +&cps !extern mode imod M A I F +&mcr !extern cp opc1 crn crm opc2 rt +&mcrr !extern cp opc1 crm rt rt2 + +&mve_shl_ri rdalo rdahi shim +&mve_shl_rr rdalo rdahi rm +&mve_sh_ri rda shim +&mve_sh_rr rda rm + +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi_9 9:3 !function=times_2_plus_1 +%rdalo_17 17:3 !function=times_2 + +# Data-processing (register) + +%imm5_12_6 12:3 6:2 + +@s_rrr_shi ....... .... s:1 rn:4 .... rd:4 .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 +@s_rxr_shi ....... .... s:1 .... .... rd:4 .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 rn=0 +@S_xrr_shi ....... .... . rn:4 .... .... .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 s=1 rd=0 + +@mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \ + &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9 +@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \ + &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9 +@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \ + &mve_sh_ri shim=%imm5_12_6 +@mve_sh_rr ....... .... . rda:4 rm:4 .... .... .... &mve_sh_rr + +{ + TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi + AND_rrri 1110101 0000 . .... 0 ... .... .... .... @s_rrr_shi +} +BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi +{ + # The v8.1M MVE shift insns overlap in encoding with MOVS/ORRS + # and are distinguished by having Rm==13 or 15. Those are UNPREDICTABLE + # cases for MOVS/ORRS. We decode the MVE cases first, ensuring that + # they explicitly call unallocated_encoding() for cases that must UNDEF + # (eg "using a new shift insn on a v8.1M CPU without MVE"), and letting + # the rest fall through (where ORR_rrri and MOV_rxri will end up + # handling them as r13 and r15 accesses with the same semantics as A32). + [ + { + UQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 00 1111 @mve_sh_ri + LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri + UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri + } + + { + URSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 01 1111 @mve_sh_ri + LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri + URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri + } + + { + SRSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 10 1111 @mve_sh_ri + ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri + SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri + } + + { + SQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 11 1111 @mve_sh_ri + SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri + } + + { + UQRSHL_rr 1110101 0010 1 .... .... 1111 0000 1101 @mve_sh_rr + LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr + UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 
1 0000 1101 @mve_shl_rr + } + + { + SQRSHR_rr 1110101 0010 1 .... .... 1111 0010 1101 @mve_sh_rr + ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr + SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr + } + + UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr + SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr + ] + + MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi + ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi + + # v8.1M CSEL and friends + CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4 +} +{ + MVN_rxri 1110101 0011 . 1111 0 ... .... .... .... @s_rxr_shi + ORN_rrri 1110101 0011 . .... 0 ... .... .... .... @s_rrr_shi +} +{ + TEQ_xrri 1110101 0100 1 .... 0 ... 1111 .... .... @S_xrr_shi + EOR_rrri 1110101 0100 . .... 0 ... .... .... .... @s_rrr_shi +} +PKH 1110101 0110 0 rn:4 0 ... rd:4 .. tb:1 0 rm:4 \ + &pkh imm=%imm5_12_6 +{ + CMN_xrri 1110101 1000 1 .... 0 ... 1111 .... .... @S_xrr_shi + ADD_rrri 1110101 1000 . .... 0 ... .... .... .... @s_rrr_shi +} +ADC_rrri 1110101 1010 . .... 0 ... .... .... .... @s_rrr_shi +SBC_rrri 1110101 1011 . .... 0 ... .... .... .... @s_rrr_shi +{ + CMP_xrri 1110101 1101 1 .... 0 ... 1111 .... .... @S_xrr_shi + SUB_rrri 1110101 1101 . .... 0 ... .... .... .... @s_rrr_shi +} +RSB_rrri 1110101 1110 . .... 0 ... .... .... .... @s_rrr_shi + +# Data-processing (register-shifted register) + +MOV_rxrr 1111 1010 0 shty:2 s:1 rm:4 1111 rd:4 0000 rs:4 \ + &s_rrr_shr rn=0 + +# Data-processing (immediate) + +%t32extrot 26:1 12:3 0:8 !function=t32_expandimm_rot +%t32extimm 26:1 12:3 0:8 !function=t32_expandimm_imm + +@s_rri_rot ....... .... s:1 rn:4 . ... rd:4 ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot +@s_rxi_rot ....... .... s:1 .... . ... rd:4 ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot rn=0 +@S_xri_rot ....... .... . rn:4 . ... .... ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot s=1 rd=0 + +{ + TST_xri 1111 0.0 0000 1 .... 0 ... 1111 ........ @S_xri_rot + AND_rri 1111 0.0 0000 . .... 0 ... .... ........ @s_rri_rot +} +BIC_rri 1111 0.0 0001 . .... 0 ... .... ........ @s_rri_rot +{ + MOV_rxi 1111 0.0 0010 . 1111 0 ... .... ........ @s_rxi_rot + ORR_rri 1111 0.0 0010 . .... 0 ... .... ........ @s_rri_rot +} +{ + MVN_rxi 1111 0.0 0011 . 1111 0 ... .... ........ @s_rxi_rot + ORN_rri 1111 0.0 0011 . .... 0 ... .... ........ @s_rri_rot +} +{ + TEQ_xri 1111 0.0 0100 1 .... 0 ... 1111 ........ @S_xri_rot + EOR_rri 1111 0.0 0100 . .... 0 ... .... ........ @s_rri_rot +} +{ + CMN_xri 1111 0.0 1000 1 .... 0 ... 1111 ........ @S_xri_rot + ADD_rri 1111 0.0 1000 . .... 0 ... .... ........ @s_rri_rot +} +ADC_rri 1111 0.0 1010 . .... 0 ... .... ........ @s_rri_rot +SBC_rri 1111 0.0 1011 . .... 0 ... .... ........ @s_rri_rot +{ + CMP_xri 1111 0.0 1101 1 .... 0 ... 1111 ........ @S_xri_rot + SUB_rri 1111 0.0 1101 . .... 0 ... .... ........ @s_rri_rot +} +RSB_rri 1111 0.0 1110 . .... 0 ... .... ........ @s_rri_rot + +# Data processing (plain binary immediate) + +%imm12_26_12_0 26:1 12:3 0:8 +%neg12_26_12_0 26:1 12:3 0:8 !function=negate +@s0_rri_12 .... ... .... . rn:4 . ... rd:4 ........ \ + &s_rri_rot imm=%imm12_26_12_0 rot=0 s=0 + +{ + ADR 1111 0.1 0000 0 1111 0 ... rd:4 ........ \ + &ri imm=%imm12_26_12_0 + ADD_rri 1111 0.1 0000 0 .... 0 ... .... ........ @s0_rri_12 +} +{ + ADR 1111 0.1 0101 0 1111 0 ... rd:4 ........ \ + &ri imm=%neg12_26_12_0 + SUB_rri 1111 0.1 0101 0 .... 0 ... .... ........ 
@s0_rri_12 +} + +# Move Wide + +%imm16_26_16_12_0 16:4 26:1 12:3 0:8 +@mov16 .... .... .... .... .... rd:4 .... .... \ + &ri imm=%imm16_26_16_12_0 + +MOVW 1111 0.10 0100 .... 0 ... .... ........ @mov16 +MOVT 1111 0.10 1100 .... 0 ... .... ........ @mov16 + +# Saturate, bitfield + +@sat .... .... .. sh:1 . rn:4 . ... rd:4 .. . satimm:5 \ + &sat imm=%imm5_12_6 +@sat16 .... .... .. . . rn:4 . ... rd:4 .. . satimm:5 \ + &sat sh=0 imm=0 + +{ + SSAT16 1111 0011 001 0 .... 0 000 .... 00 0 ..... @sat16 + SSAT 1111 0011 00. 0 .... 0 ... .... .. 0 ..... @sat +} +{ + USAT16 1111 0011 101 0 .... 0 000 .... 00 0 ..... @sat16 + USAT 1111 0011 10. 0 .... 0 ... .... .. 0 ..... @sat +} + +@bfx .... .... ... . rn:4 . ... rd:4 .. . widthm1:5 \ + &bfx lsb=%imm5_12_6 +@bfi .... .... ... . rn:4 . ... rd:4 .. . msb:5 \ + &bfi lsb=%imm5_12_6 + +SBFX 1111 0011 010 0 .... 0 ... .... ..0..... @bfx +UBFX 1111 0011 110 0 .... 0 ... .... ..0..... @bfx + +# bfc is bfi w/ rn=15 +BFCI 1111 0011 011 0 .... 0 ... .... ..0..... @bfi + +# Multiply and multiply accumulate + +@s0_rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &s_rrrr s=0 +@s0_rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &s_rrrr ra=0 s=0 +@rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &rrrr +@rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &rrrr ra=0 +@rndm .... .... .... rn:4 .... rd:4 .... rm:4 &rrr +@rdm .... .... .... .... .... rd:4 .... rm:4 &rr + +{ + MUL 1111 1011 0000 .... 1111 .... 0000 .... @s0_rn0dm + MLA 1111 1011 0000 .... .... .... 0000 .... @s0_rnadm +} +MLS 1111 1011 0000 .... .... .... 0001 .... @rnadm +SMULL 1111 1011 1000 .... .... .... 0000 .... @s0_rnadm +UMULL 1111 1011 1010 .... .... .... 0000 .... @s0_rnadm +SMLAL 1111 1011 1100 .... .... .... 0000 .... @s0_rnadm +UMLAL 1111 1011 1110 .... .... .... 0000 .... @s0_rnadm +UMAAL 1111 1011 1110 .... .... .... 0110 .... @rnadm +{ + SMULWB 1111 1011 0011 .... 1111 .... 0000 .... @rn0dm + SMLAWB 1111 1011 0011 .... .... .... 0000 .... @rnadm +} +{ + SMULWT 1111 1011 0011 .... 1111 .... 0001 .... @rn0dm + SMLAWT 1111 1011 0011 .... .... .... 0001 .... @rnadm +} +{ + SMULBB 1111 1011 0001 .... 1111 .... 0000 .... @rn0dm + SMLABB 1111 1011 0001 .... .... .... 0000 .... @rnadm +} +{ + SMULBT 1111 1011 0001 .... 1111 .... 0001 .... @rn0dm + SMLABT 1111 1011 0001 .... .... .... 0001 .... @rnadm +} +{ + SMULTB 1111 1011 0001 .... 1111 .... 0010 .... @rn0dm + SMLATB 1111 1011 0001 .... .... .... 0010 .... @rnadm +} +{ + SMULTT 1111 1011 0001 .... 1111 .... 0011 .... @rn0dm + SMLATT 1111 1011 0001 .... .... .... 0011 .... @rnadm +} +SMLALBB 1111 1011 1100 .... .... .... 1000 .... @rnadm +SMLALBT 1111 1011 1100 .... .... .... 1001 .... @rnadm +SMLALTB 1111 1011 1100 .... .... .... 1010 .... @rnadm +SMLALTT 1111 1011 1100 .... .... .... 1011 .... @rnadm + +# usad8 is usada8 w/ ra=15 +USADA8 1111 1011 0111 .... .... .... 0000 .... @rnadm + +SMLAD 1111 1011 0010 .... .... .... 0000 .... @rnadm +SMLADX 1111 1011 0010 .... .... .... 0001 .... @rnadm +SMLSD 1111 1011 0100 .... .... .... 0000 .... @rnadm +SMLSDX 1111 1011 0100 .... .... .... 0001 .... @rnadm + +SMLALD 1111 1011 1100 .... .... .... 1100 .... @rnadm +SMLALDX 1111 1011 1100 .... .... .... 1101 .... @rnadm +SMLSLD 1111 1011 1101 .... .... .... 1100 .... @rnadm +SMLSLDX 1111 1011 1101 .... .... .... 1101 .... @rnadm + +SMMLA 1111 1011 0101 .... .... .... 0000 .... @rnadm +SMMLAR 1111 1011 0101 .... .... .... 0001 .... @rnadm +SMMLS 1111 1011 0110 .... .... .... 0000 .... @rnadm +SMMLSR 1111 1011 0110 .... .... .... 0001 .... 
@rnadm + +SDIV 1111 1011 1001 .... 1111 .... 1111 .... @rndm +UDIV 1111 1011 1011 .... 1111 .... 1111 .... @rndm + +# Data-processing (two source registers) + +QADD 1111 1010 1000 .... 1111 .... 1000 .... @rndm +QSUB 1111 1010 1000 .... 1111 .... 1010 .... @rndm +QDADD 1111 1010 1000 .... 1111 .... 1001 .... @rndm +QDSUB 1111 1010 1000 .... 1111 .... 1011 .... @rndm + +CRC32B 1111 1010 1100 .... 1111 .... 1000 .... @rndm +CRC32H 1111 1010 1100 .... 1111 .... 1001 .... @rndm +CRC32W 1111 1010 1100 .... 1111 .... 1010 .... @rndm +CRC32CB 1111 1010 1101 .... 1111 .... 1000 .... @rndm +CRC32CH 1111 1010 1101 .... 1111 .... 1001 .... @rndm +CRC32CW 1111 1010 1101 .... 1111 .... 1010 .... @rndm + +SEL 1111 1010 1010 .... 1111 .... 1000 .... @rndm + +# Note rn != rm is CONSTRAINED UNPREDICTABLE; we choose to ignore rn. +REV 1111 1010 1001 ---- 1111 .... 1000 .... @rdm +REV16 1111 1010 1001 ---- 1111 .... 1001 .... @rdm +RBIT 1111 1010 1001 ---- 1111 .... 1010 .... @rdm +REVSH 1111 1010 1001 ---- 1111 .... 1011 .... @rdm +CLZ 1111 1010 1011 ---- 1111 .... 1000 .... @rdm + +# Branches and miscellaneous control + +%msr_sysm 4:1 8:4 +%mrs_sysm 4:1 16:4 +%imm16_16_0 16:4 0:12 +%imm21 26:s1 11:1 13:1 16:6 0:11 !function=times_2 +&ci cond imm + +{ + # Group insn[25:23] = 111, which is cond=111x for the branch below, + # or unconditional, which would be illegal for the branch. + [ + # Hints, and CPS + { + [ + YIELD 1111 0011 1010 1111 1000 0000 0000 0001 + WFE 1111 0011 1010 1111 1000 0000 0000 0010 + WFI 1111 0011 1010 1111 1000 0000 0000 0011 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV 1111 0011 1010 1111 1000 0000 0000 0100 + # SEVL 1111 0011 1010 1111 1000 0000 0000 0101 + + ESB 1111 0011 1010 1111 1000 0000 0001 0000 + ] + + # The canonical nop ends in 0000 0000, but the whole rest + # of the space is "reserved hint, behaves as nop". + NOP 1111 0011 1010 1111 1000 0000 ---- ---- + + # If imod == '00' && M == '0' then SEE "Hint instructions", above. + CPS 1111 0011 1010 1111 1000 0 imod:2 M:1 A:1 I:1 F:1 mode:5 \ + &cps + } + + # Miscellaneous control + CLREX 1111 0011 1011 1111 1000 1111 0010 1111 + DSB 1111 0011 1011 1111 1000 1111 0100 ---- + DMB 1111 0011 1011 1111 1000 1111 0101 ---- + ISB 1111 0011 1011 1111 1000 1111 0110 ---- + SB 1111 0011 1011 1111 1000 1111 0111 0000 + + # Note that the v7m insn overlaps both the normal and banked insn. + { + MRS_bank 1111 0011 111 r:1 .... 1000 rd:4 001. 0000 \ + &mrs_bank sysm=%mrs_sysm + MRS_reg 1111 0011 111 r:1 1111 1000 rd:4 0000 0000 &mrs_reg + MRS_v7m 1111 0011 111 0 1111 1000 rd:4 sysm:8 + } + { + MSR_bank 1111 0011 100 r:1 rn:4 1000 .... 001. 0000 \ + &msr_bank sysm=%msr_sysm + MSR_reg 1111 0011 100 r:1 rn:4 1000 mask:4 0000 0000 &msr_reg + MSR_v7m 1111 0011 100 0 rn:4 1000 mask:2 00 sysm:8 + } + BXJ 1111 0011 1100 rm:4 1000 1111 0000 0000 &r + { + # At v6T2, this is the T5 encoding of SUBS PC, LR, #IMM, and works as for + # every other encoding of SUBS. With v7VE, IMM=0 is redefined as ERET. + # The distinction between the two only matters for Hyp mode. + ERET 1111 0011 1101 1110 1000 1111 0000 0000 + SUB_rri 1111 0011 1101 1110 1000 1111 imm:8 \ + &s_rri_rot rot=0 s=1 rd=15 rn=14 + } + SMC 1111 0111 1111 imm:4 1000 0000 0000 0000 &i + HVC 1111 0111 1110 .... 1000 .... .... .... \ + &i imm=%imm16_16_0 + UDF 1111 0111 1111 ---- 1010 ---- ---- ---- + ] + B_cond_thumb 1111 0. cond:4 ...... 10.0 ............ &ci imm=%imm21 +} + +# Load/store (register, immediate, literal) + +@ldst_rr .... .... .... rn:4 rt:4 ...... 
shimm:2 rm:4 \ + &ldst_rr p=1 w=0 u=1 shtype=0 +@ldst_ri_idx .... .... .... rn:4 rt:4 . p:1 u:1 . imm:8 \ + &ldst_ri w=1 +@ldst_ri_neg .... .... .... rn:4 rt:4 .... imm:8 \ + &ldst_ri p=1 w=0 u=0 +@ldst_ri_unp .... .... .... rn:4 rt:4 .... imm:8 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_pos .... .... .... rn:4 rt:4 imm:12 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_lit .... .... u:1 ... .... rt:4 imm:12 \ + &ldst_ri p=1 w=0 rn=15 + +STRB_rr 1111 1000 0000 .... .... 000000 .. .... @ldst_rr +STRB_ri 1111 1000 0000 .... .... 1..1 ........ @ldst_ri_idx +STRB_ri 1111 1000 0000 .... .... 1100 ........ @ldst_ri_neg +STRBT_ri 1111 1000 0000 .... .... 1110 ........ @ldst_ri_unp +STRB_ri 1111 1000 1000 .... .... ............ @ldst_ri_pos + +STRH_rr 1111 1000 0010 .... .... 000000 .. .... @ldst_rr +STRH_ri 1111 1000 0010 .... .... 1..1 ........ @ldst_ri_idx +STRH_ri 1111 1000 0010 .... .... 1100 ........ @ldst_ri_neg +STRHT_ri 1111 1000 0010 .... .... 1110 ........ @ldst_ri_unp +STRH_ri 1111 1000 1010 .... .... ............ @ldst_ri_pos + +STR_rr 1111 1000 0100 .... .... 000000 .. .... @ldst_rr +STR_ri 1111 1000 0100 .... .... 1..1 ........ @ldst_ri_idx +STR_ri 1111 1000 0100 .... .... 1100 ........ @ldst_ri_neg +STRT_ri 1111 1000 0100 .... .... 1110 ........ @ldst_ri_unp +STR_ri 1111 1000 1100 .... .... ............ @ldst_ri_pos + +# Note that Load, unsigned (literal) overlaps all other load encodings. +{ + { + NOP 1111 1000 -001 1111 1111 ------------ # PLD + LDRB_ri 1111 1000 .001 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1000 1001 ---- 1111 ------------ # PLD + LDRB_ri 1111 1000 1001 .... .... ............ @ldst_ri_pos + } + LDRB_ri 1111 1000 0001 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1000 0001 ---- 1111 1100 -------- # PLD + LDRB_ri 1111 1000 0001 .... .... 1100 ........ @ldst_ri_neg + } + LDRBT_ri 1111 1000 0001 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1000 0001 ---- 1111 000000 -- ---- # PLD + LDRB_rr 1111 1000 0001 .... .... 000000 .. .... @ldst_rr + } +} +{ + { + NOP 1111 1000 -011 1111 1111 ------------ # PLD + LDRH_ri 1111 1000 .011 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1000 1011 ---- 1111 ------------ # PLDW + LDRH_ri 1111 1000 1011 .... .... ............ @ldst_ri_pos + } + LDRH_ri 1111 1000 0011 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1000 0011 ---- 1111 1100 -------- # PLDW + LDRH_ri 1111 1000 0011 .... .... 1100 ........ @ldst_ri_neg + } + LDRHT_ri 1111 1000 0011 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1000 0011 ---- 1111 000000 -- ---- # PLDW + LDRH_rr 1111 1000 0011 .... .... 000000 .. .... @ldst_rr + } +} +{ + LDR_ri 1111 1000 .101 1111 .... ............ @ldst_ri_lit + LDR_ri 1111 1000 1101 .... .... ............ @ldst_ri_pos + LDR_ri 1111 1000 0101 .... .... 1..1 ........ @ldst_ri_idx + LDR_ri 1111 1000 0101 .... .... 1100 ........ @ldst_ri_neg + LDRT_ri 1111 1000 0101 .... .... 1110 ........ @ldst_ri_unp + LDR_rr 1111 1000 0101 .... .... 000000 .. .... @ldst_rr +} +# NOPs here are PLI. +{ + { + NOP 1111 1001 -001 1111 1111 ------------ + LDRSB_ri 1111 1001 .001 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1001 1001 ---- 1111 ------------ + LDRSB_ri 1111 1001 1001 .... .... ............ @ldst_ri_pos + } + LDRSB_ri 1111 1001 0001 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1001 0001 ---- 1111 1100 -------- + LDRSB_ri 1111 1001 0001 .... .... 1100 ........ @ldst_ri_neg + } + LDRSBT_ri 1111 1001 0001 .... .... 1110 ........ 
@ldst_ri_unp + { + NOP 1111 1001 0001 ---- 1111 000000 -- ---- + LDRSB_rr 1111 1001 0001 .... .... 000000 .. .... @ldst_rr + } +} +# NOPs here are unallocated memory hints, treated as NOP. +{ + { + NOP 1111 1001 -011 1111 1111 ------------ + LDRSH_ri 1111 1001 .011 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1001 1011 ---- 1111 ------------ + LDRSH_ri 1111 1001 1011 .... .... ............ @ldst_ri_pos + } + LDRSH_ri 1111 1001 0011 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1001 0011 ---- 1111 1100 -------- + LDRSH_ri 1111 1001 0011 .... .... 1100 ........ @ldst_ri_neg + } + LDRSHT_ri 1111 1001 0011 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1001 0011 ---- 1111 000000 -- ---- + LDRSH_rr 1111 1001 0011 .... .... 000000 .. .... @ldst_rr + } +} + +%imm8x4 0:8 !function=times_4 +&ldst_ri2 p w u rn rt rt2 imm +@ldstd_ri8 .... .... u:1 ... rn:4 rt:4 rt2:4 ........ \ + &ldst_ri2 imm=%imm8x4 + +STRD_ri_t32 1110 1000 .110 .... .... .... ........ @ldstd_ri8 w=1 p=0 +LDRD_ri_t32 1110 1000 .111 .... .... .... ........ @ldstd_ri8 w=1 p=0 + +STRD_ri_t32 1110 1001 .100 .... .... .... ........ @ldstd_ri8 w=0 p=1 +LDRD_ri_t32 1110 1001 .101 .... .... .... ........ @ldstd_ri8 w=0 p=1 + +STRD_ri_t32 1110 1001 .110 .... .... .... ........ @ldstd_ri8 w=1 p=1 +{ + SG 1110 1001 0111 1111 1110 1001 01111111 + LDRD_ri_t32 1110 1001 .111 .... .... .... ........ @ldstd_ri8 w=1 p=1 +} + +# Load/Store Exclusive, Load-Acquire/Store-Release, and Table Branch + +@strex_i .... .... .... rn:4 rt:4 rd:4 .... .... \ + &strex rt2=15 imm=%imm8x4 +@strex_0 .... .... .... rn:4 rt:4 .... .... rd:4 \ + &strex rt2=15 imm=0 +@strex_d .... .... .... rn:4 rt:4 rt2:4 .... rd:4 \ + &strex imm=0 + +@ldrex_i .... .... .... rn:4 rt:4 .... .... .... \ + &ldrex rt2=15 imm=%imm8x4 +@ldrex_0 .... .... .... rn:4 rt:4 .... .... .... \ + &ldrex rt2=15 imm=0 +@ldrex_d .... .... .... rn:4 rt:4 rt2:4 .... .... \ + &ldrex imm=0 + +{ + TT 1110 1000 0100 rn:4 1111 rd:4 A:1 T:1 000000 + STREX 1110 1000 0100 .... .... .... .... .... @strex_i +} +STREXB 1110 1000 1100 .... .... 1111 0100 .... @strex_0 +STREXH 1110 1000 1100 .... .... 1111 0101 .... @strex_0 +STREXD_t32 1110 1000 1100 .... .... .... 0111 .... @strex_d + +STLEX 1110 1000 1100 .... .... 1111 1110 .... @strex_0 +STLEXB 1110 1000 1100 .... .... 1111 1100 .... @strex_0 +STLEXH 1110 1000 1100 .... .... 1111 1101 .... @strex_0 +STLEXD_t32 1110 1000 1100 .... .... .... 1111 .... @strex_d + +STL 1110 1000 1100 .... .... 1111 1010 1111 @ldrex_0 +STLB 1110 1000 1100 .... .... 1111 1000 1111 @ldrex_0 +STLH 1110 1000 1100 .... .... 1111 1001 1111 @ldrex_0 + +LDREX 1110 1000 0101 .... .... 1111 .... .... @ldrex_i +LDREXB 1110 1000 1101 .... .... 1111 0100 1111 @ldrex_0 +LDREXH 1110 1000 1101 .... .... 1111 0101 1111 @ldrex_0 +LDREXD_t32 1110 1000 1101 .... .... .... 0111 1111 @ldrex_d + +LDAEX 1110 1000 1101 .... .... 1111 1110 1111 @ldrex_0 +LDAEXB 1110 1000 1101 .... .... 1111 1100 1111 @ldrex_0 +LDAEXH 1110 1000 1101 .... .... 1111 1101 1111 @ldrex_0 +LDAEXD_t32 1110 1000 1101 .... .... .... 1111 1111 @ldrex_d + +LDA 1110 1000 1101 .... .... 1111 1010 1111 @ldrex_0 +LDAB 1110 1000 1101 .... .... 1111 1000 1111 @ldrex_0 +LDAH 1110 1000 1101 .... .... 1111 1001 1111 @ldrex_0 + +&tbranch rn rm +@tbranch .... .... .... rn:4 .... .... .... rm:4 &tbranch + +TBB 1110 1000 1101 .... 1111 0000 0000 .... @tbranch +TBH 1110 1000 1101 .... 1111 0000 0001 .... @tbranch + +# Parallel addition and subtraction + +SADD8 1111 1010 1000 .... 1111 .... 0000 .... 
@rndm
+QADD8            1111 1010 1000 .... 1111 .... 0001 ....  @rndm
+SHADD8           1111 1010 1000 .... 1111 .... 0010 ....  @rndm
+UADD8            1111 1010 1000 .... 1111 .... 0100 ....  @rndm
+UQADD8           1111 1010 1000 .... 1111 .... 0101 ....  @rndm
+UHADD8           1111 1010 1000 .... 1111 .... 0110 ....  @rndm
+
+SADD16           1111 1010 1001 .... 1111 .... 0000 ....  @rndm
+QADD16           1111 1010 1001 .... 1111 .... 0001 ....  @rndm
+SHADD16          1111 1010 1001 .... 1111 .... 0010 ....  @rndm
+UADD16           1111 1010 1001 .... 1111 .... 0100 ....  @rndm
+UQADD16          1111 1010 1001 .... 1111 .... 0101 ....  @rndm
+UHADD16          1111 1010 1001 .... 1111 .... 0110 ....  @rndm
+
+SASX             1111 1010 1010 .... 1111 .... 0000 ....  @rndm
+QASX             1111 1010 1010 .... 1111 .... 0001 ....  @rndm
+SHASX            1111 1010 1010 .... 1111 .... 0010 ....  @rndm
+UASX             1111 1010 1010 .... 1111 .... 0100 ....  @rndm
+UQASX            1111 1010 1010 .... 1111 .... 0101 ....  @rndm
+UHASX            1111 1010 1010 .... 1111 .... 0110 ....  @rndm
+
+SSUB8            1111 1010 1100 .... 1111 .... 0000 ....  @rndm
+QSUB8            1111 1010 1100 .... 1111 .... 0001 ....  @rndm
+SHSUB8           1111 1010 1100 .... 1111 .... 0010 ....  @rndm
+USUB8            1111 1010 1100 .... 1111 .... 0100 ....  @rndm
+UQSUB8           1111 1010 1100 .... 1111 .... 0101 ....  @rndm
+UHSUB8           1111 1010 1100 .... 1111 .... 0110 ....  @rndm
+
+SSUB16           1111 1010 1101 .... 1111 .... 0000 ....  @rndm
+QSUB16           1111 1010 1101 .... 1111 .... 0001 ....  @rndm
+SHSUB16          1111 1010 1101 .... 1111 .... 0010 ....  @rndm
+USUB16           1111 1010 1101 .... 1111 .... 0100 ....  @rndm
+UQSUB16          1111 1010 1101 .... 1111 .... 0101 ....  @rndm
+UHSUB16          1111 1010 1101 .... 1111 .... 0110 ....  @rndm
+
+SSAX             1111 1010 1110 .... 1111 .... 0000 ....  @rndm
+QSAX             1111 1010 1110 .... 1111 .... 0001 ....  @rndm
+SHSAX            1111 1010 1110 .... 1111 .... 0010 ....  @rndm
+USAX             1111 1010 1110 .... 1111 .... 0100 ....  @rndm
+UQSAX            1111 1010 1110 .... 1111 .... 0101 ....  @rndm
+UHSAX            1111 1010 1110 .... 1111 .... 0110 ....  @rndm
+
+# Register extends
+
+@rrr_rot         .... .... .... rn:4 .... rd:4 .. rot:2 rm:4  &rrr_rot
+
+SXTAH            1111 1010 0000 .... 1111 .... 10.. ....  @rrr_rot
+UXTAH            1111 1010 0001 .... 1111 .... 10.. ....  @rrr_rot
+SXTAB16          1111 1010 0010 .... 1111 .... 10.. ....  @rrr_rot
+UXTAB16          1111 1010 0011 .... 1111 .... 10.. ....  @rrr_rot
+SXTAB            1111 1010 0100 .... 1111 .... 10.. ....  @rrr_rot
+UXTAB            1111 1010 0101 .... 1111 .... 10.. ....  @rrr_rot
+
+# Load/store multiple
+
+@ldstm           .... .... .. w:1 . rn:4 list:16  &ldst_block u=0
+
+STM_t32          1110 1000 10.0 .... ................  @ldstm i=1 b=0
+STM_t32          1110 1001 00.0 .... ................  @ldstm i=0 b=1
+{
+  # Rn=15 UNDEFs for LDM; M-profile CLRM uses that encoding
+  CLRM           1110 1000 1001 1111 list:16
+  LDM_t32        1110 1000 10.1 .... ................  @ldstm i=1 b=0
+}
+LDM_t32          1110 1001 00.1 .... ................  @ldstm i=0 b=1
+
+&rfe             !extern rn w pu
+@rfe             .... .... .. w:1 . rn:4 ................  &rfe
+
+RFE              1110 1000 00.1 .... 1100000000000000  @rfe pu=2
+RFE              1110 1001 10.1 .... 1100000000000000  @rfe pu=1
+
+&srs             !extern mode w pu
+@srs             .... .... .. w:1 . .... ........... mode:5  &srs
+
+SRS              1110 1000 00.0 1101 1100 0000 000. ....  @srs pu=2
+SRS              1110 1001 10.0 1101 1100 0000 000. ....  @srs pu=1
+
+# Coprocessor instructions
+
+# We decode MCR, MRC, MRRC and MCRR only, because for QEMU the
+# other coprocessor instructions always UNDEF.
+# The trans_ functions for these will ignore cp values 8..13 for v7 or
+# earlier, and 0..13 for v8 and later, because those areas of the
+# encoding space may be used for other things, such as VFP or Neon.
+
+@mcr             .... .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4
+@mcrr            .... .... ....
rt2:4 rt:4 cp:4 opc1:4 crm:4 + +MCRR 1110 1100 0100 .... .... .... .... .... @mcrr +MRRC 1110 1100 0101 .... .... .... .... .... @mcrr + +MCR 1110 1110 ... 0 .... .... .... ... 1 .... @mcr +MRC 1110 1110 ... 1 .... .... .... ... 1 .... @mcr + +# Branches + +%imm24 26:s1 13:1 11:1 16:10 0:11 !function=t32_branch24 +@branch24 ................................ &i imm=%imm24 + +B 1111 0. .......... 10.1 ............ @branch24 +BL 1111 0. .......... 11.1 ............ @branch24 +{ + # BLX_i is non-M-profile only + BLX_i 1111 0. .......... 11.0 ............ @branch24 + # M-profile only: loop and branch insns + [ + # All these BF insns have boff != 0b0000; we NOP them all + BF 1111 0 boff:4 ------- 1100 - ---------- 1 # BFL + BF 1111 0 boff:4 0 ------ 1110 - ---------- 1 # BFCSEL + BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF + BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX + ] + [ + # LE and WLS immediate + %lob_imm 1:10 11:1 !function=times_2 + + DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 size=4 + WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm size=4 + { + LE 1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm + # This is WLSTP + WLS 1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm + } + { + LCTP 1111 0 0000 000 1111 1110 0000 0000 0001 + # This is DLSTP + DLS 1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001 + } + VCTP 1111 0 0000 0 size:2 rn:4 1110 1000 0000 0001 + ] +} diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c new file mode 100644 index 0000000..da9f877 --- /dev/null +++ b/target/arm/tcg/translate-a64.c @@ -0,0 +1,15054 @@ +/* + * AArch64 translation + * + * Copyright (c) 2013 Alexander Graf <agraf@suse.de> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "qemu/log.h" +#include "arm_ldst.h" +#include "translate.h" +#include "internals.h" +#include "qemu/host-utils.h" +#include "semihosting/semihost.h" +#include "exec/gen-icount.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "cpregs.h" +#include "translate-a64.h" +#include "qemu/atomic128.h" + +static TCGv_i64 cpu_X[32]; +static TCGv_i64 cpu_pc; + +/* Load/store exclusive handling */ +static TCGv_i64 cpu_exclusive_high; + +static const char *regnames[] = { + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "x29", "lr", "sp" +}; + +enum a64_shift_type { + A64_SHIFT_TYPE_LSL = 0, + A64_SHIFT_TYPE_LSR = 1, + A64_SHIFT_TYPE_ASR = 2, + A64_SHIFT_TYPE_ROR = 3 +}; + +/* Table based decoder typedefs - used when the relevant bits for decode + * are too awkwardly scattered across the instruction (eg SIMD). 
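+ * Each AArch64DecodeTable entry (defined just below) pairs the fixed
+ * bits to match (pattern) and the bits that take part in the match
+ * (mask) with the handler to run; in sketch form, an entry fires when
+ *
+ *     (insn & entry->mask) == entry->pattern
+ *
+ * and entry->disas_fn(s, insn) is then invoked.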
 */
+typedef void AArch64DecodeFn(DisasContext *s, uint32_t insn);
+
+typedef struct AArch64DecodeTable {
+    uint32_t pattern;
+    uint32_t mask;
+    AArch64DecodeFn *disas_fn;
+} AArch64DecodeTable;
+
+/* initialize TCG globals.  */
+void a64_translate_init(void)
+{
+    int i;
+
+    cpu_pc = tcg_global_mem_new_i64(cpu_env,
+                                    offsetof(CPUARMState, pc),
+                                    "pc");
+    for (i = 0; i < 32; i++) {
+        cpu_X[i] = tcg_global_mem_new_i64(cpu_env,
+                                          offsetof(CPUARMState, xregs[i]),
+                                          regnames[i]);
+    }
+
+    cpu_exclusive_high = tcg_global_mem_new_i64(cpu_env,
+        offsetof(CPUARMState, exclusive_high), "exclusive_high");
+}
+
+/*
+ * Return the core mmu_idx to use for A64 "unprivileged load/store" insns
+ */
+static int get_a64_user_mem_index(DisasContext *s)
+{
+    /*
+     * If AccType_UNPRIV is not used, the insn uses AccType_NORMAL,
+     * which is the usual mmu_idx for this cpu state.
+     */
+    ARMMMUIdx useridx = s->mmu_idx;
+
+    if (s->unpriv) {
+        /*
+         * We have pre-computed the condition for AccType_UNPRIV.
+         * Therefore we should never get here with a mmu_idx for
+         * which we do not know the corresponding user mmu_idx.
+         */
+        switch (useridx) {
+        case ARMMMUIdx_E10_1:
+        case ARMMMUIdx_E10_1_PAN:
+            useridx = ARMMMUIdx_E10_0;
+            break;
+        case ARMMMUIdx_E20_2:
+        case ARMMMUIdx_E20_2_PAN:
+            useridx = ARMMMUIdx_E20_0;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
+    return arm_to_core_mmu_idx(useridx);
+}
+
+static void set_btype_raw(int val)
+{
+    tcg_gen_st_i32(tcg_constant_i32(val), cpu_env,
+                   offsetof(CPUARMState, btype));
+}
+
+static void set_btype(DisasContext *s, int val)
+{
+    /* BTYPE is a 2-bit field, and 0 should be done with reset_btype.  */
+    tcg_debug_assert(val >= 1 && val <= 3);
+    set_btype_raw(val);
+    s->btype = -1;
+}
+
+static void reset_btype(DisasContext *s)
+{
+    if (s->btype != 0) {
+        set_btype_raw(0);
+        s->btype = 0;
+    }
+}
+
+static void gen_pc_plus_diff(DisasContext *s, TCGv_i64 dest, target_long diff)
+{
+    assert(s->pc_save != -1);
+    if (TARGET_TB_PCREL) {
+        tcg_gen_addi_i64(dest, cpu_pc, (s->pc_curr - s->pc_save) + diff);
+    } else {
+        tcg_gen_movi_i64(dest, s->pc_curr + diff);
+    }
+}
+
+void gen_a64_update_pc(DisasContext *s, target_long diff)
+{
+    gen_pc_plus_diff(s, cpu_pc, diff);
+    s->pc_save = s->pc_curr + diff;
+}
+
+/*
+ * Handle Top Byte Ignore (TBI) bits.
+ *
+ * If address tagging is enabled via the TCR TBI bits:
+ *  + for EL2 and EL3 there is only one TBI bit, and if it is set
+ *    then the address is zero-extended, clearing bits [63:56]
+ *  + for EL0 and EL1, TBI0 controls addresses with bit 55 == 0
+ *    and TBI1 controls addresses with bit 55 == 1.
+ *    If the appropriate TBI bit is set for the address then
+ *    the address is sign-extended from bit 55 into bits [63:56]
+ *
+ * Here we have concatenated TBI{1,0} into tbi.
+ */
+static void gen_top_byte_ignore(DisasContext *s, TCGv_i64 dst,
+                                TCGv_i64 src, int tbi)
+{
+    if (tbi == 0) {
+        /* Load unmodified address */
+        tcg_gen_mov_i64(dst, src);
+    } else if (!regime_has_2_ranges(s->mmu_idx)) {
+        /* Force tag byte to all zero */
+        tcg_gen_extract_i64(dst, src, 0, 56);
+    } else {
+        /* Sign-extend from bit 55.
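+         * As a worked example of the cases below: the sextract forces
+         * bits [63:56] to 0x00 when bit 55 is clear and to 0xff when
+         * bit 55 is set.  For tbi == 1 (TBI0 only), the AND with src
+         * restores the original top byte whenever bit 55 == 1, so only
+         * the low half of the address space has its tag ignored;
+         * tbi == 2 does the converse with OR; tbi == 3 keeps the
+         * extension for both halves.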
*/ + tcg_gen_sextract_i64(dst, src, 0, 56); + + switch (tbi) { + case 1: + /* tbi0 but !tbi1: only use the extension if positive */ + tcg_gen_and_i64(dst, dst, src); + break; + case 2: + /* !tbi0 but tbi1: only use the extension if negative */ + tcg_gen_or_i64(dst, dst, src); + break; + case 3: + /* tbi0 and tbi1: always use the extension */ + break; + default: + g_assert_not_reached(); + } + } +} + +static void gen_a64_set_pc(DisasContext *s, TCGv_i64 src) +{ + /* + * If address tagging is enabled for instructions via the TCR TBI bits, + * then loading an address into the PC will clear out any tag. + */ + gen_top_byte_ignore(s, cpu_pc, src, s->tbii); + s->pc_save = -1; +} + +/* + * Handle MTE and/or TBI. + * + * For TBI, ideally, we would do nothing. Proper behaviour on fault is + * for the tag to be present in the FAR_ELx register. But for user-only + * mode we do not have a TLB with which to implement this, so we must + * remove the top byte now. + * + * Always return a fresh temporary that we can increment independently + * of the write-back address. + */ + +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) +{ + TCGv_i64 clean = new_tmp_a64(s); +#ifdef CONFIG_USER_ONLY + gen_top_byte_ignore(s, clean, addr, s->tbid); +#else + tcg_gen_mov_i64(clean, addr); +#endif + return clean; +} + +/* Insert a zero tag into src, with the result at dst. */ +static void gen_address_with_allocation_tag0(TCGv_i64 dst, TCGv_i64 src) +{ + tcg_gen_andi_i64(dst, src, ~MAKE_64BIT_MASK(56, 4)); +} + +static void gen_probe_access(DisasContext *s, TCGv_i64 ptr, + MMUAccessType acc, int log2_size) +{ + gen_helper_probe_access(cpu_env, ptr, + tcg_constant_i32(acc), + tcg_constant_i32(get_mem_index(s)), + tcg_constant_i32(1 << log2_size)); +} + +/* + * For MTE, check a single logical or atomic access. This probes a single + * address, the exact one specified. The size and alignment of the access + * is not relevant to MTE, per se, but watchpoints do require the size, + * and we want to recognize those before making any other changes to state. + */ +static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, + bool is_write, bool tag_checked, + int log2_size, bool is_unpriv, + int core_idx) +{ + if (tag_checked && s->mte_active[is_unpriv]) { + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, core_idx); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << log2_size) - 1); + + ret = new_tmp_a64(s); + gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); + + return ret; + } + return clean_data_tbi(s, addr); +} + +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size) +{ + return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size, + false, get_mem_index(s)); +} + +/* + * For MTE, check multiple logical sequential accesses. 
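+ * As with gen_mte_check1 above, a real check is only emitted when the
+ * access is tag-checked and MTE is active; otherwise the address is
+ * simply cleaned via TBI.  The descriptor packed below carries the mmu
+ * index, the TBI/TCMA state, the write flag and the total size minus
+ * one for the out-of-line helper.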
+ */ +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int size) +{ + if (tag_checked && s->mte_active[0]) { + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, size - 1); + + ret = new_tmp_a64(s); + gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); + + return ret; + } + return clean_data_tbi(s, addr); +} + +typedef struct DisasCompare64 { + TCGCond cond; + TCGv_i64 value; +} DisasCompare64; + +static void a64_test_cc(DisasCompare64 *c64, int cc) +{ + DisasCompare c32; + + arm_test_cc(&c32, cc); + + /* Sign-extend the 32-bit value so that the GE/LT comparisons work + * properly. The NE/EQ comparisons are also fine with this choice. */ + c64->cond = c32.cond; + c64->value = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(c64->value, c32.value); + + arm_free_cc(&c32); +} + +static void a64_free_cc(DisasCompare64 *c64) +{ + tcg_temp_free_i64(c64->value); +} + +static void gen_rebuild_hflags(DisasContext *s) +{ + gen_helper_rebuild_hflags_a64(cpu_env, tcg_constant_i32(s->current_el)); +} + +static void gen_exception_internal(int excp) +{ + assert(excp_is_internal(excp)); + gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); +} + +static void gen_exception_internal_insn(DisasContext *s, int excp) +{ + gen_a64_update_pc(s, 0); + gen_exception_internal(excp); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syndrome) +{ + gen_a64_update_pc(s, 0); + gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syndrome)); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_step_complete_exception(DisasContext *s) +{ + /* We just completed step of an insn. Move from Active-not-pending + * to Active-pending, and then also take the swstep exception. + * This corresponds to making the (IMPDEF) choice to prioritize + * swstep exceptions over asynchronous exceptions taken to an exception + * level where debug is disabled. This choice has the advantage that + * we do not need to maintain internal state corresponding to the + * ISV/EX syndrome bits between completion of the step and generation + * of the exception, and our syndrome information is always correct. + */ + gen_ss_advance(s); + gen_swstep_exception(s, 1, s->is_ldex); + s->base.is_jmp = DISAS_NORETURN; +} + +static inline bool use_goto_tb(DisasContext *s, uint64_t dest) +{ + if (s->ss_active) { + return false; + } + return translator_use_goto_tb(&s->base, dest); +} + +static void gen_goto_tb(DisasContext *s, int n, int64_t diff) +{ + if (use_goto_tb(s, s->pc_curr + diff)) { + /* + * For pcrel, the pc must always be up-to-date on entry to + * the linked TB, so that it can use simple additions for all + * further adjustments. For !pcrel, the linked TB is compiled + * to know its full virtual address, so we can delay the + * update to pc to the unlinked path. A long chain of links + * can thus avoid many updates to the PC. 
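+ * Concretely, on the !pcrel path the write to pc is placed after the
+ * goto_tb, so once this TB has been linked to its successor the jump
+ * skips the update entirely and pc is only written when the chain is
+ * left through the unlinked exit.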
+ */
+        if (TARGET_TB_PCREL) {
+            gen_a64_update_pc(s, diff);
+            tcg_gen_goto_tb(n);
+        } else {
+            tcg_gen_goto_tb(n);
+            gen_a64_update_pc(s, diff);
+        }
+        tcg_gen_exit_tb(s->base.tb, n);
+        s->base.is_jmp = DISAS_NORETURN;
+    } else {
+        gen_a64_update_pc(s, diff);
+        if (s->ss_active) {
+            gen_step_complete_exception(s);
+        } else {
+            tcg_gen_lookup_and_goto_ptr();
+            s->base.is_jmp = DISAS_NORETURN;
+        }
+    }
+}
+
+static void init_tmp_a64_array(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+    memset(s->tmp_a64, 0, sizeof(s->tmp_a64));
+#endif
+    s->tmp_a64_count = 0;
+}
+
+static void free_tmp_a64(DisasContext *s)
+{
+    int i;
+    for (i = 0; i < s->tmp_a64_count; i++) {
+        tcg_temp_free_i64(s->tmp_a64[i]);
+    }
+    init_tmp_a64_array(s);
+}
+
+TCGv_i64 new_tmp_a64(DisasContext *s)
+{
+    assert(s->tmp_a64_count < TMP_A64_MAX);
+    return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64();
+}
+
+TCGv_i64 new_tmp_a64_local(DisasContext *s)
+{
+    assert(s->tmp_a64_count < TMP_A64_MAX);
+    return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_local_new_i64();
+}
+
+TCGv_i64 new_tmp_a64_zero(DisasContext *s)
+{
+    TCGv_i64 t = new_tmp_a64(s);
+    tcg_gen_movi_i64(t, 0);
+    return t;
+}
+
+/*
+ * Register access functions
+ *
+ * These functions are used for directly accessing a register in cases
+ * where changes to the final register value are likely to be made. If
+ * you need to use a register for temporary calculation (e.g. index type
+ * operations) use the read_* form.
+ *
+ * B1.2.1 Register mappings
+ *
+ * In instruction register encoding 31 can refer to ZR (zero register) or
+ * the SP (stack pointer) depending on context. In QEMU's case we map SP
+ * to cpu_X[31] and ZR accesses to a temporary which can be discarded.
+ * This is the point of the _sp forms.
+ */
+TCGv_i64 cpu_reg(DisasContext *s, int reg)
+{
+    if (reg == 31) {
+        return new_tmp_a64_zero(s);
+    } else {
+        return cpu_X[reg];
+    }
+}
+
+/* register access for when 31 == SP */
+TCGv_i64 cpu_reg_sp(DisasContext *s, int reg)
+{
+    return cpu_X[reg];
+}
+
+/* read a cpu register in 32bit/64bit mode. Returns a TCGv_i64
+ * representing the register contents. This TCGv is an auto-freed
+ * temporary so it need not be explicitly freed, and may be modified.
+ */
+TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf)
+{
+    TCGv_i64 v = new_tmp_a64(s);
+    if (reg != 31) {
+        if (sf) {
+            tcg_gen_mov_i64(v, cpu_X[reg]);
+        } else {
+            tcg_gen_ext32u_i64(v, cpu_X[reg]);
+        }
+    } else {
+        tcg_gen_movi_i64(v, 0);
+    }
+    return v;
+}
+
+TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf)
+{
+    TCGv_i64 v = new_tmp_a64(s);
+    if (sf) {
+        tcg_gen_mov_i64(v, cpu_X[reg]);
+    } else {
+        tcg_gen_ext32u_i64(v, cpu_X[reg]);
+    }
+    return v;
+}
+
+/* Return the offset into CPUARMState of a slice (from
+ * the least significant end) of FP register Qn (ie
+ * Dn, Sn, Hn or Bn).
+ * (Note that this is not the same mapping as for A32; see cpu.h)
+ */
+static inline int fp_reg_offset(DisasContext *s, int regno, MemOp size)
+{
+    return vec_reg_offset(s, regno, 0, size);
+}
+
+/* Offset of the high half of the 128 bit vector Qn */
+static inline int fp_reg_hi_offset(DisasContext *s, int regno)
+{
+    return vec_reg_offset(s, regno, 1, MO_64);
+}
+
+/* Convenience accessors for reading and writing single and double
+ * FP registers. Writing clears the upper parts of the associated
+ * 128 bit vector register, as required by the architecture.
+ * Note that unlike the GP register accessors, the values returned
+ * by the read functions must be manually freed.
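+ * A typical use is therefore (sketch):
+ *
+ *     TCGv_i64 t = read_fp_dreg(s, rn);
+ *     ...
+ *     tcg_temp_free_i64(t);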
+ */ +static TCGv_i64 read_fp_dreg(DisasContext *s, int reg) +{ + TCGv_i64 v = tcg_temp_new_i64(); + + tcg_gen_ld_i64(v, cpu_env, fp_reg_offset(s, reg, MO_64)); + return v; +} + +static TCGv_i32 read_fp_sreg(DisasContext *s, int reg) +{ + TCGv_i32 v = tcg_temp_new_i32(); + + tcg_gen_ld_i32(v, cpu_env, fp_reg_offset(s, reg, MO_32)); + return v; +} + +static TCGv_i32 read_fp_hreg(DisasContext *s, int reg) +{ + TCGv_i32 v = tcg_temp_new_i32(); + + tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16)); + return v; +} + +/* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64). + * If SVE is not enabled, then there are only 128 bits in the vector. + */ +static void clear_vec_high(DisasContext *s, bool is_q, int rd) +{ + unsigned ofs = fp_reg_offset(s, rd, MO_64); + unsigned vsz = vec_full_reg_size(s); + + /* Nop move, with side effect of clearing the tail. */ + tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz); +} + +void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v) +{ + unsigned ofs = fp_reg_offset(s, reg, MO_64); + + tcg_gen_st_i64(v, cpu_env, ofs); + clear_vec_high(s, false, reg); +} + +static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(tmp, v); + write_fp_dreg(s, reg, tmp); + tcg_temp_free_i64(tmp); +} + +/* Expand a 2-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, + GVecGen2Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an expander function. + */ +static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn, + int64_t imm, GVecGen2iFn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + imm, is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 3-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm, + GVecGen3Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 4-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn4(DisasContext *s, bool is_q, int rd, int rn, int rm, + int rx, GVecGen4Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), vec_full_reg_offset(s, rx), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand operation using an out-of-line helper. */ +static void gen_gvec_op2_ool(DisasContext *s, bool is_q, int rd, + int rn, int data, gen_helper_gvec_2 *fn) +{ + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* Expand a 3-operand operation using an out-of-line helper. */ +static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd, + int rn, int rm, int data, gen_helper_gvec_3 *fn) +{ + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* Expand a 3-operand + fpstatus pointer + simd data value operation using + * an out-of-line helper. 
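+ * The is_fp16 flag picks the half-precision float_status block
+ * (FPST_FPCR_F16) rather than FPST_FPCR, since FP16 flush-to-zero
+ * (FZ16) is controlled separately from the 32/64-bit behaviour.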
+ */ +static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn, + int rm, bool is_fp16, int data, + gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); + tcg_temp_free_ptr(fpst); +} + +/* Expand a 3-operand + qc + operation using an out-of-line helper. */ +static void gen_gvec_op3_qc(DisasContext *s, bool is_q, int rd, int rn, + int rm, gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr qc_ptr = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), qc_ptr, + is_q ? 16 : 8, vec_full_reg_size(s), 0, fn); + tcg_temp_free_ptr(qc_ptr); +} + +/* Expand a 4-operand operation using an out-of-line helper. */ +static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn, + int rm, int ra, int data, gen_helper_gvec_4 *fn) +{ + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* + * Expand a 4-operand + fpstatus pointer + simd data value operation using + * an out-of-line helper. + */ +static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn, + int rm, int ra, bool is_fp16, int data, + gen_helper_gvec_4_ptr *fn) +{ + TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); + tcg_temp_free_ptr(fpst); +} + +/* Set ZF and NF based on a 64 bit result. This is alas fiddlier + * than the 32 bit equivalent. + */ +static inline void gen_set_NZ64(TCGv_i64 result) +{ + tcg_gen_extr_i64_i32(cpu_ZF, cpu_NF, result); + tcg_gen_or_i32(cpu_ZF, cpu_ZF, cpu_NF); +} + +/* Set NZCV as for a logical operation: NZ as per result, CV cleared. 
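+ * For example, after ANDS the architecture requires N and Z to reflect
+ * the result with C and V reading as zero, which is what the two
+ * movi-of-zero ops in gen_logic_CC below implement.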
*/ +static inline void gen_logic_CC(int sf, TCGv_i64 result) +{ + if (sf) { + gen_set_NZ64(result); + } else { + tcg_gen_extrl_i64_i32(cpu_ZF, result); + tcg_gen_mov_i32(cpu_NF, cpu_ZF); + } + tcg_gen_movi_i32(cpu_CF, 0); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* dest = T0 + T1; compute C, N, V and Z flags */ +static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + TCGv_i64 result, flag, tmp; + result = tcg_temp_new_i64(); + flag = tcg_temp_new_i64(); + tmp = tcg_temp_new_i64(); + + tcg_gen_movi_i64(tmp, 0); + tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp); + + tcg_gen_extrl_i64_i32(cpu_CF, flag); + + gen_set_NZ64(result); + + tcg_gen_xor_i64(flag, result, t0); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_andc_i64(flag, flag, tmp); + tcg_temp_free_i64(tmp); + tcg_gen_extrh_i64_i32(cpu_VF, flag); + + tcg_gen_mov_i64(dest, result); + tcg_temp_free_i64(result); + tcg_temp_free_i64(flag); + } else { + /* 32 bit arithmetic */ + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp = tcg_temp_new_i32(); + + tcg_gen_movi_i32(tmp, 0); + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, t1_32, tmp); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(t0_32); + tcg_temp_free_i32(t1_32); + } +} + +/* dest = T0 - T1; compute C, N, V and Z flags */ +static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + /* 64 bit arithmetic */ + TCGv_i64 result, flag, tmp; + + result = tcg_temp_new_i64(); + flag = tcg_temp_new_i64(); + tcg_gen_sub_i64(result, t0, t1); + + gen_set_NZ64(result); + + tcg_gen_setcond_i64(TCG_COND_GEU, flag, t0, t1); + tcg_gen_extrl_i64_i32(cpu_CF, flag); + + tcg_gen_xor_i64(flag, result, t0); + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_and_i64(flag, flag, tmp); + tcg_temp_free_i64(tmp); + tcg_gen_extrh_i64_i32(cpu_VF, flag); + tcg_gen_mov_i64(dest, result); + tcg_temp_free_i64(flag); + tcg_temp_free_i64(result); + } else { + /* 32 bit arithmetic */ + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp; + + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_sub_i32(cpu_NF, t0_32, t1_32); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0_32, t1_32); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_temp_free_i32(t0_32); + tcg_temp_free_i32(t1_32); + tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + } +} + +/* dest = T0 + T1 + CF; do not compute flags. */ +static void gen_adc(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + TCGv_i64 flag = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(flag, cpu_CF); + tcg_gen_add_i64(dest, t0, t1); + tcg_gen_add_i64(dest, dest, flag); + tcg_temp_free_i64(flag); + + if (!sf) { + tcg_gen_ext32u_i64(dest, dest); + } +} + +/* dest = T0 + T1 + CF; compute C, N, V and Z flags. 
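+ * As a worked example for the two-step add2 in the sf case of
+ * gen_adc_CC below: with t0 == t1 == 0xffffffffffffffff and CF == 1
+ * the true result is 2^65 - 1 (low 64 bits all-ones, carry out 1);
+ * folding CF in with a plain 64-bit add first would lose that carry.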
*/ +static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + TCGv_i64 result = tcg_temp_new_i64(); + TCGv_i64 cf_64 = tcg_temp_new_i64(); + TCGv_i64 vf_64 = tcg_temp_new_i64(); + TCGv_i64 tmp = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + + tcg_gen_extu_i32_i64(cf_64, cpu_CF); + tcg_gen_add2_i64(result, cf_64, t0, zero, cf_64, zero); + tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, zero); + tcg_gen_extrl_i64_i32(cpu_CF, cf_64); + gen_set_NZ64(result); + + tcg_gen_xor_i64(vf_64, result, t0); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_andc_i64(vf_64, vf_64, tmp); + tcg_gen_extrh_i64_i32(cpu_VF, vf_64); + + tcg_gen_mov_i64(dest, result); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(vf_64); + tcg_temp_free_i64(cf_64); + tcg_temp_free_i64(result); + } else { + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, zero, cpu_CF, zero); + tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, zero); + + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(t1_32); + tcg_temp_free_i32(t0_32); + } +} + +/* + * Load/Store generators + */ + +/* + * Store from GPR register to memory. + */ +static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source, + TCGv_i64 tcg_addr, MemOp memop, int memidx, + bool iss_valid, + unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + memop = finalize_memop(s, memop); + tcg_gen_qemu_st_i64(source, tcg_addr, memidx, memop); + + if (iss_valid) { + uint32_t syn; + + syn = syn_data_abort_with_iss(0, + (memop & MO_SIZE), + false, + iss_srt, + iss_sf, + iss_ar, + 0, 0, 0, 0, 0, false); + disas_set_insn_syndrome(s, syn); + } +} + +static void do_gpr_st(DisasContext *s, TCGv_i64 source, + TCGv_i64 tcg_addr, MemOp memop, + bool iss_valid, + unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + do_gpr_st_memidx(s, source, tcg_addr, memop, get_mem_index(s), + iss_valid, iss_srt, iss_sf, iss_ar); +} + +/* + * Load from memory to GPR register + */ +static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, + MemOp memop, bool extend, int memidx, + bool iss_valid, unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + memop = finalize_memop(s, memop); + tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop); + + if (extend && (memop & MO_SIGN)) { + g_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_ext32u_i64(dest, dest); + } + + if (iss_valid) { + uint32_t syn; + + syn = syn_data_abort_with_iss(0, + (memop & MO_SIZE), + (memop & MO_SIGN) != 0, + iss_srt, + iss_sf, + iss_ar, + 0, 0, 0, 0, 0, false); + disas_set_insn_syndrome(s, syn); + } +} + +static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, + MemOp memop, bool extend, + bool iss_valid, unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + do_gpr_ld_memidx(s, dest, tcg_addr, memop, extend, get_mem_index(s), + iss_valid, iss_srt, iss_sf, iss_ar); +} + +/* + * Store from FP register to memory + */ +static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size) +{ + /* This writes the bottom N bits of a 128 bit wide vector to memory */ + TCGv_i64 tmplo = tcg_temp_new_i64(); + MemOp mop; + + tcg_gen_ld_i64(tmplo, cpu_env, 
fp_reg_offset(s, srcidx, MO_64)); + + if (size < 4) { + mop = finalize_memop(s, size); + tcg_gen_qemu_st_i64(tmplo, tcg_addr, get_mem_index(s), mop); + } else { + bool be = s->be_data == MO_BE; + TCGv_i64 tcg_hiaddr = tcg_temp_new_i64(); + TCGv_i64 tmphi = tcg_temp_new_i64(); + + tcg_gen_ld_i64(tmphi, cpu_env, fp_reg_hi_offset(s, srcidx)); + + mop = s->be_data | MO_UQ; + tcg_gen_qemu_st_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), + mop | (s->align_mem ? MO_ALIGN_16 : 0)); + tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); + tcg_gen_qemu_st_i64(be ? tmplo : tmphi, tcg_hiaddr, + get_mem_index(s), mop); + + tcg_temp_free_i64(tcg_hiaddr); + tcg_temp_free_i64(tmphi); + } + + tcg_temp_free_i64(tmplo); +} + +/* + * Load from memory to FP register + */ +static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size) +{ + /* This always zero-extends and writes to a full 128 bit wide vector */ + TCGv_i64 tmplo = tcg_temp_new_i64(); + TCGv_i64 tmphi = NULL; + MemOp mop; + + if (size < 4) { + mop = finalize_memop(s, size); + tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), mop); + } else { + bool be = s->be_data == MO_BE; + TCGv_i64 tcg_hiaddr; + + tmphi = tcg_temp_new_i64(); + tcg_hiaddr = tcg_temp_new_i64(); + + mop = s->be_data | MO_UQ; + tcg_gen_qemu_ld_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), + mop | (s->align_mem ? MO_ALIGN_16 : 0)); + tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); + tcg_gen_qemu_ld_i64(be ? tmplo : tmphi, tcg_hiaddr, + get_mem_index(s), mop); + tcg_temp_free_i64(tcg_hiaddr); + } + + tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64)); + tcg_temp_free_i64(tmplo); + + if (tmphi) { + tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx)); + tcg_temp_free_i64(tmphi); + } + clear_vec_high(s, tmphi != NULL, destidx); +} + +/* + * Vector load/store helpers. + * + * The principal difference between this and a FP load is that we don't + * zero extend as we are filling a partial chunk of the vector register. + * These functions don't support 128 bit loads/stores, which would be + * normal load/store operations. + * + * The _i32 versions are useful when operating on 32 bit quantities + * (eg for floating point single or using Neon helper functions). 
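+ *
+ * For example (illustrative), do_vec_ld(s, rd, 2, addr, MO_32 | s->be_data)
+ * replaces only 32-bit element 2 of Vd, leaving all other lanes of the
+ * register untouched.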
+ */ + +/* Get value of an element within a vector register */ +static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); + switch ((unsigned)memop) { + case MO_8: + tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_8|MO_SIGN: + tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_16|MO_SIGN: + tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_32|MO_SIGN: + tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_64: + case MO_64|MO_SIGN: + tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_8|MO_SIGN: + tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_16|MO_SIGN: + tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_32: + case MO_32|MO_SIGN: + tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +/* Set value of an element within a vector register */ +static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_st8_i64(tcg_src, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_st16_i64(tcg_src, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_st32_i64(tcg_src, cpu_env, vect_off); + break; + case MO_64: + tcg_gen_st_i64(tcg_src, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src, + int destidx, int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_st8_i32(tcg_src, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_st16_i32(tcg_src, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_st_i32(tcg_src, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +/* Store from vector register to memory */ +static void do_vec_st(DisasContext *s, int srcidx, int element, + TCGv_i64 tcg_addr, MemOp mop) +{ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + read_vec_element(s, tcg_tmp, srcidx, element, mop & MO_SIZE); + tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Load from memory to vector register */ +static void do_vec_ld(DisasContext *s, int destidx, int element, + TCGv_i64 tcg_addr, MemOp mop) +{ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); + write_vec_element(s, tcg_tmp, destidx, element, mop & MO_SIZE); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Check that FP/Neon access is enabled. If it is, return + * true. If not, emit code to generate an appropriate exception, + * and return false; the caller should not emit any code for + * the instruction. 
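+ * The usual pattern in the decode functions below is simply:
+ *     if (!fp_access_check(s)) {
+ *         return;
+ *     }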
Note that this check must happen after all + * unallocated-encoding checks (otherwise the syndrome information + * for the resulting exception will be incorrect). + */ +static bool fp_access_check_only(DisasContext *s) +{ + if (s->fp_excp_el) { + assert(!s->fp_access_checked); + s->fp_access_checked = true; + + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false, 0), + s->fp_excp_el); + return false; + } + s->fp_access_checked = true; + return true; +} + +static bool fp_access_check(DisasContext *s) +{ + if (!fp_access_check_only(s)) { + return false; + } + if (s->sme_trap_nonstreaming && s->is_nonstreaming) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_Streaming, false)); + return false; + } + return true; +} + +/* + * Check that SVE access is enabled. If it is, return true. + * If not, emit code to generate an appropriate exception and return false. + * This function corresponds to CheckSVEEnabled(). + */ +bool sve_access_check(DisasContext *s) +{ + if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) { + assert(dc_isar_feature(aa64_sme, s)); + if (!sme_sm_enabled_check(s)) { + goto fail_exit; + } + } else if (s->sve_excp_el) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_sve_access_trap(), s->sve_excp_el); + goto fail_exit; + } + s->sve_access_checked = true; + return fp_access_check(s); + + fail_exit: + /* Assert that we only raise one exception per instruction. */ + assert(!s->sve_access_checked); + s->sve_access_checked = true; + return false; +} + +/* + * Check that SME access is enabled, raise an exception if not. + * Note that this function corresponds to CheckSMEAccess and is + * only used directly for cpregs. + */ +static bool sme_access_check(DisasContext *s) +{ + if (s->sme_excp_el) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_AccessTrap, false), + s->sme_excp_el); + return false; + } + return true; +} + +/* This function corresponds to CheckSMEEnabled. */ +bool sme_enabled_check(DisasContext *s) +{ + /* + * Note that unlike sve_excp_el, we have not constrained sme_excp_el + * to be zero when fp_excp_el has priority. This is because we need + * sme_excp_el by itself for cpregs access checks. + */ + if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) { + s->fp_access_checked = true; + return sme_access_check(s); + } + return fp_access_check_only(s); +} + +/* Common subroutine for CheckSMEAnd*Enabled. */ +bool sme_enabled_check_with_svcr(DisasContext *s, unsigned req) +{ + if (!sme_enabled_check(s)) { + return false; + } + if (FIELD_EX64(req, SVCR, SM) && !s->pstate_sm) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_NotStreaming, false)); + return false; + } + if (FIELD_EX64(req, SVCR, ZA) && !s->pstate_za) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_InactiveZA, false)); + return false; + } + return true; +} + +/* + * This utility function is for doing register extension with an + * optional shift. You will likely want to pass a temporary for the + * destination register. See DecodeRegExtend() in the ARM ARM. 
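+ *
+ * For example (illustrative), option == 0b010 (UXTW) with shift == 2
+ * yields tcg_out = (tcg_in & 0xffffffff) << 2, the form used for
+ * LDR (register) offset address arithmetic.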
+ */
+static void ext_and_shift_reg(TCGv_i64 tcg_out, TCGv_i64 tcg_in,
+                              int option, unsigned int shift)
+{
+    int extsize = extract32(option, 0, 2);
+    bool is_signed = extract32(option, 2, 1);
+
+    if (is_signed) {
+        switch (extsize) {
+        case 0:
+            tcg_gen_ext8s_i64(tcg_out, tcg_in);
+            break;
+        case 1:
+            tcg_gen_ext16s_i64(tcg_out, tcg_in);
+            break;
+        case 2:
+            tcg_gen_ext32s_i64(tcg_out, tcg_in);
+            break;
+        case 3:
+            tcg_gen_mov_i64(tcg_out, tcg_in);
+            break;
+        }
+    } else {
+        switch (extsize) {
+        case 0:
+            tcg_gen_ext8u_i64(tcg_out, tcg_in);
+            break;
+        case 1:
+            tcg_gen_ext16u_i64(tcg_out, tcg_in);
+            break;
+        case 2:
+            tcg_gen_ext32u_i64(tcg_out, tcg_in);
+            break;
+        case 3:
+            tcg_gen_mov_i64(tcg_out, tcg_in);
+            break;
+        }
+    }
+
+    if (shift) {
+        tcg_gen_shli_i64(tcg_out, tcg_out, shift);
+    }
+}
+
+static inline void gen_check_sp_alignment(DisasContext *s)
+{
+    /* The AArch64 architecture mandates that (if enabled via PSTATE
+     * or SCTLR bits) there is a check that SP is 16-aligned on every
+     * SP-relative load or store (with an exception generated if it is not).
+     * In line with general QEMU practice regarding misaligned accesses,
+     * we omit these checks for the sake of guest program performance.
+     * This function is provided as a hook so we can more easily add these
+     * checks in future (possibly as a "favour catching guest program bugs
+     * over speed" user selectable option).
+     */
+}
+
+/*
+ * This provides a simple table-based lookup decoder. It is
+ * intended to be used when the relevant bits for decode are too
+ * awkwardly placed and switch/if based logic would be confusing and
+ * deeply nested. Since it's a linear search through the table, tables
+ * should be kept small.
+ *
+ * It returns the first handler where insn & mask == pattern, or
+ * NULL if there is no match.
+ * The table is terminated by an empty mask (i.e. 0)
+ */
+static inline AArch64DecodeFn *lookup_disas_fn(const AArch64DecodeTable *table,
+                                               uint32_t insn)
+{
+    const AArch64DecodeTable *tptr = table;
+
+    while (tptr->mask) {
+        if ((insn & tptr->mask) == tptr->pattern) {
+            return tptr->disas_fn;
+        }
+        tptr++;
+    }
+    return NULL;
+}
+
+/*
+ * The instruction disassembly implemented here matches
+ * the instruction encoding classifications in chapter C4
+ * of the ARM Architecture Reference Manual (DDI0487B_a);
+ * classification names and decode diagrams here should generally
+ * match up with those in the manual.
+ */ + +/* Unconditional branch (immediate) + * 31 30 26 25 0 + * +----+-----------+-------------------------------------+ + * | op | 0 0 1 0 1 | imm26 | + * +----+-----------+-------------------------------------+ + */ +static void disas_uncond_b_imm(DisasContext *s, uint32_t insn) +{ + int64_t diff = sextract32(insn, 0, 26) * 4; + + if (insn & (1U << 31)) { + /* BL Branch with link */ + gen_pc_plus_diff(s, cpu_reg(s, 30), curr_insn_len(s)); + } + + /* B Branch / BL Branch with link */ + reset_btype(s); + gen_goto_tb(s, 0, diff); +} + +/* Compare and branch (immediate) + * 31 30 25 24 23 5 4 0 + * +----+-------------+----+---------------------+--------+ + * | sf | 0 1 1 0 1 0 | op | imm19 | Rt | + * +----+-------------+----+---------------------+--------+ + */ +static void disas_comp_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, rt; + int64_t diff; + DisasLabel match; + TCGv_i64 tcg_cmp; + + sf = extract32(insn, 31, 1); + op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */ + rt = extract32(insn, 0, 5); + diff = sextract32(insn, 5, 19) * 4; + + tcg_cmp = read_cpu_reg(s, rt, sf); + reset_btype(s); + + match = gen_disas_label(s); + tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ, + tcg_cmp, 0, match.label); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); +} + +/* Test and branch (immediate) + * 31 30 25 24 23 19 18 5 4 0 + * +----+-------------+----+-------+-------------+------+ + * | b5 | 0 1 1 0 1 1 | op | b40 | imm14 | Rt | + * +----+-------------+----+-------+-------------+------+ + */ +static void disas_test_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int bit_pos, op, rt; + int64_t diff; + DisasLabel match; + TCGv_i64 tcg_cmp; + + bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5); + op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */ + diff = sextract32(insn, 5, 14) * 4; + rt = extract32(insn, 0, 5); + + tcg_cmp = tcg_temp_new_i64(); + tcg_gen_andi_i64(tcg_cmp, cpu_reg(s, rt), (1ULL << bit_pos)); + + reset_btype(s); + + match = gen_disas_label(s); + tcg_gen_brcondi_i64(op ? 
TCG_COND_NE : TCG_COND_EQ, + tcg_cmp, 0, match.label); + tcg_temp_free_i64(tcg_cmp); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); +} + +/* Conditional branch (immediate) + * 31 25 24 23 5 4 3 0 + * +---------------+----+---------------------+----+------+ + * | 0 1 0 1 0 1 0 | o1 | imm19 | o0 | cond | + * +---------------+----+---------------------+----+------+ + */ +static void disas_cond_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int cond; + int64_t diff; + + if ((insn & (1 << 4)) || (insn & (1 << 24))) { + unallocated_encoding(s); + return; + } + diff = sextract32(insn, 5, 19) * 4; + cond = extract32(insn, 0, 4); + + reset_btype(s); + if (cond < 0x0e) { + /* genuinely conditional branches */ + DisasLabel match = gen_disas_label(s); + arm_gen_test_cc(cond, match.label); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); + } else { + /* 0xe and 0xf are both "always" conditions */ + gen_goto_tb(s, 0, diff); + } +} + +/* HINT instruction group, including various allocated HINTs */ +static void handle_hint(DisasContext *s, uint32_t insn, + unsigned int op1, unsigned int op2, unsigned int crm) +{ + unsigned int selector = crm << 3 | op2; + + if (op1 != 3) { + unallocated_encoding(s); + return; + } + + switch (selector) { + case 0b00000: /* NOP */ + break; + case 0b00011: /* WFI */ + s->base.is_jmp = DISAS_WFI; + break; + case 0b00001: /* YIELD */ + /* When running in MTTCG we don't generate jumps to the yield and + * WFE helpers as it won't affect the scheduling of other vCPUs. + * If we wanted to more completely model WFE/SEV so we don't busy + * spin unnecessarily we would need to do something more involved. + */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + s->base.is_jmp = DISAS_YIELD; + } + break; + case 0b00010: /* WFE */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + s->base.is_jmp = DISAS_WFE; + } + break; + case 0b00100: /* SEV */ + case 0b00101: /* SEVL */ + case 0b00110: /* DGH */ + /* we treat all as NOP at least for now */ + break; + case 0b00111: /* XPACLRI */ + if (s->pauth_active) { + gen_helper_xpaci(cpu_X[30], cpu_env, cpu_X[30]); + } + break; + case 0b01000: /* PACIA1716 */ + if (s->pauth_active) { + gen_helper_pacia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01010: /* PACIB1716 */ + if (s->pauth_active) { + gen_helper_pacib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01100: /* AUTIA1716 */ + if (s->pauth_active) { + gen_helper_autia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01110: /* AUTIB1716 */ + if (s->pauth_active) { + gen_helper_autib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b10000: /* ESB */ + /* Without RAS, we must implement this as NOP. */ + if (dc_isar_feature(aa64_ras, s)) { + /* + * QEMU does not have a source of physical SErrors, + * so we are only concerned with virtual SErrors. + * The pseudocode in the ARM for this case is + * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then + * AArch64.vESBOperation(); + * Most of the condition can be evaluated at translation time. + * Test for EL2 present, and defer test for SEL2 to runtime. 
+             */
+            if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) {
+                gen_helper_vesb(cpu_env);
+            }
+        }
+        break;
+    case 0b11000: /* PACIAZ */
+        if (s->pauth_active) {
+            gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30],
+                             new_tmp_a64_zero(s));
+        }
+        break;
+    case 0b11001: /* PACIASP */
+        if (s->pauth_active) {
+            gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+        }
+        break;
+    case 0b11010: /* PACIBZ */
+        if (s->pauth_active) {
+            gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30],
+                             new_tmp_a64_zero(s));
+        }
+        break;
+    case 0b11011: /* PACIBSP */
+        if (s->pauth_active) {
+            gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+        }
+        break;
+    case 0b11100: /* AUTIAZ */
+        if (s->pauth_active) {
+            gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30],
+                             new_tmp_a64_zero(s));
+        }
+        break;
+    case 0b11101: /* AUTIASP */
+        if (s->pauth_active) {
+            gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+        }
+        break;
+    case 0b11110: /* AUTIBZ */
+        if (s->pauth_active) {
+            gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30],
+                             new_tmp_a64_zero(s));
+        }
+        break;
+    case 0b11111: /* AUTIBSP */
+        if (s->pauth_active) {
+            gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]);
+        }
+        break;
+    default:
+        /* default specified as NOP equivalent */
+        break;
+    }
+}
+
+static void gen_clrex(DisasContext *s, uint32_t insn)
+{
+    tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+}
+
+/* CLREX, DSB, DMB, ISB */
+static void handle_sync(DisasContext *s, uint32_t insn,
+                        unsigned int op1, unsigned int op2, unsigned int crm)
+{
+    TCGBar bar;
+
+    if (op1 != 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (op2) {
+    case 2: /* CLREX */
+        gen_clrex(s, insn);
+        return;
+    case 4: /* DSB */
+    case 5: /* DMB */
+        switch (crm & 3) {
+        case 1: /* MBReqTypes_Reads */
+            bar = TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST;
+            break;
+        case 2: /* MBReqTypes_Writes */
+            bar = TCG_BAR_SC | TCG_MO_ST_ST;
+            break;
+        default: /* MBReqTypes_All */
+            bar = TCG_BAR_SC | TCG_MO_ALL;
+            break;
+        }
+        tcg_gen_mb(bar);
+        return;
+    case 6: /* ISB */
+        /* We need to break the TB after this insn to execute
+         * self-modifying code correctly and also to take
+         * any pending interrupts immediately.
+         */
+        reset_btype(s);
+        gen_goto_tb(s, 0, 4);
+        return;
+
+    case 7: /* SB */
+        if (crm != 0 || !dc_isar_feature(aa64_sb, s)) {
+            goto do_unallocated;
+        }
+        /*
+         * TODO: There is no speculation barrier opcode for TCG;
+         * MB and end the TB instead.
+         */
+        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+        gen_goto_tb(s, 0, 4);
+        return;
+
+    default:
+    do_unallocated:
+        unallocated_encoding(s);
+        return;
+    }
+}
+
+static void gen_xaflag(void)
+{
+    TCGv_i32 z = tcg_temp_new_i32();
+
+    tcg_gen_setcondi_i32(TCG_COND_EQ, z, cpu_ZF, 0);
+
+    /*
+     * (!C & !Z) << 31
+     * (!(C | Z)) << 31
+     * ~((C | Z) << 31)
+     * ~-(C | Z)
+     * (C | Z) - 1
+     */
+    tcg_gen_or_i32(cpu_NF, cpu_CF, z);
+    tcg_gen_subi_i32(cpu_NF, cpu_NF, 1);
+
+    /* !(Z & C) */
+    tcg_gen_and_i32(cpu_ZF, z, cpu_CF);
+    tcg_gen_xori_i32(cpu_ZF, cpu_ZF, 1);
+
+    /* (!C & Z) << 31 -> -(Z & ~C) */
+    tcg_gen_andc_i32(cpu_VF, z, cpu_CF);
+    tcg_gen_neg_i32(cpu_VF, cpu_VF);
+
+    /* C | Z */
+    tcg_gen_or_i32(cpu_CF, cpu_CF, z);
+
+    tcg_temp_free_i32(z);
+}
+
+static void gen_axflag(void)
+{
+    tcg_gen_sari_i32(cpu_VF, cpu_VF, 31);         /* V ?
-1 : 0 */ + tcg_gen_andc_i32(cpu_CF, cpu_CF, cpu_VF); /* C & !V */ + + /* !(Z | V) -> !(!ZF | V) -> ZF & !V -> ZF & ~VF */ + tcg_gen_andc_i32(cpu_ZF, cpu_ZF, cpu_VF); + + tcg_gen_movi_i32(cpu_NF, 0); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* MSR (immediate) - move immediate to processor state field */ +static void handle_msr_i(DisasContext *s, uint32_t insn, + unsigned int op1, unsigned int op2, unsigned int crm) +{ + int op = op1 << 3 | op2; + + /* End the TB by default, chaining is ok. */ + s->base.is_jmp = DISAS_TOO_MANY; + + switch (op) { + case 0x00: /* CFINV */ + if (crm != 0 || !dc_isar_feature(aa64_condm_4, s)) { + goto do_unallocated; + } + tcg_gen_xori_i32(cpu_CF, cpu_CF, 1); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x01: /* XAFlag */ + if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { + goto do_unallocated; + } + gen_xaflag(); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x02: /* AXFlag */ + if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { + goto do_unallocated; + } + gen_axflag(); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x03: /* UAO */ + if (!dc_isar_feature(aa64_uao, s) || s->current_el == 0) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_UAO); + } else { + clear_pstate_bits(PSTATE_UAO); + } + gen_rebuild_hflags(s); + break; + + case 0x04: /* PAN */ + if (!dc_isar_feature(aa64_pan, s) || s->current_el == 0) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_PAN); + } else { + clear_pstate_bits(PSTATE_PAN); + } + gen_rebuild_hflags(s); + break; + + case 0x05: /* SPSel */ + if (s->current_el == 0) { + goto do_unallocated; + } + gen_helper_msr_i_spsel(cpu_env, tcg_constant_i32(crm & PSTATE_SP)); + break; + + case 0x19: /* SSBS */ + if (!dc_isar_feature(aa64_ssbs, s)) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_SSBS); + } else { + clear_pstate_bits(PSTATE_SSBS); + } + /* Don't need to rebuild hflags since SSBS is a nop */ + break; + + case 0x1a: /* DIT */ + if (!dc_isar_feature(aa64_dit, s)) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_DIT); + } else { + clear_pstate_bits(PSTATE_DIT); + } + /* There's no need to rebuild hflags because DIT is a nop */ + break; + + case 0x1e: /* DAIFSet */ + gen_helper_msr_i_daifset(cpu_env, tcg_constant_i32(crm)); + break; + + case 0x1f: /* DAIFClear */ + gen_helper_msr_i_daifclear(cpu_env, tcg_constant_i32(crm)); + /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + break; + + case 0x1c: /* TCO */ + if (dc_isar_feature(aa64_mte, s)) { + /* Full MTE is enabled -- set the TCO bit as directed. */ + if (crm & 1) { + set_pstate_bits(PSTATE_TCO); + } else { + clear_pstate_bits(PSTATE_TCO); + } + gen_rebuild_hflags(s); + /* Many factors, including TCO, go into MTE_ACTIVE. */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } else if (dc_isar_feature(aa64_mte_insn_reg, s)) { + /* Only "instructions accessible at EL0" -- PSTATE.TCO is WI. */ + s->base.is_jmp = DISAS_NEXT; + } else { + goto do_unallocated; + } + break; + + case 0x1b: /* SVCR* */ + if (!dc_isar_feature(aa64_sme, s) || crm < 2 || crm > 7) { + goto do_unallocated; + } + if (sme_access_check(s)) { + int old = s->pstate_sm | (s->pstate_za << 1); + int new = (crm & 1) * 3; + int msk = (crm >> 1) & 3; + + if ((old ^ new) & msk) { + /* At least one bit changes. 
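+ * For example (illustrative), SMSTART (MSR SVCRSMZA, #1) arrives
+ * here with crm == 0b0111, giving msk == 0b11 and new == 0b11, i.e.
+ * both SM and ZA are being switched on.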
*/ + gen_helper_set_svcr(cpu_env, tcg_constant_i32(new), + tcg_constant_i32(msk)); + } else { + s->base.is_jmp = DISAS_NEXT; + } + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } +} + +static void gen_get_nzcv(TCGv_i64 tcg_rt) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 nzcv = tcg_temp_new_i32(); + + /* build bit 31, N */ + tcg_gen_andi_i32(nzcv, cpu_NF, (1U << 31)); + /* build bit 30, Z */ + tcg_gen_setcondi_i32(TCG_COND_EQ, tmp, cpu_ZF, 0); + tcg_gen_deposit_i32(nzcv, nzcv, tmp, 30, 1); + /* build bit 29, C */ + tcg_gen_deposit_i32(nzcv, nzcv, cpu_CF, 29, 1); + /* build bit 28, V */ + tcg_gen_shri_i32(tmp, cpu_VF, 31); + tcg_gen_deposit_i32(nzcv, nzcv, tmp, 28, 1); + /* generate result */ + tcg_gen_extu_i32_i64(tcg_rt, nzcv); + + tcg_temp_free_i32(nzcv); + tcg_temp_free_i32(tmp); +} + +static void gen_set_nzcv(TCGv_i64 tcg_rt) +{ + TCGv_i32 nzcv = tcg_temp_new_i32(); + + /* take NZCV from R[t] */ + tcg_gen_extrl_i64_i32(nzcv, tcg_rt); + + /* bit 31, N */ + tcg_gen_andi_i32(cpu_NF, nzcv, (1U << 31)); + /* bit 30, Z */ + tcg_gen_andi_i32(cpu_ZF, nzcv, (1 << 30)); + tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_ZF, cpu_ZF, 0); + /* bit 29, C */ + tcg_gen_andi_i32(cpu_CF, nzcv, (1 << 29)); + tcg_gen_shri_i32(cpu_CF, cpu_CF, 29); + /* bit 28, V */ + tcg_gen_andi_i32(cpu_VF, nzcv, (1 << 28)); + tcg_gen_shli_i32(cpu_VF, cpu_VF, 3); + tcg_temp_free_i32(nzcv); +} + +static void gen_sysreg_undef(DisasContext *s, bool isread, + uint8_t op0, uint8_t op1, uint8_t op2, + uint8_t crn, uint8_t crm, uint8_t rt) +{ + /* + * Generate code to emit an UNDEF with correct syndrome + * information for a failed system register access. + * This is EC_UNCATEGORIZED (ie a standard UNDEF) in most cases, + * but if FEAT_IDST is implemented then read accesses to registers + * in the feature ID space are reported with the EC_SYSTEMREGISTERTRAP + * syndrome. + */ + uint32_t syndrome; + + if (isread && dc_isar_feature(aa64_ids, s) && + arm_cpreg_encoding_in_idspace(op0, op1, op2, crn, crm)) { + syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); + } else { + syndrome = syn_uncategorized(); + } + gen_exception_insn(s, 0, EXCP_UDEF, syndrome); +} + +/* MRS - move from system register + * MSR (register) - move to system register + * SYS + * SYSL + * These are all essentially the same insn in 'read' and 'write' + * versions, with varying op0 fields. + */ +static void handle_sys(DisasContext *s, uint32_t insn, bool isread, + unsigned int op0, unsigned int op1, unsigned int op2, + unsigned int crn, unsigned int crm, unsigned int rt) +{ + uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP, + crn, crm, op0, op1, op2); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); + TCGv_ptr tcg_ri = NULL; + TCGv_i64 tcg_rt; + + if (!ri) { + /* Unknown register; this might be a guest error or a QEMU + * unimplemented feature. + */ + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch64 " + "system register op0:%d op1:%d crn:%d crm:%d op2:%d\n", + isread ? "read" : "write", op0, op1, crn, crm, op2); + gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); + return; + } + + /* Check access permissions */ + if (!cp_access_ok(s->current_el, ri, isread)) { + gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); + return; + } + + if (ri->accessfn || (ri->fgt && s->fgt_active)) { + /* Emit code to perform further access permissions checks at + * runtime; this may result in an exception. 
+ */ + uint32_t syndrome; + + syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); + gen_a64_update_pc(s, 0); + tcg_ri = tcg_temp_new_ptr(); + gen_helper_access_check_cp_reg(tcg_ri, cpu_env, + tcg_constant_i32(key), + tcg_constant_i32(syndrome), + tcg_constant_i32(isread)); + } else if (ri->type & ARM_CP_RAISES_EXC) { + /* + * The readfn or writefn might raise an exception; + * synchronize the CPU state in case it does. + */ + gen_a64_update_pc(s, 0); + } + + /* Handle special cases first */ + switch (ri->type & ARM_CP_SPECIAL_MASK) { + case 0: + break; + case ARM_CP_NOP: + goto exit; + case ARM_CP_NZCV: + tcg_rt = cpu_reg(s, rt); + if (isread) { + gen_get_nzcv(tcg_rt); + } else { + gen_set_nzcv(tcg_rt); + } + goto exit; + case ARM_CP_CURRENTEL: + /* Reads as current EL value from pstate, which is + * guaranteed to be constant by the tb flags. + */ + tcg_rt = cpu_reg(s, rt); + tcg_gen_movi_i64(tcg_rt, s->current_el << 2); + goto exit; + case ARM_CP_DC_ZVA: + /* Writes clear the aligned block of memory which rt points into. */ + if (s->mte_active[0]) { + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + + tcg_rt = new_tmp_a64(s); + gen_helper_mte_check_zva(tcg_rt, cpu_env, + tcg_constant_i32(desc), cpu_reg(s, rt)); + } else { + tcg_rt = clean_data_tbi(s, cpu_reg(s, rt)); + } + gen_helper_dc_zva(cpu_env, tcg_rt); + goto exit; + case ARM_CP_DC_GVA: + { + TCGv_i64 clean_addr, tag; + + /* + * DC_GVA, like DC_ZVA, requires that we supply the original + * pointer for an invalid page. Probe that address first. + */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_probe_access(s, clean_addr, MMU_DATA_STORE, MO_8); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. */ + tag = tcg_temp_new_i64(); + tcg_gen_shri_i64(tag, tcg_rt, 56); + gen_helper_stzgm_tags(cpu_env, clean_addr, tag); + tcg_temp_free_i64(tag); + } + } + goto exit; + case ARM_CP_DC_GZVA: + { + TCGv_i64 clean_addr, tag; + + /* For DC_GZVA, we can rely on DC_ZVA for the proper fault. */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_helper_dc_zva(cpu_env, clean_addr); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. 
*/
+                tag = tcg_temp_new_i64();
+                tcg_gen_shri_i64(tag, tcg_rt, 56);
+                gen_helper_stzgm_tags(cpu_env, clean_addr, tag);
+                tcg_temp_free_i64(tag);
+            }
+        }
+        goto exit;
+    default:
+        g_assert_not_reached();
+    }
+    if ((ri->type & ARM_CP_FPU) && !fp_access_check_only(s)) {
+        goto exit;
+    } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) {
+        goto exit;
+    } else if ((ri->type & ARM_CP_SME) && !sme_access_check(s)) {
+        goto exit;
+    }
+
+    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+        gen_io_start();
+    }
+
+    tcg_rt = cpu_reg(s, rt);
+
+    if (isread) {
+        if (ri->type & ARM_CP_CONST) {
+            tcg_gen_movi_i64(tcg_rt, ri->resetvalue);
+        } else if (ri->readfn) {
+            if (!tcg_ri) {
+                tcg_ri = gen_lookup_cp_reg(key);
+            }
+            gen_helper_get_cp_reg64(tcg_rt, cpu_env, tcg_ri);
+        } else {
+            tcg_gen_ld_i64(tcg_rt, cpu_env, ri->fieldoffset);
+        }
+    } else {
+        if (ri->type & ARM_CP_CONST) {
+            /* If not forbidden by access permissions, treat as WI */
+            goto exit;
+        } else if (ri->writefn) {
+            if (!tcg_ri) {
+                tcg_ri = gen_lookup_cp_reg(key);
+            }
+            gen_helper_set_cp_reg64(cpu_env, tcg_ri, tcg_rt);
+        } else {
+            tcg_gen_st_i64(tcg_rt, cpu_env, ri->fieldoffset);
+        }
+    }
+
+    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+        /* I/O operations must end the TB here (whether read or write) */
+        s->base.is_jmp = DISAS_UPDATE_EXIT;
+    }
+    if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+        /*
+         * A write to any coprocessor register that ends a TB
+         * must rebuild the hflags for the next TB.
+         */
+        gen_rebuild_hflags(s);
+        /*
+         * We default to ending the TB on a coprocessor register write,
+         * but allow this to be suppressed by the register definition
+         * (usually only necessary to work around guest bugs).
+         */
+        s->base.is_jmp = DISAS_UPDATE_EXIT;
+    }
+
+ exit:
+    if (tcg_ri) {
+        tcg_temp_free_ptr(tcg_ri);
+    }
+}
+
+/* System
+ *  31                 22 21  20 19 18 16 15   12 11    8 7   5 4    0
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ * | 1 1 0 1 0 1 0 1 0 0 | L | op0 | op1 |  CRn  |  CRm  | op2 |  Rt  |
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ */
+static void disas_system(DisasContext *s, uint32_t insn)
+{
+    unsigned int l, op0, op1, crn, crm, op2, rt;
+    l = extract32(insn, 21, 1);
+    op0 = extract32(insn, 19, 2);
+    op1 = extract32(insn, 16, 3);
+    crn = extract32(insn, 12, 4);
+    crm = extract32(insn, 8, 4);
+    op2 = extract32(insn, 5, 3);
+    rt = extract32(insn, 0, 5);
+
+    if (op0 == 0) {
+        if (l || rt != 31) {
+            unallocated_encoding(s);
+            return;
+        }
+        switch (crn) {
+        case 2: /* HINT (including allocated hints like NOP, YIELD, etc) */
+            handle_hint(s, insn, op1, op2, crm);
+            break;
+        case 3: /* CLREX, DSB, DMB, ISB */
+            handle_sync(s, insn, op1, op2, crm);
+            break;
+        case 4: /* MSR (immediate) */
+            handle_msr_i(s, insn, op1, op2, crm);
+            break;
+        default:
+            unallocated_encoding(s);
+            break;
+        }
+        return;
+    }
+    handle_sys(s, insn, l, op0, op1, op2, crn, crm, rt);
+}
+
+/* Exception generation
+ *
+ *  31             24 23 21 20                     5 4   2 1  0
+ * +-----------------+-----+------------------------+-----+----+
+ * | 1 1 0 1 0 1 0 0 | opc |          imm16         | op2 | LL |
+ * +-----------------------+------------------------+----------+
+ */
+static void disas_exc(DisasContext *s, uint32_t insn)
+{
+    int opc = extract32(insn, 21, 3);
+    int op2_ll = extract32(insn, 0, 5);
+    int imm16 = extract32(insn, 5, 16);
+    uint32_t syndrome;
+
+    switch (opc) {
+    case 0:
+        /* For SVC, HVC and SMC we advance the single-step state
+         * machine before taking the exception. This is architecturally
+         * mandated, to ensure that single-stepping a system call
+         * instruction works properly.
+         */
+        switch (op2_ll) {
+        case 1: /* SVC */
+            syndrome = syn_aa64_svc(imm16);
+            if (s->fgt_svc) {
+                gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2);
+                break;
+            }
+            gen_ss_advance(s);
+            gen_exception_insn(s, 4, EXCP_SWI, syndrome);
+            break;
+        case 2: /* HVC */
+            if (s->current_el == 0) {
+                unallocated_encoding(s);
+                break;
+            }
+            /* The pre HVC helper handles cases when HVC gets trapped
+             * as an undefined insn by runtime configuration.
+             */
+            gen_a64_update_pc(s, 0);
+            gen_helper_pre_hvc(cpu_env);
+            gen_ss_advance(s);
+            gen_exception_insn_el(s, 4, EXCP_HVC, syn_aa64_hvc(imm16), 2);
+            break;
+        case 3: /* SMC */
+            if (s->current_el == 0) {
+                unallocated_encoding(s);
+                break;
+            }
+            gen_a64_update_pc(s, 0);
+            gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa64_smc(imm16)));
+            gen_ss_advance(s);
+            gen_exception_insn_el(s, 4, EXCP_SMC, syn_aa64_smc(imm16), 3);
+            break;
+        default:
+            unallocated_encoding(s);
+            break;
+        }
+        break;
+    case 1:
+        if (op2_ll != 0) {
+            unallocated_encoding(s);
+            break;
+        }
+        /* BRK */
+        gen_exception_bkpt_insn(s, syn_aa64_bkpt(imm16));
+        break;
+    case 2:
+        if (op2_ll != 0) {
+            unallocated_encoding(s);
+            break;
+        }
+        /* HLT. This has two purposes.
+         * Architecturally, it is an external halting debug instruction.
+         * Since QEMU doesn't implement external debug, we treat this as
+         * required when halting debug is disabled: it will UNDEF.
+         * Secondly, "HLT 0xf000" is the A64 semihosting syscall instruction.
+         */
+        if (semihosting_enabled(s->current_el == 0) && imm16 == 0xf000) {
+            gen_exception_internal_insn(s, EXCP_SEMIHOST);
+        } else {
+            unallocated_encoding(s);
+        }
+        break;
+    case 5:
+        if (op2_ll < 1 || op2_ll > 3) {
+            unallocated_encoding(s);
+            break;
+        }
+        /* DCPS1, DCPS2, DCPS3 */
+        unallocated_encoding(s);
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* Unconditional branch (register)
+ *  31           25 24   21 20   16 15   10 9    5 4     0
+ * +---------------+-------+-------+-------+------+-------+
+ * | 1 1 0 1 0 1 1 |  opc  |  op2  |  op3  |  Rn  |  op4  |
+ * +---------------+-------+-------+-------+------+-------+
+ */
+static void disas_uncond_b_reg(DisasContext *s, uint32_t insn)
+{
+    unsigned int opc, op2, op3, rn, op4;
+    unsigned btype_mod = 2;   /* 0: BR, 1: BLR, 2: other */
+    TCGv_i64 dst;
+    TCGv_i64 modifier;
+
+    opc = extract32(insn, 21, 4);
+    op2 = extract32(insn, 16, 5);
+    op3 = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    op4 = extract32(insn, 0, 5);
+
+    if (op2 != 0x1f) {
+        goto do_unallocated;
+    }
+
+    switch (opc) {
+    case 0: /* BR */
+    case 1: /* BLR */
+    case 2: /* RET */
+        btype_mod = opc;
+        switch (op3) {
+        case 0:
+            /* BR, BLR, RET */
+            if (op4 != 0) {
+                goto do_unallocated;
+            }
+            dst = cpu_reg(s, rn);
+            break;
+
+        case 2:
+        case 3:
+            if (!dc_isar_feature(aa64_pauth, s)) {
+                goto do_unallocated;
+            }
+            if (opc == 2) {
+                /* RETAA, RETAB */
+                if (rn != 0x1f || op4 != 0x1f) {
+                    goto do_unallocated;
+                }
+                rn = 30;
+                modifier = cpu_X[31];
+            } else {
+                /* BRAAZ, BRABZ, BLRAAZ, BLRABZ */
+                if (op4 != 0x1f) {
+                    goto do_unallocated;
+                }
+                modifier = new_tmp_a64_zero(s);
+            }
+            if (s->pauth_active) {
+                dst = new_tmp_a64(s);
+                if (op3 == 2) {
+                    gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier);
+                } else {
+                    gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier);
+                }
+            } else {
+                dst = cpu_reg(s, rn);
+            }
+            break;
+
+        default:
+            goto do_unallocated;
+        }
+        /* BLR also needs to load return address */
+        if (opc == 1) {
+            TCGv_i64
lr = cpu_reg(s, 30); + if (dst == lr) { + TCGv_i64 tmp = new_tmp_a64(s); + tcg_gen_mov_i64(tmp, dst); + dst = tmp; + } + gen_pc_plus_diff(s, lr, curr_insn_len(s)); + } + gen_a64_set_pc(s, dst); + break; + + case 8: /* BRAA */ + case 9: /* BLRAA */ + if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + if ((op3 & ~1) != 2) { + goto do_unallocated; + } + btype_mod = opc & 1; + if (s->pauth_active) { + dst = new_tmp_a64(s); + modifier = cpu_reg_sp(s, op4); + if (op3 == 2) { + gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier); + } else { + gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier); + } + } else { + dst = cpu_reg(s, rn); + } + /* BLRAA also needs to load return address */ + if (opc == 9) { + TCGv_i64 lr = cpu_reg(s, 30); + if (dst == lr) { + TCGv_i64 tmp = new_tmp_a64(s); + tcg_gen_mov_i64(tmp, dst); + dst = tmp; + } + gen_pc_plus_diff(s, lr, curr_insn_len(s)); + } + gen_a64_set_pc(s, dst); + break; + + case 4: /* ERET */ + if (s->current_el == 0) { + goto do_unallocated; + } + switch (op3) { + case 0: /* ERET */ + if (op4 != 0) { + goto do_unallocated; + } + if (s->fgt_eret) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); + return; + } + dst = tcg_temp_new_i64(); + tcg_gen_ld_i64(dst, cpu_env, + offsetof(CPUARMState, elr_el[s->current_el])); + break; + + case 2: /* ERETAA */ + case 3: /* ERETAB */ + if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + if (rn != 0x1f || op4 != 0x1f) { + goto do_unallocated; + } + /* The FGT trap takes precedence over an auth trap. */ + if (s->fgt_eret) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); + return; + } + dst = tcg_temp_new_i64(); + tcg_gen_ld_i64(dst, cpu_env, + offsetof(CPUARMState, elr_el[s->current_el])); + if (s->pauth_active) { + modifier = cpu_X[31]; + if (op3 == 2) { + gen_helper_autia(dst, cpu_env, dst, modifier); + } else { + gen_helper_autib(dst, cpu_env, dst, modifier); + } + } + break; + + default: + goto do_unallocated; + } + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + + gen_helper_exception_return(cpu_env, dst); + tcg_temp_free_i64(dst); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; + return; + + case 5: /* DRPS */ + if (op3 != 0 || op4 != 0 || rn != 0x1f) { + goto do_unallocated; + } else { + unallocated_encoding(s); + } + return; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + + switch (btype_mod) { + case 0: /* BR */ + if (dc_isar_feature(aa64_bti, s)) { + /* BR to {x16,x17} or !guard -> 1, else 3. */ + set_btype(s, rn == 16 || rn == 17 || !s->guarded_page ? 1 : 3); + } + break; + + case 1: /* BLR */ + if (dc_isar_feature(aa64_bti, s)) { + /* BLR sets BTYPE to 2, regardless of source guarded page. */ + set_btype(s, 2); + } + break; + + default: /* RET or none of the above. */ + /* BTYPE will be set to 0 by normal end-of-insn processing. 
*/ + break; + } + + s->base.is_jmp = DISAS_JUMP; +} + +/* Branches, exception generating and system instructions */ +static void disas_b_exc_sys(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 25, 7)) { + case 0x0a: case 0x0b: + case 0x4a: case 0x4b: /* Unconditional branch (immediate) */ + disas_uncond_b_imm(s, insn); + break; + case 0x1a: case 0x5a: /* Compare & branch (immediate) */ + disas_comp_b_imm(s, insn); + break; + case 0x1b: case 0x5b: /* Test & branch (immediate) */ + disas_test_b_imm(s, insn); + break; + case 0x2a: /* Conditional branch (immediate) */ + disas_cond_b_imm(s, insn); + break; + case 0x6a: /* Exception generation / System */ + if (insn & (1 << 24)) { + if (extract32(insn, 22, 2) == 0) { + disas_system(s, insn); + } else { + unallocated_encoding(s); + } + } else { + disas_exc(s, insn); + } + break; + case 0x6b: /* Unconditional branch (register) */ + disas_uncond_b_reg(s, insn); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* + * Load/Store exclusive instructions are implemented by remembering + * the value/address loaded, and seeing if these are the same + * when the store is performed. This is not actually the architecturally + * mandated semantics, but it works for typical guest code sequences + * and avoids having to monitor regular stores. + * + * The store exclusive uses the atomic cmpxchg primitives to avoid + * races in multi-threaded linux-user and when MTTCG softmmu is + * enabled. + */ +static void gen_load_exclusive(DisasContext *s, int rt, int rt2, + TCGv_i64 addr, int size, bool is_pair) +{ + int idx = get_mem_index(s); + MemOp memop = s->be_data; + + g_assert(size <= 3); + if (is_pair) { + g_assert(size >= 2); + if (size == 2) { + /* The pair must be single-copy atomic for the doubleword. */ + memop |= MO_64 | MO_ALIGN; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + if (s->be_data == MO_LE) { + tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 0, 32); + tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 32, 32); + } else { + tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 32, 32); + tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 0, 32); + } + } else { + /* The pair must be single-copy atomic for *each* doubleword, not + the entire quadword, however it must be quadword aligned. 
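+ * (Hence the code below aligns only the first access, with MO_ALIGN_16,
+ * and issues two 8-byte loads rather than a single 16-byte load.)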
*/ + memop |= MO_64; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, + memop | MO_ALIGN_16); + + TCGv_i64 addr2 = tcg_temp_new_i64(); + tcg_gen_addi_i64(addr2, addr, 8); + tcg_gen_qemu_ld_i64(cpu_exclusive_high, addr2, idx, memop); + tcg_temp_free_i64(addr2); + + tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); + tcg_gen_mov_i64(cpu_reg(s, rt2), cpu_exclusive_high); + } + } else { + memop |= size | MO_ALIGN; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); + } + tcg_gen_mov_i64(cpu_exclusive_addr, addr); +} + +static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, + TCGv_i64 addr, int size, int is_pair) +{ + /* if (env->exclusive_addr == addr && env->exclusive_val == [addr] + * && (!is_pair || env->exclusive_high == [addr + datasize])) { + * [addr] = {Rt}; + * if (is_pair) { + * [addr + datasize] = {Rt2}; + * } + * {Rd} = 0; + * } else { + * {Rd} = 1; + * } + * env->exclusive_addr = -1; + */ + TCGLabel *fail_label = gen_new_label(); + TCGLabel *done_label = gen_new_label(); + TCGv_i64 tmp; + + tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label); + + tmp = tcg_temp_new_i64(); + if (is_pair) { + if (size == 2) { + if (s->be_data == MO_LE) { + tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2)); + } else { + tcg_gen_concat32_i64(tmp, cpu_reg(s, rt2), cpu_reg(s, rt)); + } + tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, + cpu_exclusive_val, tmp, + get_mem_index(s), + MO_64 | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); + } else { + TCGv_i128 t16 = tcg_temp_new_i128(); + TCGv_i128 c16 = tcg_temp_new_i128(); + TCGv_i64 a, b; + + if (s->be_data == MO_LE) { + tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt), cpu_reg(s, rt2)); + tcg_gen_concat_i64_i128(c16, cpu_exclusive_val, + cpu_exclusive_high); + } else { + tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt2), cpu_reg(s, rt)); + tcg_gen_concat_i64_i128(c16, cpu_exclusive_high, + cpu_exclusive_val); + } + + tcg_gen_atomic_cmpxchg_i128(t16, cpu_exclusive_addr, c16, t16, + get_mem_index(s), + MO_128 | MO_ALIGN | s->be_data); + tcg_temp_free_i128(c16); + + a = tcg_temp_new_i64(); + b = tcg_temp_new_i64(); + if (s->be_data == MO_LE) { + tcg_gen_extr_i128_i64(a, b, t16); + } else { + tcg_gen_extr_i128_i64(b, a, t16); + } + + tcg_gen_xor_i64(a, a, cpu_exclusive_val); + tcg_gen_xor_i64(b, b, cpu_exclusive_high); + tcg_gen_or_i64(tmp, a, b); + tcg_temp_free_i64(a); + tcg_temp_free_i64(b); + tcg_temp_free_i128(t16); + + tcg_gen_setcondi_i64(TCG_COND_NE, tmp, tmp, 0); + } + } else { + tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val, + cpu_reg(s, rt), get_mem_index(s), + size | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); + } + tcg_gen_mov_i64(cpu_reg(s, rd), tmp); + tcg_temp_free_i64(tmp); + tcg_gen_br(done_label); + + gen_set_label(fail_label); + tcg_gen_movi_i64(cpu_reg(s, rd), 1); + gen_set_label(done_label); + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +static void gen_compare_and_swap(DisasContext *s, int rs, int rt, + int rn, int size) +{ + TCGv_i64 tcg_rs = cpu_reg(s, rs); + TCGv_i64 tcg_rt = cpu_reg(s, rt); + int memidx = get_mem_index(s); + TCGv_i64 clean_addr; + + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size); + tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx, + size | MO_ALIGN | s->be_data); +} + +static void 
gen_compare_and_swap_pair(DisasContext *s, int rs, int rt,
+                                      int rn, int size)
+{
+    TCGv_i64 s1 = cpu_reg(s, rs);
+    TCGv_i64 s2 = cpu_reg(s, rs + 1);
+    TCGv_i64 t1 = cpu_reg(s, rt);
+    TCGv_i64 t2 = cpu_reg(s, rt + 1);
+    TCGv_i64 clean_addr;
+    int memidx = get_mem_index(s);
+
+    if (rn == 31) {
+        gen_check_sp_alignment(s);
+    }
+
+    /* This is a single atomic access, despite the "pair". */
+    clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1);
+
+    if (size == 2) {
+        TCGv_i64 cmp = tcg_temp_new_i64();
+        TCGv_i64 val = tcg_temp_new_i64();
+
+        if (s->be_data == MO_LE) {
+            tcg_gen_concat32_i64(val, t1, t2);
+            tcg_gen_concat32_i64(cmp, s1, s2);
+        } else {
+            tcg_gen_concat32_i64(val, t2, t1);
+            tcg_gen_concat32_i64(cmp, s2, s1);
+        }
+
+        tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx,
+                                   MO_64 | MO_ALIGN | s->be_data);
+        tcg_temp_free_i64(val);
+
+        if (s->be_data == MO_LE) {
+            tcg_gen_extr32_i64(s1, s2, cmp);
+        } else {
+            tcg_gen_extr32_i64(s2, s1, cmp);
+        }
+        tcg_temp_free_i64(cmp);
+    } else {
+        TCGv_i128 cmp = tcg_temp_new_i128();
+        TCGv_i128 val = tcg_temp_new_i128();
+
+        if (s->be_data == MO_LE) {
+            tcg_gen_concat_i64_i128(val, t1, t2);
+            tcg_gen_concat_i64_i128(cmp, s1, s2);
+        } else {
+            tcg_gen_concat_i64_i128(val, t2, t1);
+            tcg_gen_concat_i64_i128(cmp, s2, s1);
+        }
+
+        tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx,
+                                    MO_128 | MO_ALIGN | s->be_data);
+        tcg_temp_free_i128(val);
+
+        if (s->be_data == MO_LE) {
+            tcg_gen_extr_i128_i64(s1, s2, cmp);
+        } else {
+            tcg_gen_extr_i128_i64(s2, s1, cmp);
+        }
+        tcg_temp_free_i128(cmp);
+    }
+}
+
+/* Update the Sixty-Four bit (SF) register size. This logic is derived
+ * from the ARMv8 specs for LDR (Shared decode for all encodings).
+ */
+static bool disas_ldst_compute_iss_sf(int size, bool is_signed, int opc)
+{
+    int opc0 = extract32(opc, 0, 1);
+    int regsize;
+
+    if (is_signed) {
+        regsize = opc0 ? 32 : 64;
+    } else {
+        regsize = size == 3 ?
64 : 32; + } + return regsize == 64; +} + +/* Load/store exclusive + * + * 31 30 29 24 23 22 21 20 16 15 14 10 9 5 4 0 + * +-----+-------------+----+---+----+------+----+-------+------+------+ + * | sz | 0 0 1 0 0 0 | o2 | L | o1 | Rs | o0 | Rt2 | Rn | Rt | + * +-----+-------------+----+---+----+------+----+-------+------+------+ + * + * sz: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64 bit + * L: 0 -> store, 1 -> load + * o2: 0 -> exclusive, 1 -> not + * o1: 0 -> single register, 1 -> register pair + * o0: 1 -> load-acquire/store-release, 0 -> not + */ +static void disas_ldst_excl(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rt2 = extract32(insn, 10, 5); + int rs = extract32(insn, 16, 5); + int is_lasr = extract32(insn, 15, 1); + int o2_L_o1_o0 = extract32(insn, 21, 3) * 2 | is_lasr; + int size = extract32(insn, 30, 2); + TCGv_i64 clean_addr; + + switch (o2_L_o1_o0) { + case 0x0: /* STXR */ + case 0x1: /* STLXR */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false); + return; + + case 0x4: /* LDXR */ + case 0x5: /* LDAXR */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + s->is_ldex = true; + gen_load_exclusive(s, rt, rt2, clean_addr, size, false); + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return; + + case 0x8: /* STLLR */ + if (!dc_isar_feature(aa64_lor, s)) { + break; + } + /* StoreLORelease is the same as Store-Release for QEMU. */ + /* fall through */ + case 0x9: /* STLR */ + /* Generate ISS for non-exclusive accesses including LASR. */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + do_gpr_st(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, true, rt, + disas_ldst_compute_iss_sf(size, false, 0), is_lasr); + return; + + case 0xc: /* LDLAR */ + if (!dc_isar_feature(aa64_lor, s)) { + break; + } + /* LoadLOAcquire is the same as Load-Acquire for QEMU. */ + /* fall through */ + case 0xd: /* LDAR */ + /* Generate ISS for non-exclusive accesses including LASR. 
*/ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, false, true, + rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + return; + + case 0x2: case 0x3: /* CASP / STXP */ + if (size & 2) { /* STXP / STLXP */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true); + return; + } + if (rt2 == 31 + && ((rt | rs) & 1) == 0 + && dc_isar_feature(aa64_atomics, s)) { + /* CASP / CASPL */ + gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); + return; + } + break; + + case 0x6: case 0x7: /* CASPA / LDXP */ + if (size & 2) { /* LDXP / LDAXP */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + s->is_ldex = true; + gen_load_exclusive(s, rt, rt2, clean_addr, size, true); + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return; + } + if (rt2 == 31 + && ((rt | rs) & 1) == 0 + && dc_isar_feature(aa64_atomics, s)) { + /* CASPA / CASPAL */ + gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); + return; + } + break; + + case 0xa: /* CAS */ + case 0xb: /* CASL */ + case 0xe: /* CASA */ + case 0xf: /* CASAL */ + if (rt2 == 31 && dc_isar_feature(aa64_atomics, s)) { + gen_compare_and_swap(s, rs, rt, rn, size); + return; + } + break; + } + unallocated_encoding(s); +} + +/* + * Load register (literal) + * + * 31 30 29 27 26 25 24 23 5 4 0 + * +-----+-------+---+-----+-------------------+-------+ + * | opc | 0 1 1 | V | 0 0 | imm19 | Rt | + * +-----+-------+---+-----+-------------------+-------+ + * + * V: 1 -> vector (simd/fp) + * opc (non-vector): 00 -> 32 bit, 01 -> 64 bit, + * 10-> 32 bit signed, 11 -> prefetch + * opc (vector): 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit (11 unallocated) + */ +static void disas_ld_lit(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int64_t imm = sextract32(insn, 5, 19) << 2; + bool is_vector = extract32(insn, 26, 1); + int opc = extract32(insn, 30, 2); + bool is_signed = false; + int size = 2; + TCGv_i64 tcg_rt, clean_addr; + + if (is_vector) { + if (opc == 3) { + unallocated_encoding(s); + return; + } + size = 2 + opc; + if (!fp_access_check(s)) { + return; + } + } else { + if (opc == 3) { + /* PRFM (literal) : prefetch */ + return; + } + size = 2 + extract32(opc, 0, 1); + is_signed = extract32(opc, 1, 1); + } + + tcg_rt = cpu_reg(s, rt); + + clean_addr = new_tmp_a64(s); + gen_pc_plus_diff(s, clean_addr, imm); + if (is_vector) { + do_fp_ld(s, rt, clean_addr, size); + } else { + /* Only unsigned 32bit loads target 32bit registers. 
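+         * (Worked example of the opc encoding above, informative:
+         * opc=00 is a 32-bit LDR literal, so iss_sf is false; opc=01
+         * (64-bit LDR) and opc=10 (LDRSW, sign-extending into a
+         * 64-bit register) both yield iss_sf == true, matching
+         * "opc != 0" below.)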
*/ + bool iss_sf = opc != 0; + + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + false, true, rt, iss_sf, false); + } +} + +/* + * LDNP (Load Pair - non-temporal hint) + * LDP (Load Pair - non vector) + * LDPSW (Load Pair Signed Word - non vector) + * STNP (Store Pair - non-temporal hint) + * STP (Store Pair - non vector) + * LDNP (Load Pair of SIMD&FP - non-temporal hint) + * LDP (Load Pair of SIMD&FP) + * STNP (Store Pair of SIMD&FP - non-temporal hint) + * STP (Store Pair of SIMD&FP) + * + * 31 30 29 27 26 25 24 23 22 21 15 14 10 9 5 4 0 + * +-----+-------+---+---+-------+---+-----------------------------+ + * | opc | 1 0 1 | V | 0 | index | L | imm7 | Rt2 | Rn | Rt | + * +-----+-------+---+---+-------+---+-------+-------+------+------+ + * + * opc: LDP/STP/LDNP/STNP 00 -> 32 bit, 10 -> 64 bit + * LDPSW/STGP 01 + * LDP/STP/LDNP/STNP (SIMD) 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit + * V: 0 -> GPR, 1 -> Vector + * idx: 00 -> signed offset with non-temporal hint, 01 -> post-index, + * 10 -> signed offset, 11 -> pre-index + * L: 0 -> Store 1 -> Load + * + * Rt, Rt2 = GPR or SIMD registers to be stored + * Rn = general purpose register containing address + * imm7 = signed offset (multiple of 4 or 8 depending on size) + */ +static void disas_ldst_pair(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rt2 = extract32(insn, 10, 5); + uint64_t offset = sextract64(insn, 15, 7); + int index = extract32(insn, 23, 2); + bool is_vector = extract32(insn, 26, 1); + bool is_load = extract32(insn, 22, 1); + int opc = extract32(insn, 30, 2); + + bool is_signed = false; + bool postindex = false; + bool wback = false; + bool set_tag = false; + + TCGv_i64 clean_addr, dirty_addr; + + int size; + + if (opc == 3) { + unallocated_encoding(s); + return; + } + + if (is_vector) { + size = 2 + opc; + } else if (opc == 1 && !is_load) { + /* STGP */ + if (!dc_isar_feature(aa64_mte_insn_reg, s) || index == 0) { + unallocated_encoding(s); + return; + } + size = 3; + set_tag = true; + } else { + size = 2 + extract32(opc, 1, 1); + is_signed = extract32(opc, 0, 1); + if (!is_load && is_signed) { + unallocated_encoding(s); + return; + } + } + + switch (index) { + case 1: /* post-index */ + postindex = true; + wback = true; + break; + case 0: + /* signed offset with "non-temporal" hint. Since we don't emulate + * caches we don't care about hints to the cache system about + * data access patterns, and handle this identically to plain + * signed offset. + */ + if (is_signed) { + /* There is no non-temporal-hint version of LDPSW */ + unallocated_encoding(s); + return; + } + postindex = false; + break; + case 2: /* signed offset, rn not updated */ + postindex = false; + break; + case 3: /* pre-index */ + postindex = false; + wback = true; + break; + } + + if (is_vector && !fp_access_check(s)) { + return; + } + + offset <<= (set_tag ? LOG2_TAG_GRANULE : size); + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + if (!postindex) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + } + + if (set_tag) { + if (!s->ata) { + /* + * TODO: We could rely on the stores below, at least for + * system mode, if we arrange to add MO_ALIGN_16. 
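+             * (Informative assumption about the helper below:
+             * gen_helper_stg_stub is taken to perform only the
+             * alignment check and memory probe that the tag store
+             * would imply, without writing a tag, since ATA is off.)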
+ */ + gen_helper_stg_stub(cpu_env, dirty_addr); + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + gen_helper_stg_parallel(cpu_env, dirty_addr, dirty_addr); + } else { + gen_helper_stg(cpu_env, dirty_addr, dirty_addr); + } + } + + clean_addr = gen_mte_checkN(s, dirty_addr, !is_load, + (wback || rn != 31) && !set_tag, 2 << size); + + if (is_vector) { + if (is_load) { + do_fp_ld(s, rt, clean_addr, size); + } else { + do_fp_st(s, rt, clean_addr, size); + } + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + if (is_load) { + do_fp_ld(s, rt2, clean_addr, size); + } else { + do_fp_st(s, rt2, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + TCGv_i64 tcg_rt2 = cpu_reg(s, rt2); + + if (is_load) { + TCGv_i64 tmp = tcg_temp_new_i64(); + + /* Do not modify tcg_rt before recognizing any exception + * from the second load. + */ + do_gpr_ld(s, tmp, clean_addr, size + is_signed * MO_SIGN, + false, false, 0, false, false); + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + do_gpr_ld(s, tcg_rt2, clean_addr, size + is_signed * MO_SIGN, + false, false, 0, false, false); + + tcg_gen_mov_i64(tcg_rt, tmp); + tcg_temp_free_i64(tmp); + } else { + do_gpr_st(s, tcg_rt, clean_addr, size, + false, 0, false, false); + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + do_gpr_st(s, tcg_rt2, clean_addr, size, + false, 0, false, false); + } + } + + if (wback) { + if (postindex) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + } + tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); + } +} + +/* + * Load/store (immediate post-indexed) + * Load/store (immediate pre-indexed) + * Load/store (unscaled immediate) + * + * 31 30 29 27 26 25 24 23 22 21 20 12 11 10 9 5 4 0 + * +----+-------+---+-----+-----+---+--------+-----+------+------+ + * |size| 1 1 1 | V | 0 0 | opc | 0 | imm9 | idx | Rn | Rt | + * +----+-------+---+-----+-----+---+--------+-----+------+------+ + * + * idx = 01 -> post-indexed, 11 pre-indexed, 00 unscaled imm. 
(no writeback) + 10 -> unprivileged + * V = 0 -> non-vector + * size: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + */ +static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + int imm9 = sextract32(insn, 12, 9); + int idx = extract32(insn, 10, 2); + bool is_signed = false; + bool is_store = false; + bool is_extended = false; + bool is_unpriv = (idx == 2); + bool iss_valid; + bool post_index; + bool writeback; + int memidx; + + TCGv_i64 clean_addr, dirty_addr; + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4 || is_unpriv) { + unallocated_encoding(s); + return; + } + is_store = ((opc & 1) == 0); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + if (idx != 0) { + unallocated_encoding(s); + return; + } + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + switch (idx) { + case 0: + case 2: + post_index = false; + writeback = false; + break; + case 1: + post_index = true; + writeback = true; + break; + case 3: + post_index = false; + writeback = true; + break; + default: + g_assert_not_reached(); + } + + iss_valid = !is_vector && !writeback; + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + if (!post_index) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); + } + + memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s); + clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store, + writeback || rn != 31, + size, is_unpriv, memidx); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + + if (is_store) { + do_gpr_st_memidx(s, tcg_rt, clean_addr, size, memidx, + iss_valid, rt, iss_sf, false); + } else { + do_gpr_ld_memidx(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, memidx, + iss_valid, rt, iss_sf, false); + } + } + + if (writeback) { + TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); + if (post_index) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); + } + tcg_gen_mov_i64(tcg_rn, dirty_addr); + } +} + +/* + * Load/store (register offset) + * + * 31 30 29 27 26 25 24 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ + * |size| 1 1 1 | V | 0 0 | opc | 1 | Rm | opt | S| 1 0 | Rn | Rt | + * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ + * + * For non-vector: + * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + * For vector: + * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated + * opc<0>: 0 -> store, 1 -> load + * V: 1 -> vector/simd + * opt: extend encoding (see DecodeRegExtend) + * S: if S=1 then scale (essentially index by sizeof(size)) + * Rt: register to transfer into/out of + * Rn: address register or SP for base + * Rm: offset register or ZR for offset + */ +static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + int shift = extract32(insn, 12, 1); + int rm = extract32(insn, 
16, 5); + int opt = extract32(insn, 13, 3); + bool is_signed = false; + bool is_store = false; + bool is_extended = false; + + TCGv_i64 tcg_rm, clean_addr, dirty_addr; + + if (extract32(opt, 1, 1) == 0) { + unallocated_encoding(s); + return; + } + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4) { + unallocated_encoding(s); + return; + } + is_store = !extract32(opc, 0, 1); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + + tcg_rm = read_cpu_reg(s, rm, 1); + ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0); + + tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + if (is_store) { + do_gpr_st(s, tcg_rt, clean_addr, size, + true, rt, iss_sf, false); + } else { + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, true, rt, iss_sf, false); + } + } +} + +/* + * Load/store (unsigned immediate) + * + * 31 30 29 27 26 25 24 23 22 21 10 9 5 + * +----+-------+---+-----+-----+------------+-------+------+ + * |size| 1 1 1 | V | 0 1 | opc | imm12 | Rn | Rt | + * +----+-------+---+-----+-----+------------+-------+------+ + * + * For non-vector: + * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + * For vector: + * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated + * opc<0>: 0 -> store, 1 -> load + * Rn: base address register (inc SP) + * Rt: target register + */ +static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + unsigned int imm12 = extract32(insn, 10, 12); + unsigned int offset; + + TCGv_i64 clean_addr, dirty_addr; + + bool is_store; + bool is_signed = false; + bool is_extended = false; + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4) { + unallocated_encoding(s); + return; + } + is_store = !extract32(opc, 0, 1); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + offset = imm12 << size; + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + if (is_store) { + do_gpr_st(s, tcg_rt, clean_addr, size, + true, rt, iss_sf, false); + } else { + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, true, rt, iss_sf, false); + 
} + } +} + +/* Atomic memory operations + * + * 31 30 27 26 24 22 21 16 15 12 10 5 0 + * +------+-------+---+-----+-----+---+----+----+-----+-----+----+-----+ + * | size | 1 1 1 | V | 0 0 | A R | 1 | Rs | o3 | opc | 0 0 | Rn | Rt | + * +------+-------+---+-----+-----+--------+----+-----+-----+----+-----+ + * + * Rt: the result register + * Rn: base address or SP + * Rs: the source register for the operation + * V: vector flag (always 0 as of v8.3) + * A: acquire flag + * R: release flag + */ +static void disas_ldst_atomic(DisasContext *s, uint32_t insn, + int size, int rt, bool is_vector) +{ + int rs = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int o3_opc = extract32(insn, 12, 4); + bool r = extract32(insn, 22, 1); + bool a = extract32(insn, 23, 1); + TCGv_i64 tcg_rs, tcg_rt, clean_addr; + AtomicThreeOpFn *fn = NULL; + MemOp mop = s->be_data | size | MO_ALIGN; + + if (is_vector || !dc_isar_feature(aa64_atomics, s)) { + unallocated_encoding(s); + return; + } + switch (o3_opc) { + case 000: /* LDADD */ + fn = tcg_gen_atomic_fetch_add_i64; + break; + case 001: /* LDCLR */ + fn = tcg_gen_atomic_fetch_and_i64; + break; + case 002: /* LDEOR */ + fn = tcg_gen_atomic_fetch_xor_i64; + break; + case 003: /* LDSET */ + fn = tcg_gen_atomic_fetch_or_i64; + break; + case 004: /* LDSMAX */ + fn = tcg_gen_atomic_fetch_smax_i64; + mop |= MO_SIGN; + break; + case 005: /* LDSMIN */ + fn = tcg_gen_atomic_fetch_smin_i64; + mop |= MO_SIGN; + break; + case 006: /* LDUMAX */ + fn = tcg_gen_atomic_fetch_umax_i64; + break; + case 007: /* LDUMIN */ + fn = tcg_gen_atomic_fetch_umin_i64; + break; + case 010: /* SWP */ + fn = tcg_gen_atomic_xchg_i64; + break; + case 014: /* LDAPR, LDAPRH, LDAPRB */ + if (!dc_isar_feature(aa64_rcpc_8_3, s) || + rs != 31 || a != 1 || r != 0) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size); + + if (o3_opc == 014) { + /* + * LDAPR* are a special case because they are a simple load, not a + * fetch-and-do-something op. + * The architectural consistency requirements here are weaker than + * full load-acquire (we only need "load-acquire processor consistent"), + * but we choose to implement them as full LDAQ. + */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false, + true, rt, disas_ldst_compute_iss_sf(size, false, 0), true); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + return; + } + + tcg_rs = read_cpu_reg(s, rs, true); + tcg_rt = cpu_reg(s, rt); + + if (o3_opc == 1) { /* LDCLR */ + tcg_gen_not_i64(tcg_rs, tcg_rs); + } + + /* The tcg atomic primitives are all full barriers. Therefore we + * can ignore the Acquire and Release bits of this instruction. + */ + fn(tcg_rt, clean_addr, tcg_rs, get_mem_index(s), mop); + + if ((mop & MO_SIGN) && size != MO_64) { + tcg_gen_ext32u_i64(tcg_rt, tcg_rt); + } +} + +/* + * PAC memory operations + * + * 31 30 27 26 24 22 21 12 11 10 5 0 + * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ + * | size | 1 1 1 | V | 0 0 | M S | 1 | imm9 | W | 1 | Rn | Rt | + * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ + * + * Rt: the result register + * Rn: base address or SP + * V: vector flag (always 0 as of v8.3) + * M: clear for key DA, set for key DB + * W: pre-indexing flag + * S: sign for imm9. 
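+ * Worked example (informative): with S:imm9 = 1:000000001 the 10-bit
+ * field is 0x201, i.e. -511 signed; size is always 3 for these insns,
+ * so the field is scaled by 8 and sign-extended from 13 bits, giving
+ * a byte offset of -4088.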
+ */ +static void disas_ldst_pac(DisasContext *s, uint32_t insn, + int size, int rt, bool is_vector) +{ + int rn = extract32(insn, 5, 5); + bool is_wback = extract32(insn, 11, 1); + bool use_key_a = !extract32(insn, 23, 1); + int offset; + TCGv_i64 clean_addr, dirty_addr, tcg_rt; + + if (size != 3 || is_vector || !dc_isar_feature(aa64_pauth, s)) { + unallocated_encoding(s); + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + + if (s->pauth_active) { + if (use_key_a) { + gen_helper_autda(dirty_addr, cpu_env, dirty_addr, + new_tmp_a64_zero(s)); + } else { + gen_helper_autdb(dirty_addr, cpu_env, dirty_addr, + new_tmp_a64_zero(s)); + } + } + + /* Form the 10-bit signed, scaled offset. */ + offset = (extract32(insn, 22, 1) << 9) | extract32(insn, 12, 9); + offset = sextract32(offset << size, 0, 10 + size); + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + + /* Note that "clean" and "dirty" here refer to TBI not PAC. */ + clean_addr = gen_mte_check1(s, dirty_addr, false, + is_wback || rn != 31, size); + + tcg_rt = cpu_reg(s, rt); + do_gpr_ld(s, tcg_rt, clean_addr, size, + /* extend */ false, /* iss_valid */ !is_wback, + /* iss_srt */ rt, /* iss_sf */ true, /* iss_ar */ false); + + if (is_wback) { + tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); + } +} + +/* + * LDAPR/STLR (unscaled immediate) + * + * 31 30 24 22 21 12 10 5 0 + * +------+-------------+-----+---+--------+-----+----+-----+ + * | size | 0 1 1 0 0 1 | opc | 0 | imm9 | 0 0 | Rn | Rt | + * +------+-------------+-----+---+--------+-----+----+-----+ + * + * Rt: source or destination register + * Rn: base register + * imm9: unscaled immediate offset + * opc: 00: STLUR*, 01/10/11: various LDAPUR* + * size: size of load/store + */ +static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int offset = sextract32(insn, 12, 9); + int opc = extract32(insn, 22, 2); + int size = extract32(insn, 30, 2); + TCGv_i64 clean_addr, dirty_addr; + bool is_store = false; + bool extend = false; + bool iss_sf; + MemOp mop; + + if (!dc_isar_feature(aa64_rcpc_8_4, s)) { + unallocated_encoding(s); + return; + } + + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + mop = size | MO_ALIGN; + + switch (opc) { + case 0: /* STLURB */ + is_store = true; + break; + case 1: /* LDAPUR* */ + break; + case 2: /* LDAPURS* 64-bit variant */ + if (size == 3) { + unallocated_encoding(s); + return; + } + mop |= MO_SIGN; + break; + case 3: /* LDAPURS* 32-bit variant */ + if (size > 1) { + unallocated_encoding(s); + return; + } + mop |= MO_SIGN; + extend = true; /* zero-extend 32->64 after signed load */ + break; + default: + g_assert_not_reached(); + } + + iss_sf = disas_ldst_compute_iss_sf(size, (mop & MO_SIGN) != 0, opc); + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + clean_addr = clean_data_tbi(s, dirty_addr); + + if (is_store) { + /* Store-Release semantics */ + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + do_gpr_st(s, cpu_reg(s, rt), clean_addr, mop, true, rt, iss_sf, true); + } else { + /* + * Load-AcquirePC semantics; we implement as the slightly more + * restrictive Load-Acquire. 
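+         * (Informal justification: full Load-Acquire orders strictly
+         * more than the RCpc Load-AcquirePC the architecture asks
+         * for, so this is architecturally safe, merely conservative.)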
+ */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, mop, + extend, true, rt, iss_sf, true); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } +} + +/* Load/store register (all forms) */ +static void disas_ldst_reg(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int opc = extract32(insn, 22, 2); + bool is_vector = extract32(insn, 26, 1); + int size = extract32(insn, 30, 2); + + switch (extract32(insn, 24, 2)) { + case 0: + if (extract32(insn, 21, 1) == 0) { + /* Load/store register (unscaled immediate) + * Load/store immediate pre/post-indexed + * Load/store register unprivileged + */ + disas_ldst_reg_imm9(s, insn, opc, size, rt, is_vector); + return; + } + switch (extract32(insn, 10, 2)) { + case 0: + disas_ldst_atomic(s, insn, size, rt, is_vector); + return; + case 2: + disas_ldst_reg_roffset(s, insn, opc, size, rt, is_vector); + return; + default: + disas_ldst_pac(s, insn, size, rt, is_vector); + return; + } + break; + case 1: + disas_ldst_reg_unsigned_imm(s, insn, opc, size, rt, is_vector); + return; + } + unallocated_encoding(s); +} + +/* AdvSIMD load/store multiple structures + * + * 31 30 29 23 22 21 16 15 12 11 10 9 5 4 0 + * +---+---+---------------+---+-------------+--------+------+------+------+ + * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size | Rn | Rt | + * +---+---+---------------+---+-------------+--------+------+------+------+ + * + * AdvSIMD load/store multiple structures (post-indexed) + * + * 31 30 29 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---------------+---+---+---------+--------+------+------+------+ + * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 | Rm | opcode | size | Rn | Rt | + * +---+---+---------------+---+---+---------+--------+------+------+------+ + * + * Rt: first (or only) SIMD&FP register to be transferred + * Rn: base address or SP + * Rm (post-index only): post-index register (when !31) or size dependent #imm + */ +static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 10, 2); + int opcode = extract32(insn, 12, 4); + bool is_store = !extract32(insn, 22, 1); + bool is_postidx = extract32(insn, 23, 1); + bool is_q = extract32(insn, 30, 1); + TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; + MemOp endian, align, mop; + + int total; /* total bytes */ + int elements; /* elements per vector */ + int rpt; /* num iterations */ + int selem; /* structure elements */ + int r; + + if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) { + unallocated_encoding(s); + return; + } + + if (!is_postidx && rm != 0) { + unallocated_encoding(s); + return; + } + + /* From the shared decode logic */ + switch (opcode) { + case 0x0: + rpt = 1; + selem = 4; + break; + case 0x2: + rpt = 4; + selem = 1; + break; + case 0x4: + rpt = 1; + selem = 3; + break; + case 0x6: + rpt = 3; + selem = 1; + break; + case 0x7: + rpt = 1; + selem = 1; + break; + case 0x8: + rpt = 1; + selem = 2; + break; + case 0xa: + rpt = 2; + selem = 1; + break; + default: + unallocated_encoding(s); + return; + } + + if (size == 3 && !is_q && selem != 1) { + /* reserved */ + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + /* For our purposes, bytes are always little-endian. */ + endian = s->be_data; + if (size == 0) { + endian = MO_LE; + } + + total = rpt * selem * (is_q ? 
16 : 8); + tcg_rn = cpu_reg_sp(s, rn); + + /* + * Issue the MTE check vs the logical repeat count, before we + * promote consecutive little-endian elements below. + */ + clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31, + total); + + /* + * Consecutive little-endian elements from a single register + * can be promoted to a larger little-endian operation. + */ + align = MO_ALIGN; + if (selem == 1 && endian == MO_LE) { + align = pow2_align(size); + size = 3; + } + if (!s->align_mem) { + align = 0; + } + mop = endian | size | align; + + elements = (is_q ? 16 : 8) >> size; + tcg_ebytes = tcg_constant_i64(1 << size); + for (r = 0; r < rpt; r++) { + int e; + for (e = 0; e < elements; e++) { + int xs; + for (xs = 0; xs < selem; xs++) { + int tt = (rt + r + xs) % 32; + if (is_store) { + do_vec_st(s, tt, e, clean_addr, mop); + } else { + do_vec_ld(s, tt, e, clean_addr, mop); + } + tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); + } + } + } + + if (!is_store) { + /* For non-quad operations, setting a slice of the low + * 64 bits of the register clears the high 64 bits (in + * the ARM ARM pseudocode this is implicit in the fact + * that 'rval' is a 64 bit wide variable). + * For quad operations, we might still need to zero the + * high bits of SVE. + */ + for (r = 0; r < rpt * selem; r++) { + int tt = (rt + r) % 32; + clear_vec_high(s, is_q, tt); + } + } + + if (is_postidx) { + if (rm == 31) { + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); + } else { + tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); + } + } +} + +/* AdvSIMD load/store single structure + * + * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * | 0 | Q | 0 0 1 1 0 1 0 | L R | 0 0 0 0 0 | opc | S | size | Rn | Rt | + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * + * AdvSIMD load/store single structure (post-indexed) + * + * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * | 0 | Q | 0 0 1 1 0 1 1 | L R | Rm | opc | S | size | Rn | Rt | + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * + * Rt: first (or only) SIMD&FP register to be transferred + * Rn: base address or SP + * Rm (post-index only): post-index register (when !31) or size dependent #imm + * index = encoded in Q:S:size dependent on size + * + * lane_size = encoded in R, opc + * transfer width = encoded in opc, S, size + */ +static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 10, 2); + int S = extract32(insn, 12, 1); + int opc = extract32(insn, 13, 3); + int R = extract32(insn, 21, 1); + int is_load = extract32(insn, 22, 1); + int is_postidx = extract32(insn, 23, 1); + int is_q = extract32(insn, 30, 1); + + int scale = extract32(opc, 1, 2); + int selem = (extract32(opc, 0, 1) << 1 | R) + 1; + bool replicate = false; + int index = is_q << 3 | S << 2 | size; + int xs, total; + TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; + MemOp mop; + + if (extract32(insn, 31, 1)) { + unallocated_encoding(s); + return; + } + if (!is_postidx && rm != 0) { + unallocated_encoding(s); + return; + } + + switch (scale) { + case 3: + if (!is_load || S) { + unallocated_encoding(s); + return; + } + scale = size; + replicate = true; + break; + case 0: + break; + case 1: + if (extract32(size, 0, 1)) { + 
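+            /* (Informative) scale 1 selects a 16-bit element, whose
+             * lane index is Q:S:size<1>; size<0> is not part of the
+             * index, so an odd size here is a reserved encoding. */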
unallocated_encoding(s); + return; + } + index >>= 1; + break; + case 2: + if (extract32(size, 1, 1)) { + unallocated_encoding(s); + return; + } + if (!extract32(size, 0, 1)) { + index >>= 2; + } else { + if (S) { + unallocated_encoding(s); + return; + } + index >>= 3; + scale = 3; + } + break; + default: + g_assert_not_reached(); + } + + if (!fp_access_check(s)) { + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + total = selem << scale; + tcg_rn = cpu_reg_sp(s, rn); + + clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, + total); + mop = finalize_memop(s, scale); + + tcg_ebytes = tcg_constant_i64(1 << scale); + for (xs = 0; xs < selem; xs++) { + if (replicate) { + /* Load and replicate to all elements */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(tcg_tmp, clean_addr, get_mem_index(s), mop); + tcg_gen_gvec_dup_i64(scale, vec_full_reg_offset(s, rt), + (is_q + 1) * 8, vec_full_reg_size(s), + tcg_tmp); + tcg_temp_free_i64(tcg_tmp); + } else { + /* Load/store one element per register */ + if (is_load) { + do_vec_ld(s, rt, index, clean_addr, mop); + } else { + do_vec_st(s, rt, index, clean_addr, mop); + } + } + tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); + rt = (rt + 1) % 32; + } + + if (is_postidx) { + if (rm == 31) { + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); + } else { + tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); + } + } +} + +/* + * Load/Store memory tags + * + * 31 30 29 24 22 21 12 10 5 0 + * +-----+-------------+-----+---+------+-----+------+------+ + * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 | Rn | Rt | + * +-----+-------------+-----+---+------+-----+------+------+ + */ +static void disas_ldst_tag(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE; + int op2 = extract32(insn, 10, 2); + int op1 = extract32(insn, 22, 2); + bool is_load = false, is_pair = false, is_zero = false, is_mult = false; + int index = 0; + TCGv_i64 addr, clean_addr, tcg_rt; + + /* We checked insn bits [29:24,21] in the caller. */ + if (extract32(insn, 30, 2) != 3) { + goto do_unallocated; + } + + /* + * @index is a tri-state variable which has 3 states: + * < 0 : post-index, writeback + * = 0 : signed offset + * > 0 : pre-index, writeback + */ + switch (op1) { + case 0: + if (op2 != 0) { + /* STG */ + index = op2 - 2; + } else { + /* STZGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_zero = true; + } + break; + case 1: + if (op2 != 0) { + /* STZG */ + is_zero = true; + index = op2 - 2; + } else { + /* LDG */ + is_load = true; + } + break; + case 2: + if (op2 != 0) { + /* ST2G */ + is_pair = true; + index = op2 - 2; + } else { + /* STGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = true; + } + break; + case 3: + if (op2 != 0) { + /* STZ2G */ + is_pair = is_zero = true; + index = op2 - 2; + } else { + /* LDGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_load = true; + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + + if (is_mult + ? 
!dc_isar_feature(aa64_mte, s) + : !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + addr = read_cpu_reg_sp(s, rn, true); + if (index >= 0) { + /* pre-index or signed offset */ + tcg_gen_addi_i64(addr, addr, offset); + } + + if (is_mult) { + tcg_rt = cpu_reg(s, rt); + + if (is_zero) { + int size = 4 << s->dcz_blocksize; + + if (s->ata) { + gen_helper_stzgm_tags(cpu_env, addr, tcg_rt); + } + /* + * The non-tags portion of STZGM is mostly like DC_ZVA, + * except the alignment happens before the access. + */ + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_helper_dc_zva(cpu_env, clean_addr); + } else if (s->ata) { + if (is_load) { + gen_helper_ldgm(tcg_rt, cpu_env, addr); + } else { + gen_helper_stgm(cpu_env, addr, tcg_rt); + } + } else { + MMUAccessType acc = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE; + int size = 4 << GMID_EL1_BS; + + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_probe_access(s, clean_addr, acc, size); + + if (is_load) { + /* The result tags are zeros. */ + tcg_gen_movi_i64(tcg_rt, 0); + } + } + return; + } + + if (is_load) { + tcg_gen_andi_i64(addr, addr, -TAG_GRANULE); + tcg_rt = cpu_reg(s, rt); + if (s->ata) { + gen_helper_ldg(tcg_rt, cpu_env, addr, tcg_rt); + } else { + clean_addr = clean_data_tbi(s, addr); + gen_probe_access(s, clean_addr, MMU_DATA_LOAD, MO_8); + gen_address_with_allocation_tag0(tcg_rt, addr); + } + } else { + tcg_rt = cpu_reg_sp(s, rt); + if (!s->ata) { + /* + * For STG and ST2G, we need to check alignment and probe memory. + * TODO: For STZG and STZ2G, we could rely on the stores below, + * at least for system mode; user-only won't enforce alignment. 
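+         * (Informative, mirroring the STGP case above: the *_stub
+         * helpers are assumed to perform only the alignment check and
+         * memory probe that a tag store implies, with no tag written
+         * because ATA is off.)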
+ */ + if (is_pair) { + gen_helper_st2g_stub(cpu_env, addr); + } else { + gen_helper_stg_stub(cpu_env, addr); + } + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + if (is_pair) { + gen_helper_st2g_parallel(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg_parallel(cpu_env, addr, tcg_rt); + } + } else { + if (is_pair) { + gen_helper_st2g(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg(cpu_env, addr, tcg_rt); + } + } + } + + if (is_zero) { + TCGv_i64 clean_addr = clean_data_tbi(s, addr); + TCGv_i64 tcg_zero = tcg_constant_i64(0); + int mem_index = get_mem_index(s); + int i, n = (1 + is_pair) << LOG2_TAG_GRANULE; + + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, + MO_UQ | MO_ALIGN_16); + for (i = 8; i < n; i += 8) { + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_UQ); + } + } + + if (index != 0) { + /* pre-index or post-index */ + if (index < 0) { + /* post-index */ + tcg_gen_addi_i64(addr, addr, offset); + } + tcg_gen_mov_i64(cpu_reg_sp(s, rn), addr); + } +} + +/* Loads and stores */ +static void disas_ldst(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 24, 6)) { + case 0x08: /* Load/store exclusive */ + disas_ldst_excl(s, insn); + break; + case 0x18: case 0x1c: /* Load register (literal) */ + disas_ld_lit(s, insn); + break; + case 0x28: case 0x29: + case 0x2c: case 0x2d: /* Load/store pair (all forms) */ + disas_ldst_pair(s, insn); + break; + case 0x38: case 0x39: + case 0x3c: case 0x3d: /* Load/store register (all forms) */ + disas_ldst_reg(s, insn); + break; + case 0x0c: /* AdvSIMD load/store multiple structures */ + disas_ldst_multiple_struct(s, insn); + break; + case 0x0d: /* AdvSIMD load/store single structure */ + disas_ldst_single_struct(s, insn); + break; + case 0x19: + if (extract32(insn, 21, 1) != 0) { + disas_ldst_tag(s, insn); + } else if (extract32(insn, 10, 2) == 0) { + disas_ldst_ldapr_stlr(s, insn); + } else { + unallocated_encoding(s); + } + break; + default: + unallocated_encoding(s); + break; + } +} + +/* PC-rel. addressing + * 31 30 29 28 24 23 5 4 0 + * +----+-------+-----------+-------------------+------+ + * | op | immlo | 1 0 0 0 0 | immhi | Rd | + * +----+-------+-----------+-------------------+------+ + */ +static void disas_pc_rel_adr(DisasContext *s, uint32_t insn) +{ + unsigned int page, rd; + int64_t offset; + + page = extract32(insn, 31, 1); + /* SignExtend(immhi:immlo) -> offset */ + offset = sextract64(insn, 5, 19); + offset = offset << 2 | extract32(insn, 29, 2); + rd = extract32(insn, 0, 5); + + if (page) { + /* ADRP (page based) */ + offset <<= 12; + /* The page offset is ok for TARGET_TB_PCREL. */ + offset -= s->pc_curr & 0xfff; + } + + gen_pc_plus_diff(s, cpu_reg(s, rd), offset); +} + +/* + * Add/subtract (immediate) + * + * 31 30 29 28 23 22 21 10 9 5 4 0 + * +--+--+--+-------------+--+-------------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 0 |sh| imm12 | Rn | Rd | + * +--+--+--+-------------+--+-------------+-----+-----+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * sh: 1 -> LSL imm by 12 + */ +static void disas_add_sub_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + uint64_t imm = extract32(insn, 10, 12); + bool shift = extract32(insn, 22, 1); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool is_64bit = extract32(insn, 31, 1); + + TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); + TCGv_i64 tcg_rd = setflags ? 
cpu_reg(s, rd) : cpu_reg_sp(s, rd); + TCGv_i64 tcg_result; + + if (shift) { + imm <<= 12; + } + + tcg_result = tcg_temp_new_i64(); + if (!setflags) { + if (sub_op) { + tcg_gen_subi_i64(tcg_result, tcg_rn, imm); + } else { + tcg_gen_addi_i64(tcg_result, tcg_rn, imm); + } + } else { + TCGv_i64 tcg_imm = tcg_constant_i64(imm); + if (sub_op) { + gen_sub_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); + } else { + gen_add_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); + } + } + + if (is_64bit) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* + * Add/subtract (immediate, with tags) + * + * 31 30 29 28 23 22 21 16 14 10 9 5 4 0 + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 1 |o2| uimm6 |o3| uimm4 | Rn | Rd | + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * + * op: 0 -> add, 1 -> sub + */ +static void disas_add_sub_imm_with_tags(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int uimm4 = extract32(insn, 10, 4); + int uimm6 = extract32(insn, 16, 6); + bool sub_op = extract32(insn, 30, 1); + TCGv_i64 tcg_rn, tcg_rd; + int imm; + + /* Test all of sf=1, S=0, o2=0, o3=0. */ + if ((insn & 0xa040c000u) != 0x80000000u || + !dc_isar_feature(aa64_mte_insn_reg, s)) { + unallocated_encoding(s); + return; + } + + imm = uimm6 << LOG2_TAG_GRANULE; + if (sub_op) { + imm = -imm; + } + + tcg_rn = cpu_reg_sp(s, rn); + tcg_rd = cpu_reg_sp(s, rd); + + if (s->ata) { + gen_helper_addsubg(tcg_rd, cpu_env, tcg_rn, + tcg_constant_i32(imm), + tcg_constant_i32(uimm4)); + } else { + tcg_gen_addi_i64(tcg_rd, tcg_rn, imm); + gen_address_with_allocation_tag0(tcg_rd, tcg_rd); + } +} + +/* The input should be a value in the bottom e bits (with higher + * bits zero); returns that value replicated into every element + * of size e in a 64 bit integer. + */ +static uint64_t bitfield_replicate(uint64_t mask, unsigned int e) +{ + assert(e != 0); + while (e < 64) { + mask |= mask << e; + e *= 2; + } + return mask; +} + +/* Return a value with the bottom len bits set (where 0 < len <= 64) */ +static inline uint64_t bitmask64(unsigned int length) +{ + assert(length > 0 && length <= 64); + return ~0ULL >> (64 - length); +} + +/* Simplified variant of pseudocode DecodeBitMasks() for the case where we + * only require the wmask. Returns false if the imms/immr/immn are a reserved + * value (ie should cause a guest UNDEF exception), and true if they are + * valid, in which case the decoded bit pattern is written to result. + */ +bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, + unsigned int imms, unsigned int immr) +{ + uint64_t mask; + unsigned e, levels, s, r; + int len; + + assert(immn < 2 && imms < 64 && immr < 64); + + /* The bit patterns we create here are 64 bit patterns which + * are vectors of identical elements of size e = 2, 4, 8, 16, 32 or + * 64 bits each. Each element contains the same value: a run + * of between 1 and e-1 non-zero bits, rotated within the + * element by between 0 and e-1 bits. 
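+ * (Worked example, informative: immn=0, imms=0b111100, immr=0 decodes
+ * as e=2 with a single set bit per element, giving the 64-bit pattern
+ * 0x5555555555555555.)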
+ * + * The element size and run length are encoded into immn (1 bit) + * and imms (6 bits) as follows: + * 64 bit elements: immn = 1, imms = <length of run - 1> + * 32 bit elements: immn = 0, imms = 0 : <length of run - 1> + * 16 bit elements: immn = 0, imms = 10 : <length of run - 1> + * 8 bit elements: immn = 0, imms = 110 : <length of run - 1> + * 4 bit elements: immn = 0, imms = 1110 : <length of run - 1> + * 2 bit elements: immn = 0, imms = 11110 : <length of run - 1> + * Notice that immn = 0, imms = 11111x is the only combination + * not covered by one of the above options; this is reserved. + * Further, <length of run - 1> all-ones is a reserved pattern. + * + * In all cases the rotation is by immr % e (and immr is 6 bits). + */ + + /* First determine the element size */ + len = 31 - clz32((immn << 6) | (~imms & 0x3f)); + if (len < 1) { + /* This is the immn == 0, imms == 0x11111x case */ + return false; + } + e = 1 << len; + + levels = e - 1; + s = imms & levels; + r = immr & levels; + + if (s == levels) { + /* <length of run - 1> mustn't be all-ones. */ + return false; + } + + /* Create the value of one element: s+1 set bits rotated + * by r within the element (which is e bits wide)... + */ + mask = bitmask64(s + 1); + if (r) { + mask = (mask >> r) | (mask << (e - r)); + mask &= bitmask64(e); + } + /* ...then replicate the element over the whole 64 bit value */ + mask = bitfield_replicate(mask, e); + *result = mask; + return true; +} + +/* Logical (immediate) + * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 + * +----+-----+-------------+---+------+------+------+------+ + * | sf | opc | 1 0 0 1 0 0 | N | immr | imms | Rn | Rd | + * +----+-----+-------------+---+------+------+------+------+ + */ +static void disas_logic_imm(DisasContext *s, uint32_t insn) +{ + unsigned int sf, opc, is_n, immr, imms, rn, rd; + TCGv_i64 tcg_rd, tcg_rn; + uint64_t wmask; + bool is_and = false; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + is_n = extract32(insn, 22, 1); + immr = extract32(insn, 16, 6); + imms = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (!sf && is_n) { + unallocated_encoding(s); + return; + } + + if (opc == 0x3) { /* ANDS */ + tcg_rd = cpu_reg(s, rd); + } else { + tcg_rd = cpu_reg_sp(s, rd); + } + tcg_rn = cpu_reg(s, rn); + + if (!logic_imm_decode_wmask(&wmask, is_n, imms, immr)) { + /* some immediate field values are reserved */ + unallocated_encoding(s); + return; + } + + if (!sf) { + wmask &= 0xffffffff; + } + + switch (opc) { + case 0x3: /* ANDS */ + case 0x0: /* AND */ + tcg_gen_andi_i64(tcg_rd, tcg_rn, wmask); + is_and = true; + break; + case 0x1: /* ORR */ + tcg_gen_ori_i64(tcg_rd, tcg_rn, wmask); + break; + case 0x2: /* EOR */ + tcg_gen_xori_i64(tcg_rd, tcg_rn, wmask); + break; + default: + assert(FALSE); /* must handle all above */ + break; + } + + if (!sf && !is_and) { + /* zero extend final result; we know we can skip this for AND + * since the immediate had the high 32 bits clear. 
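+         * (Spelled out: for !sf the wmask was masked to 32 bits above,
+         * so AND/ANDS already leave bits [63:32] clear, while ORR/EOR
+         * propagate the high half of Xn and still need the explicit
+         * zero-extension below.)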
+ */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + + if (opc == 3) { /* ANDS */ + gen_logic_CC(sf, tcg_rd); + } +} + +/* + * Move wide (immediate) + * + * 31 30 29 28 23 22 21 20 5 4 0 + * +--+-----+-------------+-----+----------------+------+ + * |sf| opc | 1 0 0 1 0 1 | hw | imm16 | Rd | + * +--+-----+-------------+-----+----------------+------+ + * + * sf: 0 -> 32 bit, 1 -> 64 bit + * opc: 00 -> N, 10 -> Z, 11 -> K + * hw: shift/16 (0,16, and sf only 32, 48) + */ +static void disas_movw_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + uint64_t imm = extract32(insn, 5, 16); + int sf = extract32(insn, 31, 1); + int opc = extract32(insn, 29, 2); + int pos = extract32(insn, 21, 2) << 4; + TCGv_i64 tcg_rd = cpu_reg(s, rd); + + if (!sf && (pos >= 32)) { + unallocated_encoding(s); + return; + } + + switch (opc) { + case 0: /* MOVN */ + case 2: /* MOVZ */ + imm <<= pos; + if (opc == 0) { + imm = ~imm; + } + if (!sf) { + imm &= 0xffffffffu; + } + tcg_gen_movi_i64(tcg_rd, imm); + break; + case 3: /* MOVK */ + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_constant_i64(imm), pos, 16); + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + break; + default: + unallocated_encoding(s); + break; + } +} + +/* Bitfield + * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 + * +----+-----+-------------+---+------+------+------+------+ + * | sf | opc | 1 0 0 1 1 0 | N | immr | imms | Rn | Rd | + * +----+-----+-------------+---+------+------+------+------+ + */ +static void disas_bitfield(DisasContext *s, uint32_t insn) +{ + unsigned int sf, n, opc, ri, si, rn, rd, bitsize, pos, len; + TCGv_i64 tcg_rd, tcg_tmp; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + n = extract32(insn, 22, 1); + ri = extract32(insn, 16, 6); + si = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + bitsize = sf ? 64 : 32; + + if (sf != n || ri >= bitsize || si >= bitsize || opc > 2) { + unallocated_encoding(s); + return; + } + + tcg_rd = cpu_reg(s, rd); + + /* Suppress the zero-extend for !sf. Since RI and SI are constrained + to be smaller than bitsize, we'll never reference data outside the + low 32-bits anyway. */ + tcg_tmp = read_cpu_reg(s, rn, 1); + + /* Recognize simple(r) extractions. */ + if (si >= ri) { + /* Wd<s-r:0> = Wn<s:r> */ + len = (si - ri) + 1; + if (opc == 0) { /* SBFM: ASR, SBFX, SXTB, SXTH, SXTW */ + tcg_gen_sextract_i64(tcg_rd, tcg_tmp, ri, len); + goto done; + } else if (opc == 2) { /* UBFM: UBFX, LSR, UXTB, UXTH */ + tcg_gen_extract_i64(tcg_rd, tcg_tmp, ri, len); + return; + } + /* opc == 1, BFXIL fall through to deposit */ + tcg_gen_shri_i64(tcg_tmp, tcg_tmp, ri); + pos = 0; + } else { + /* Handle the ri > si case with a deposit + * Wd<32+s-r,32-r> = Wn<s:0> + */ + len = si + 1; + pos = (bitsize - ri) & (bitsize - 1); + } + + if (opc == 0 && len < ri) { + /* SBFM: sign extend the destination field from len to fill + the balance of the word. Let the deposit below insert all + of those sign bits. */ + tcg_gen_sextract_i64(tcg_tmp, tcg_tmp, 0, len); + len = ri; + } + + if (opc == 1) { /* BFM, BFXIL */ + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, pos, len); + } else { + /* SBFM or UBFM: We start with zero, and we haven't modified + any bits outside bitsize, therefore the zero-extension + below is unneeded. 
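+         * (That is: deposit_z writes the field into an all-zero
+         * background and pos + len never exceeds bitsize on this path,
+         * so for !sf bits [63:32] are already clear and the "done"
+         * epilogue is skipped by the early return.)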
*/ + tcg_gen_deposit_z_i64(tcg_rd, tcg_tmp, pos, len); + return; + } + + done: + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* Extract + * 31 30 29 28 23 22 21 20 16 15 10 9 5 4 0 + * +----+------+-------------+---+----+------+--------+------+------+ + * | sf | op21 | 1 0 0 1 1 1 | N | o0 | Rm | imms | Rn | Rd | + * +----+------+-------------+---+----+------+--------+------+------+ + */ +static void disas_extract(DisasContext *s, uint32_t insn) +{ + unsigned int sf, n, rm, imm, rn, rd, bitsize, op21, op0; + + sf = extract32(insn, 31, 1); + n = extract32(insn, 22, 1); + rm = extract32(insn, 16, 5); + imm = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + op21 = extract32(insn, 29, 2); + op0 = extract32(insn, 21, 1); + bitsize = sf ? 64 : 32; + + if (sf != n || op21 || op0 || imm >= bitsize) { + unallocated_encoding(s); + } else { + TCGv_i64 tcg_rd, tcg_rm, tcg_rn; + + tcg_rd = cpu_reg(s, rd); + + if (unlikely(imm == 0)) { + /* tcg shl_i32/shl_i64 is undefined for 32/64 bit shifts, + * so an extract from bit 0 is a special case. + */ + if (sf) { + tcg_gen_mov_i64(tcg_rd, cpu_reg(s, rm)); + } else { + tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rm)); + } + } else { + tcg_rm = cpu_reg(s, rm); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + /* Specialization to ROR happens in EXTRACT2. */ + tcg_gen_extract2_i64(tcg_rd, tcg_rm, tcg_rn, imm); + } else { + TCGv_i32 t0 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(t0, tcg_rm); + if (rm == rn) { + tcg_gen_rotri_i32(t0, t0, imm); + } else { + TCGv_i32 t1 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t1, tcg_rn); + tcg_gen_extract2_i32(t0, t0, t1, imm); + tcg_temp_free_i32(t1); + } + tcg_gen_extu_i32_i64(tcg_rd, t0); + tcg_temp_free_i32(t0); + } + } + } +} + +/* Data processing - immediate */ +static void disas_data_proc_imm(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 23, 6)) { + case 0x20: case 0x21: /* PC-rel. addressing */ + disas_pc_rel_adr(s, insn); + break; + case 0x22: /* Add/subtract (immediate) */ + disas_add_sub_imm(s, insn); + break; + case 0x23: /* Add/subtract (immediate, with tags) */ + disas_add_sub_imm_with_tags(s, insn); + break; + case 0x24: /* Logical (immediate) */ + disas_logic_imm(s, insn); + break; + case 0x25: /* Move wide (immediate) */ + disas_movw_imm(s, insn); + break; + case 0x26: /* Bitfield */ + disas_bitfield(s, insn); + break; + case 0x27: /* Extract */ + disas_extract(s, insn); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* Shift a TCGv src by TCGv shift_amount, put result in dst. + * Note that it is the caller's responsibility to ensure that the + * shift amount is in range (ie 0..31 or 0..63) and provide the ARM + * mandated semantics for out of range shifts. + */ +static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf, + enum a64_shift_type shift_type, TCGv_i64 shift_amount) +{ + switch (shift_type) { + case A64_SHIFT_TYPE_LSL: + tcg_gen_shl_i64(dst, src, shift_amount); + break; + case A64_SHIFT_TYPE_LSR: + tcg_gen_shr_i64(dst, src, shift_amount); + break; + case A64_SHIFT_TYPE_ASR: + if (!sf) { + tcg_gen_ext32s_i64(dst, src); + } + tcg_gen_sar_i64(dst, sf ? 
src : dst, shift_amount); + break; + case A64_SHIFT_TYPE_ROR: + if (sf) { + tcg_gen_rotr_i64(dst, src, shift_amount); + } else { + TCGv_i32 t0, t1; + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t0, src); + tcg_gen_extrl_i64_i32(t1, shift_amount); + tcg_gen_rotr_i32(t0, t0, t1); + tcg_gen_extu_i32_i64(dst, t0); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + } + break; + default: + assert(FALSE); /* all shift types should be handled */ + break; + } + + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(dst, dst); + } +} + +/* Shift a TCGv src by immediate, put result in dst. + * The shift amount must be in range (this should always be true as the + * relevant instructions will UNDEF on bad shift immediates). + */ +static void shift_reg_imm(TCGv_i64 dst, TCGv_i64 src, int sf, + enum a64_shift_type shift_type, unsigned int shift_i) +{ + assert(shift_i < (sf ? 64 : 32)); + + if (shift_i == 0) { + tcg_gen_mov_i64(dst, src); + } else { + shift_reg(dst, src, sf, shift_type, tcg_constant_i64(shift_i)); + } +} + +/* Logical (shifted register) + * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 + * +----+-----+-----------+-------+---+------+--------+------+------+ + * | sf | opc | 0 1 0 1 0 | shift | N | Rm | imm6 | Rn | Rd | + * +----+-----+-----------+-------+---+------+--------+------+------+ + */ +static void disas_logic_reg(DisasContext *s, uint32_t insn) +{ + TCGv_i64 tcg_rd, tcg_rn, tcg_rm; + unsigned int sf, opc, shift_type, invert, rm, shift_amount, rn, rd; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + shift_type = extract32(insn, 22, 2); + invert = extract32(insn, 21, 1); + rm = extract32(insn, 16, 5); + shift_amount = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (!sf && (shift_amount & (1 << 5))) { + unallocated_encoding(s); + return; + } + + tcg_rd = cpu_reg(s, rd); + + if (opc == 1 && shift_amount == 0 && shift_type == 0 && rn == 31) { + /* Unshifted ORR and ORN with WZR/XZR is the standard encoding for + * register-register MOV and MVN, so it is worth special casing. 
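+     * (Example, informative: "MOV X0, X1" is ORR X0, XZR, X1 with
+     * opc=01, shift type LSL, imm6=0 and rn=31, and "MVN X0, X1" is
+     * the same shape with N=1, so both take this fast path instead of
+     * going through shift_reg.)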
+ */ + tcg_rm = cpu_reg(s, rm); + if (invert) { + tcg_gen_not_i64(tcg_rd, tcg_rm); + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + } else { + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_rm); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_rm); + } + } + return; + } + + tcg_rm = read_cpu_reg(s, rm, sf); + + if (shift_amount) { + shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, shift_amount); + } + + tcg_rn = cpu_reg(s, rn); + + switch (opc | (invert << 2)) { + case 0: /* AND */ + case 3: /* ANDS */ + tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 1: /* ORR */ + tcg_gen_or_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 2: /* EOR */ + tcg_gen_xor_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 4: /* BIC */ + case 7: /* BICS */ + tcg_gen_andc_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 5: /* ORN */ + tcg_gen_orc_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 6: /* EON */ + tcg_gen_eqv_i64(tcg_rd, tcg_rn, tcg_rm); + break; + default: + assert(FALSE); + break; + } + + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + + if (opc == 3) { + gen_logic_CC(sf, tcg_rd); + } +} + +/* + * Add/subtract (extended register) + * + * 31|30|29|28 24|23 22|21|20 16|15 13|12 10|9 5|4 0| + * +--+--+--+-----------+-----+--+-------+------+------+----+----+ + * |sf|op| S| 0 1 0 1 1 | opt | 1| Rm |option| imm3 | Rn | Rd | + * +--+--+--+-----------+-----+--+-------+------+------+----+----+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * opt: 00 + * option: extension type (see DecodeRegExtend) + * imm3: optional shift to Rm + * + * Rd = Rn + LSL(extend(Rm), amount) + */ +static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm3 = extract32(insn, 10, 3); + int option = extract32(insn, 13, 3); + int rm = extract32(insn, 16, 5); + int opt = extract32(insn, 22, 2); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool sf = extract32(insn, 31, 1); + + TCGv_i64 tcg_rm, tcg_rn; /* temps */ + TCGv_i64 tcg_rd; + TCGv_i64 tcg_result; + + if (imm3 > 4 || opt != 0) { + unallocated_encoding(s); + return; + } + + /* non-flag setting ops may use SP */ + if (!setflags) { + tcg_rd = cpu_reg_sp(s, rd); + } else { + tcg_rd = cpu_reg(s, rd); + } + tcg_rn = read_cpu_reg_sp(s, rn, sf); + + tcg_rm = read_cpu_reg(s, rm, sf); + ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3); + + tcg_result = tcg_temp_new_i64(); + + if (!setflags) { + if (sub_op) { + tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); + } + } else { + if (sub_op) { + gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); + } else { + gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); + } + } + + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* + * Add/subtract (shifted register) + * + * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 + * +--+--+--+-----------+-----+--+-------+---------+------+------+ + * |sf|op| S| 0 1 0 1 1 |shift| 0| Rm | imm6 | Rn | Rd | + * +--+--+--+-----------+-----+--+-------+---------+------+------+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * shift: 00 -> LSL, 01 -> LSR, 10 -> ASR, 11 -> RESERVED + * imm6: Shift amount to apply to Rm before the add/sub + */ +static void disas_add_sub_reg(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm6 = 
extract32(insn, 10, 6); + int rm = extract32(insn, 16, 5); + int shift_type = extract32(insn, 22, 2); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool sf = extract32(insn, 31, 1); + + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn, tcg_rm; + TCGv_i64 tcg_result; + + if ((shift_type == 3) || (!sf && (imm6 > 31))) { + unallocated_encoding(s); + return; + } + + tcg_rn = read_cpu_reg(s, rn, sf); + tcg_rm = read_cpu_reg(s, rm, sf); + + shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, imm6); + + tcg_result = tcg_temp_new_i64(); + + if (!setflags) { + if (sub_op) { + tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); + } + } else { + if (sub_op) { + gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); + } else { + gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); + } + } + + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* Data-processing (3 source) + * + * 31 30 29 28 24 23 21 20 16 15 14 10 9 5 4 0 + * +--+------+-----------+------+------+----+------+------+------+ + * |sf| op54 | 1 1 0 1 1 | op31 | Rm | o0 | Ra | Rn | Rd | + * +--+------+-----------+------+------+----+------+------+------+ + */ +static void disas_data_proc_3src(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int ra = extract32(insn, 10, 5); + int rm = extract32(insn, 16, 5); + int op_id = (extract32(insn, 29, 3) << 4) | + (extract32(insn, 21, 3) << 1) | + extract32(insn, 15, 1); + bool sf = extract32(insn, 31, 1); + bool is_sub = extract32(op_id, 0, 1); + bool is_high = extract32(op_id, 2, 1); + bool is_signed = false; + TCGv_i64 tcg_op1; + TCGv_i64 tcg_op2; + TCGv_i64 tcg_tmp; + + /* Note that op_id is sf:op54:op31:o0 so it includes the 32/64 size flag */ + switch (op_id) { + case 0x42: /* SMADDL */ + case 0x43: /* SMSUBL */ + case 0x44: /* SMULH */ + is_signed = true; + break; + case 0x0: /* MADD (32bit) */ + case 0x1: /* MSUB (32bit) */ + case 0x40: /* MADD (64bit) */ + case 0x41: /* MSUB (64bit) */ + case 0x4a: /* UMADDL */ + case 0x4b: /* UMSUBL */ + case 0x4c: /* UMULH */ + break; + default: + unallocated_encoding(s); + return; + } + + if (is_high) { + TCGv_i64 low_bits = tcg_temp_new_i64(); /* low bits discarded */ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = cpu_reg(s, rn); + TCGv_i64 tcg_rm = cpu_reg(s, rm); + + if (is_signed) { + tcg_gen_muls2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); + } else { + tcg_gen_mulu2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); + } + + tcg_temp_free_i64(low_bits); + return; + } + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_tmp = tcg_temp_new_i64(); + + if (op_id < 0x42) { + tcg_gen_mov_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_mov_i64(tcg_op2, cpu_reg(s, rm)); + } else { + if (is_signed) { + tcg_gen_ext32s_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_ext32s_i64(tcg_op2, cpu_reg(s, rm)); + } else { + tcg_gen_ext32u_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_ext32u_i64(tcg_op2, cpu_reg(s, rm)); + } + } + + if (ra == 31 && !is_sub) { + /* Special-case MADD with rA == XZR; it is the standard MUL alias */ + tcg_gen_mul_i64(cpu_reg(s, rd), tcg_op1, tcg_op2); + } else { + tcg_gen_mul_i64(tcg_tmp, tcg_op1, tcg_op2); + if (is_sub) { + tcg_gen_sub_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); + } else { + tcg_gen_add_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); + } + } + + if (!sf) { + tcg_gen_ext32u_i64(cpu_reg(s, rd), cpu_reg(s, rd)); + } + + 
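+    /* (Informative) the SMULH/UMULH path returned above, so only the
+     * multiply-add/subtract forms reach this common cleanup. */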
tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_tmp); +} + +/* Add/subtract (with carry) + * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 10 9 5 4 0 + * +--+--+--+------------------------+------+-------------+------+-----+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | rm | 0 0 0 0 0 0 | Rn | Rd | + * +--+--+--+------------------------+------+-------------+------+-----+ + */ + +static void disas_adc_sbc(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, setflags, rm, rn, rd; + TCGv_i64 tcg_y, tcg_rn, tcg_rd; + + sf = extract32(insn, 31, 1); + op = extract32(insn, 30, 1); + setflags = extract32(insn, 29, 1); + rm = extract32(insn, 16, 5); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (op) { + tcg_y = new_tmp_a64(s); + tcg_gen_not_i64(tcg_y, cpu_reg(s, rm)); + } else { + tcg_y = cpu_reg(s, rm); + } + + if (setflags) { + gen_adc_CC(sf, tcg_rd, tcg_rn, tcg_y); + } else { + gen_adc(sf, tcg_rd, tcg_rn, tcg_y); + } +} + +/* + * Rotate right into flags + * 31 30 29 21 15 10 5 4 0 + * +--+--+--+-----------------+--------+-----------+------+--+------+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | imm6 | 0 0 0 0 1 | Rn |o2| mask | + * +--+--+--+-----------------+--------+-----------+------+--+------+ + */ +static void disas_rotate_right_into_flags(DisasContext *s, uint32_t insn) +{ + int mask = extract32(insn, 0, 4); + int o2 = extract32(insn, 4, 1); + int rn = extract32(insn, 5, 5); + int imm6 = extract32(insn, 15, 6); + int sf_op_s = extract32(insn, 29, 3); + TCGv_i64 tcg_rn; + TCGv_i32 nzcv; + + if (sf_op_s != 5 || o2 != 0 || !dc_isar_feature(aa64_condm_4, s)) { + unallocated_encoding(s); + return; + } + + tcg_rn = read_cpu_reg(s, rn, 1); + tcg_gen_rotri_i64(tcg_rn, tcg_rn, imm6); + + nzcv = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(nzcv, tcg_rn); + + if (mask & 8) { /* N */ + tcg_gen_shli_i32(cpu_NF, nzcv, 31 - 3); + } + if (mask & 4) { /* Z */ + tcg_gen_not_i32(cpu_ZF, nzcv); + tcg_gen_andi_i32(cpu_ZF, cpu_ZF, 4); + } + if (mask & 2) { /* C */ + tcg_gen_extract_i32(cpu_CF, nzcv, 1, 1); + } + if (mask & 1) { /* V */ + tcg_gen_shli_i32(cpu_VF, nzcv, 31 - 0); + } + + tcg_temp_free_i32(nzcv); +} + +/* + * Evaluate into flags + * 31 30 29 21 15 14 10 5 4 0 + * +--+--+--+-----------------+---------+----+---------+------+--+------+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | opcode2 | sz | 0 0 1 0 | Rn |o3| mask | + * +--+--+--+-----------------+---------+----+---------+------+--+------+ + */ +static void disas_evaluate_into_flags(DisasContext *s, uint32_t insn) +{ + int o3_mask = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int o2 = extract32(insn, 15, 6); + int sz = extract32(insn, 14, 1); + int sf_op_s = extract32(insn, 29, 3); + TCGv_i32 tmp; + int shift; + + if (sf_op_s != 1 || o2 != 0 || o3_mask != 0xd || + !dc_isar_feature(aa64_condm_4, s)) { + unallocated_encoding(s); + return; + } + shift = sz ? 
16 : 24; /* SETF16 or SETF8 */ + + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, cpu_reg(s, rn)); + tcg_gen_shli_i32(cpu_NF, tmp, shift); + tcg_gen_shli_i32(cpu_VF, tmp, shift - 1); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_VF, cpu_NF); + tcg_temp_free_i32(tmp); +} + +/* Conditional compare (immediate / register) + * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 + * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ + * |sf|op| S| 1 1 0 1 0 0 1 0 |imm5/rm | cond |i/r |o2| Rn |o3|nzcv | + * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ + * [1] y [0] [0] + */ +static void disas_cc(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, y, cond, rn, nzcv, is_imm; + TCGv_i32 tcg_t0, tcg_t1, tcg_t2; + TCGv_i64 tcg_tmp, tcg_y, tcg_rn; + DisasCompare c; + + if (!extract32(insn, 29, 1)) { + unallocated_encoding(s); + return; + } + if (insn & (1 << 10 | 1 << 4)) { + unallocated_encoding(s); + return; + } + sf = extract32(insn, 31, 1); + op = extract32(insn, 30, 1); + is_imm = extract32(insn, 11, 1); + y = extract32(insn, 16, 5); /* y = rm (reg) or imm5 (imm) */ + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + nzcv = extract32(insn, 0, 4); + + /* Set T0 = !COND. */ + tcg_t0 = tcg_temp_new_i32(); + arm_test_cc(&c, cond); + tcg_gen_setcondi_i32(tcg_invert_cond(c.cond), tcg_t0, c.value, 0); + arm_free_cc(&c); + + /* Load the arguments for the new comparison. */ + if (is_imm) { + tcg_y = new_tmp_a64(s); + tcg_gen_movi_i64(tcg_y, y); + } else { + tcg_y = cpu_reg(s, y); + } + tcg_rn = cpu_reg(s, rn); + + /* Set the flags for the new comparison. */ + tcg_tmp = tcg_temp_new_i64(); + if (op) { + gen_sub_CC(sf, tcg_tmp, tcg_rn, tcg_y); + } else { + gen_add_CC(sf, tcg_tmp, tcg_rn, tcg_y); + } + tcg_temp_free_i64(tcg_tmp); + + /* If COND was false, force the flags to #nzcv. Compute two masks + * to help with this: T1 = (COND ? 0 : -1), T2 = (COND ? -1 : 0). + * For tcg hosts that support ANDC, we can make do with just T1. + * In either case, allow the tcg optimizer to delete any unused mask. 
+ */ + tcg_t1 = tcg_temp_new_i32(); + tcg_t2 = tcg_temp_new_i32(); + tcg_gen_neg_i32(tcg_t1, tcg_t0); + tcg_gen_subi_i32(tcg_t2, tcg_t0, 1); + + if (nzcv & 8) { /* N */ + tcg_gen_or_i32(cpu_NF, cpu_NF, tcg_t1); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_NF, cpu_NF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_NF, cpu_NF, tcg_t2); + } + } + if (nzcv & 4) { /* Z */ + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_ZF, cpu_ZF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_ZF, cpu_ZF, tcg_t2); + } + } else { + tcg_gen_or_i32(cpu_ZF, cpu_ZF, tcg_t0); + } + if (nzcv & 2) { /* C */ + tcg_gen_or_i32(cpu_CF, cpu_CF, tcg_t0); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_CF, cpu_CF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_CF, cpu_CF, tcg_t2); + } + } + if (nzcv & 1) { /* V */ + tcg_gen_or_i32(cpu_VF, cpu_VF, tcg_t1); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_VF, cpu_VF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_VF, cpu_VF, tcg_t2); + } + } + tcg_temp_free_i32(tcg_t0); + tcg_temp_free_i32(tcg_t1); + tcg_temp_free_i32(tcg_t2); +} + +/* Conditional select + * 31 30 29 28 21 20 16 15 12 11 10 9 5 4 0 + * +----+----+---+-----------------+------+------+-----+------+------+ + * | sf | op | S | 1 1 0 1 0 1 0 0 | Rm | cond | op2 | Rn | Rd | + * +----+----+---+-----------------+------+------+-----+------+------+ + */ +static void disas_cond_select(DisasContext *s, uint32_t insn) +{ + unsigned int sf, else_inv, rm, cond, else_inc, rn, rd; + TCGv_i64 tcg_rd, zero; + DisasCompare64 c; + + if (extract32(insn, 29, 1) || extract32(insn, 11, 1)) { + /* S == 1 or op2<1> == 1 */ + unallocated_encoding(s); + return; + } + sf = extract32(insn, 31, 1); + else_inv = extract32(insn, 30, 1); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + else_inc = extract32(insn, 10, 1); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + tcg_rd = cpu_reg(s, rd); + + a64_test_cc(&c, cond); + zero = tcg_constant_i64(0); + + if (rn == 31 && rm == 31 && (else_inc ^ else_inv)) { + /* CSET & CSETM. 
*/ + tcg_gen_setcond_i64(tcg_invert_cond(c.cond), tcg_rd, c.value, zero); + if (else_inv) { + tcg_gen_neg_i64(tcg_rd, tcg_rd); + } + } else { + TCGv_i64 t_true = cpu_reg(s, rn); + TCGv_i64 t_false = read_cpu_reg(s, rm, 1); + if (else_inv && else_inc) { + tcg_gen_neg_i64(t_false, t_false); + } else if (else_inv) { + tcg_gen_not_i64(t_false, t_false); + } else if (else_inc) { + tcg_gen_addi_i64(t_false, t_false, 1); + } + tcg_gen_movcond_i64(c.cond, tcg_rd, c.value, zero, t_true, t_false); + } + + a64_free_cc(&c); + + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +static void handle_clz(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + tcg_gen_clzi_i32(tcg_tmp32, tcg_tmp32, 32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +static void handle_cls(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_clrsb_i64(tcg_rd, tcg_rn); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + tcg_gen_clrsb_i32(tcg_tmp32, tcg_tmp32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +static void handle_rbit(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + gen_helper_rbit64(tcg_rd, tcg_rn); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + gen_helper_rbit(tcg_tmp32, tcg_tmp32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +/* REV with sf==1, opcode==3 ("REV64") */ +static void handle_rev64(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + if (!sf) { + unallocated_encoding(s); + return; + } + tcg_gen_bswap64_i64(cpu_reg(s, rd), cpu_reg(s, rn)); +} + +/* REV with sf==0, opcode==2 + * REV32 (sf==1, opcode==2) + */ +static void handle_rev32(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_bswap64_i64(tcg_rd, tcg_rn); + tcg_gen_rotri_i64(tcg_rd, tcg_rd, 32); + } else { + tcg_gen_bswap32_i64(tcg_rd, tcg_rn, TCG_BSWAP_OZ); + } +} + +/* REV16 (opcode==1) */ +static void handle_rev16(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); + TCGv_i64 mask = tcg_constant_i64(sf ? 
0x00ff00ff00ff00ffull : 0x00ff00ff); + + tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8); + tcg_gen_and_i64(tcg_rd, tcg_rn, mask); + tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask); + tcg_gen_shli_i64(tcg_rd, tcg_rd, 8); + tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_tmp); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Data-processing (1 source) + * 31 30 29 28 21 20 16 15 10 9 5 4 0 + * +----+---+---+-----------------+---------+--------+------+------+ + * | sf | 1 | S | 1 1 0 1 0 1 1 0 | opcode2 | opcode | Rn | Rd | + * +----+---+---+-----------------+---------+--------+------+------+ + */ +static void disas_data_proc_1src(DisasContext *s, uint32_t insn) +{ + unsigned int sf, opcode, opcode2, rn, rd; + TCGv_i64 tcg_rd; + + if (extract32(insn, 29, 1)) { + unallocated_encoding(s); + return; + } + + sf = extract32(insn, 31, 1); + opcode = extract32(insn, 10, 6); + opcode2 = extract32(insn, 16, 5); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + +#define MAP(SF, O2, O1) ((SF) | (O1 << 1) | (O2 << 7)) + + switch (MAP(sf, opcode2, opcode)) { + case MAP(0, 0x00, 0x00): /* RBIT */ + case MAP(1, 0x00, 0x00): + handle_rbit(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x01): /* REV16 */ + case MAP(1, 0x00, 0x01): + handle_rev16(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x02): /* REV/REV32 */ + case MAP(1, 0x00, 0x02): + handle_rev32(s, sf, rn, rd); + break; + case MAP(1, 0x00, 0x03): /* REV64 */ + handle_rev64(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x04): /* CLZ */ + case MAP(1, 0x00, 0x04): + handle_clz(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x05): /* CLS */ + case MAP(1, 0x00, 0x05): + handle_cls(s, sf, rn, rd); + break; + case MAP(1, 0x01, 0x00): /* PACIA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x01): /* PACIB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x02): /* PACDA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x03): /* PACDB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x04): /* AUTIA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x05): /* AUTIB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x06): /* AUTDA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x07): /* AUTDB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case 
MAP(1, 0x01, 0x08): /* PACIZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x09): /* PACIZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0a): /* PACDZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0b): /* PACDZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0c): /* AUTIZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0d): /* AUTIZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0e): /* AUTDZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0f): /* AUTDZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x10): /* XPACI */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_xpaci(tcg_rd, cpu_env, tcg_rd); + } + break; + case MAP(1, 0x01, 0x11): /* XPACD */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_xpacd(tcg_rd, cpu_env, tcg_rd); + } + break; + default: + do_unallocated: + unallocated_encoding(s); + break; + } + +#undef MAP +} + +static void handle_div(DisasContext *s, bool is_signed, unsigned int sf, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_n, tcg_m, tcg_rd; + tcg_rd = cpu_reg(s, rd); + + if (!sf && is_signed) { + tcg_n = new_tmp_a64(s); + tcg_m = new_tmp_a64(s); + tcg_gen_ext32s_i64(tcg_n, cpu_reg(s, rn)); + tcg_gen_ext32s_i64(tcg_m, cpu_reg(s, rm)); + } else { + tcg_n = read_cpu_reg(s, rn, sf); + tcg_m = read_cpu_reg(s, rm, sf); + } + + if (is_signed) { + gen_helper_sdiv64(tcg_rd, tcg_n, tcg_m); + } else { + gen_helper_udiv64(tcg_rd, tcg_n, tcg_m); + } + + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* LSLV, LSRV, ASRV, RORV */ +static void handle_shift_reg(DisasContext *s, + enum a64_shift_type shift_type, unsigned int sf, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_shift = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); + + 
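+ /* + * The shift amount is taken from Rm modulo the data size: bits [5:0] + * of Rm for a 64-bit op, bits [4:0] for a 32-bit op. The mask below + * (63 or 31) implements that modulo, so e.g. LSLV with an Rm value of + * 64 shifts by 0 rather than producing zero. + */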
tcg_gen_andi_i64(tcg_shift, cpu_reg(s, rm), sf ? 63 : 31); + shift_reg(tcg_rd, tcg_rn, sf, shift_type, tcg_shift); + tcg_temp_free_i64(tcg_shift); +} + +/* CRC32[BHWX], CRC32C[BHWX] */ +static void handle_crc32(DisasContext *s, + unsigned int sf, unsigned int sz, bool crc32c, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_acc, tcg_val; + TCGv_i32 tcg_bytes; + + if (!dc_isar_feature(aa64_crc32, s) + || (sf == 1 && sz != 3) + || (sf == 0 && sz == 3)) { + unallocated_encoding(s); + return; + } + + if (sz == 3) { + tcg_val = cpu_reg(s, rm); + } else { + uint64_t mask; + switch (sz) { + case 0: + mask = 0xFF; + break; + case 1: + mask = 0xFFFF; + break; + case 2: + mask = 0xFFFFFFFF; + break; + default: + g_assert_not_reached(); + } + tcg_val = new_tmp_a64(s); + tcg_gen_andi_i64(tcg_val, cpu_reg(s, rm), mask); + } + + tcg_acc = cpu_reg(s, rn); + tcg_bytes = tcg_constant_i32(1 << sz); + + if (crc32c) { + gen_helper_crc32c_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); + } else { + gen_helper_crc32_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); + } +} + +/* Data-processing (2 source) + * 31 30 29 28 21 20 16 15 10 9 5 4 0 + * +----+---+---+-----------------+------+--------+------+------+ + * | sf | 0 | S | 1 1 0 1 0 1 1 0 | Rm | opcode | Rn | Rd | + * +----+---+---+-----------------+------+--------+------+------+ + */ +static void disas_data_proc_2src(DisasContext *s, uint32_t insn) +{ + unsigned int sf, rm, opcode, rn, rd, setflag; + sf = extract32(insn, 31, 1); + setflag = extract32(insn, 29, 1); + rm = extract32(insn, 16, 5); + opcode = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (setflag && opcode != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SUBP(S) */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 tcg_n, tcg_m, tcg_d; + + tcg_n = read_cpu_reg_sp(s, rn, true); + tcg_m = read_cpu_reg_sp(s, rm, true); + tcg_gen_sextract_i64(tcg_n, tcg_n, 0, 56); + tcg_gen_sextract_i64(tcg_m, tcg_m, 0, 56); + tcg_d = cpu_reg(s, rd); + + if (setflag) { + gen_sub_CC(true, tcg_d, tcg_n, tcg_m); + } else { + tcg_gen_sub_i64(tcg_d, tcg_n, tcg_m); + } + } + break; + case 2: /* UDIV */ + handle_div(s, false, sf, rm, rn, rd); + break; + case 3: /* SDIV */ + handle_div(s, true, sf, rm, rn, rd); + break; + case 4: /* IRG */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + if (s->ata) { + gen_helper_irg(cpu_reg_sp(s, rd), cpu_env, + cpu_reg_sp(s, rn), cpu_reg(s, rm)); + } else { + gen_address_with_allocation_tag0(cpu_reg_sp(s, rd), + cpu_reg_sp(s, rn)); + } + break; + case 5: /* GMI */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, cpu_reg_sp(s, rn), 56, 4); + tcg_gen_shl_i64(t, tcg_constant_i64(1), t); + tcg_gen_or_i64(cpu_reg(s, rd), cpu_reg(s, rm), t); + + tcg_temp_free_i64(t); + } + break; + case 8: /* LSLV */ + handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd); + break; + case 9: /* LSRV */ + handle_shift_reg(s, A64_SHIFT_TYPE_LSR, sf, rm, rn, rd); + break; + case 10: /* ASRV */ + handle_shift_reg(s, A64_SHIFT_TYPE_ASR, sf, rm, rn, rd); + break; + case 11: /* RORV */ + handle_shift_reg(s, A64_SHIFT_TYPE_ROR, sf, rm, rn, rd); + break; + case 12: /* PACGA */ + if (sf == 0 || !dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + gen_helper_pacga(cpu_reg(s, rd), cpu_env, + cpu_reg(s, rn), 
cpu_reg_sp(s, rm)); + break; + case 16: + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: /* CRC32 */ + { + int sz = extract32(opcode, 0, 2); + bool crc32c = extract32(opcode, 2, 1); + handle_crc32(s, sf, sz, crc32c, rm, rn, rd); + break; + } + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +/* + * Data processing - register + * 31 30 29 28 25 21 20 16 10 0 + * +--+---+--+---+-------+-----+-------+-------+---------+ + * | |op0| |op1| 1 0 1 | op2 | | op3 | | + * +--+---+--+---+-------+-----+-------+-------+---------+ + */ +static void disas_data_proc_reg(DisasContext *s, uint32_t insn) +{ + int op0 = extract32(insn, 30, 1); + int op1 = extract32(insn, 28, 1); + int op2 = extract32(insn, 21, 4); + int op3 = extract32(insn, 10, 6); + + if (!op1) { + if (op2 & 8) { + if (op2 & 1) { + /* Add/sub (extended register) */ + disas_add_sub_ext_reg(s, insn); + } else { + /* Add/sub (shifted register) */ + disas_add_sub_reg(s, insn); + } + } else { + /* Logical (shifted register) */ + disas_logic_reg(s, insn); + } + return; + } + + switch (op2) { + case 0x0: + switch (op3) { + case 0x00: /* Add/subtract (with carry) */ + disas_adc_sbc(s, insn); + break; + + case 0x01: /* Rotate right into flags */ + case 0x21: + disas_rotate_right_into_flags(s, insn); + break; + + case 0x02: /* Evaluate into flags */ + case 0x12: + case 0x22: + case 0x32: + disas_evaluate_into_flags(s, insn); + break; + + default: + goto do_unallocated; + } + break; + + case 0x2: /* Conditional compare */ + disas_cc(s, insn); /* both imm and reg forms */ + break; + + case 0x4: /* Conditional select */ + disas_cond_select(s, insn); + break; + + case 0x6: /* Data-processing */ + if (op0) { /* (1 source) */ + disas_data_proc_1src(s, insn); + } else { /* (2 source) */ + disas_data_proc_2src(s, insn); + } + break; + case 0x8 ... 0xf: /* (3 source) */ + disas_data_proc_3src(s, insn); + break; + + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +static void handle_fp_compare(DisasContext *s, int size, + unsigned int rn, unsigned int rm, + bool cmp_with_zero, bool signal_all_nans) +{ + TCGv_i64 tcg_flags = tcg_temp_new_i64(); + TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + if (size == MO_64) { + TCGv_i64 tcg_vn, tcg_vm; + + tcg_vn = read_fp_dreg(s, rn); + if (cmp_with_zero) { + tcg_vm = tcg_constant_i64(0); + } else { + tcg_vm = read_fp_dreg(s, rm); + } + if (signal_all_nans) { + gen_helper_vfp_cmped_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmpd_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + tcg_temp_free_i64(tcg_vn); + tcg_temp_free_i64(tcg_vm); + } else { + TCGv_i32 tcg_vn = tcg_temp_new_i32(); + TCGv_i32 tcg_vm = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_vn, rn, 0, size); + if (cmp_with_zero) { + tcg_gen_movi_i32(tcg_vm, 0); + } else { + read_vec_element_i32(s, tcg_vm, rm, 0, size); + } + + switch (size) { + case MO_32: + if (signal_all_nans) { + gen_helper_vfp_cmpes_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmps_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + break; + case MO_16: + if (signal_all_nans) { + gen_helper_vfp_cmpeh_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmph_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i32(tcg_vn); + tcg_temp_free_i32(tcg_vm); + } + + tcg_temp_free_ptr(fpst); + + gen_set_nzcv(tcg_flags); + + tcg_temp_free_i64(tcg_flags); +} + +/* Floating point compare + * 31 30 29 28 24 23 22 21 20 16 15 14 13 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | op | 1 0 0 0 | Rn | op2 | + * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ + */ +static void disas_fp_compare(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, op, rn, opc, op2r; + int size; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + op = extract32(insn, 14, 2); + rn = extract32(insn, 5, 5); + opc = extract32(insn, 3, 2); + op2r = extract32(insn, 0, 3); + + if (mos || op || op2r) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + size = MO_32; + break; + case 1: + size = MO_64; + break; + case 3: + size = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_fp_compare(s, size, rn, rm, opc & 1, opc & 2); +} + +/* Floating point conditional compare + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 + * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 0 1 | Rn | op | nzcv | + * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ + */ +static void disas_fp_ccomp(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, cond, rn, op, nzcv; + TCGLabel *label_continue = NULL; + int size; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + op = extract32(insn, 4, 1); + nzcv = extract32(insn, 0, 4); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + size = MO_32; + break; + case 1: + size = MO_64; + break; + case 3: + size = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (cond < 0x0e) { /* not always */ + TCGLabel *label_match = gen_new_label(); + label_continue = gen_new_label(); + 
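+ /* + * Branch to label_match when the condition holds; otherwise fall + * through, set NZCV from the immediate and jump past the compare. + * (For cond 0xe/0xf, "always", this whole block is skipped and the + * compare below runs unconditionally.) + */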
arm_gen_test_cc(cond, label_match); + /* nomatch: */ + gen_set_nzcv(tcg_constant_i64(nzcv << 28)); + tcg_gen_br(label_continue); + gen_set_label(label_match); + } + + handle_fp_compare(s, size, rn, rm, false, op); + + if (cond < 0x0e) { + gen_set_label(label_continue); + } +} + +/* Floating point conditional select + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+------+-----+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 1 1 | Rn | Rd | + * +---+---+---+-----------+------+---+------+------+-----+------+------+ + */ +static void disas_fp_csel(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, cond, rn, rd; + TCGv_i64 t_true, t_false; + DisasCompare64 c; + MemOp sz; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + sz = MO_32; + break; + case 1: + sz = MO_64; + break; + case 3: + sz = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + /* Zero extend sreg & hreg inputs to 64 bits now. */ + t_true = tcg_temp_new_i64(); + t_false = tcg_temp_new_i64(); + read_vec_element(s, t_true, rn, 0, sz); + read_vec_element(s, t_false, rm, 0, sz); + + a64_test_cc(&c, cond); + tcg_gen_movcond_i64(c.cond, t_true, c.value, tcg_constant_i64(0), + t_true, t_false); + tcg_temp_free_i64(t_false); + a64_free_cc(&c); + + /* Note that sregs & hregs write back zeros to the high bits, + and we've already done the zero-extension. */ + write_fp_dreg(s, rd, t_true); + tcg_temp_free_i64(t_true); +} + +/* Floating-point data-processing (1 source) - half precision */ +static void handle_fp_1src_half(DisasContext *s, int opcode, int rd, int rn) +{ + TCGv_ptr fpst = NULL; + TCGv_i32 tcg_op = read_fp_hreg(s, rn); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + switch (opcode) { + case 0x0: /* FMOV */ + tcg_gen_mov_i32(tcg_res, tcg_op); + break; + case 0x1: /* FABS */ + tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); + break; + case 0x2: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x3: /* FSQRT */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_sqrt_f16(tcg_res, tcg_op, fpst); + break; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + { + TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(opcode & 7)); + fpst = fpstatus_ptr(FPST_FPCR_F16); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + break; + } + case 0xe: /* FRINTX */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, fpst); + break; + case 0xf: /* FRINTI */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + if (fpst) { + tcg_temp_free_ptr(fpst); + } + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (1 source) - single precision */ +static void handle_fp_1src_single(DisasContext *s, int opcode, int rd, int rn) +{ + void (*gen_fpst)(TCGv_i32, TCGv_i32, TCGv_ptr); + TCGv_i32 tcg_op, tcg_res; + 
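+ /* + * rmode stays -1 for the ops that round using the mode already in + * FPCR; otherwise gen_helper_set_rmode below swaps the requested + * rounding mode in before the operation, and the same call (which + * returns the previous mode) restores it afterwards. + */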
TCGv_ptr fpst; + int rmode = -1; + + tcg_op = read_fp_sreg(s, rn); + tcg_res = tcg_temp_new_i32(); + + switch (opcode) { + case 0x0: /* FMOV */ + tcg_gen_mov_i32(tcg_res, tcg_op); + goto done; + case 0x1: /* FABS */ + gen_helper_vfp_abss(tcg_res, tcg_op); + goto done; + case 0x2: /* FNEG */ + gen_helper_vfp_negs(tcg_res, tcg_op); + goto done; + case 0x3: /* FSQRT */ + gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); + goto done; + case 0x6: /* BFCVT */ + gen_fpst = gen_helper_bfcvt; + break; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + rmode = arm_rmode_to_sf(opcode & 7); + gen_fpst = gen_helper_rints; + break; + case 0xe: /* FRINTX */ + gen_fpst = gen_helper_rints_exact; + break; + case 0xf: /* FRINTI */ + gen_fpst = gen_helper_rints; + break; + case 0x10: /* FRINT32Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint32_s; + break; + case 0x11: /* FRINT32X */ + gen_fpst = gen_helper_frint32_s; + break; + case 0x12: /* FRINT64Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint64_s; + break; + case 0x13: /* FRINT64X */ + gen_fpst = gen_helper_frint64_s; + break; + default: + g_assert_not_reached(); + } + + fpst = fpstatus_ptr(FPST_FPCR); + if (rmode >= 0) { + TCGv_i32 tcg_rmode = tcg_const_i32(rmode); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_fpst(tcg_res, tcg_op, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + } else { + gen_fpst(tcg_res, tcg_op, fpst); + } + tcg_temp_free_ptr(fpst); + + done: + write_fp_sreg(s, rd, tcg_res); + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (1 source) - double precision */ +static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn) +{ + void (*gen_fpst)(TCGv_i64, TCGv_i64, TCGv_ptr); + TCGv_i64 tcg_op, tcg_res; + TCGv_ptr fpst; + int rmode = -1; + + switch (opcode) { + case 0x0: /* FMOV */ + gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0); + return; + } + + tcg_op = read_fp_dreg(s, rn); + tcg_res = tcg_temp_new_i64(); + + switch (opcode) { + case 0x1: /* FABS */ + gen_helper_vfp_absd(tcg_res, tcg_op); + goto done; + case 0x2: /* FNEG */ + gen_helper_vfp_negd(tcg_res, tcg_op); + goto done; + case 0x3: /* FSQRT */ + gen_helper_vfp_sqrtd(tcg_res, tcg_op, cpu_env); + goto done; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + rmode = arm_rmode_to_sf(opcode & 7); + gen_fpst = gen_helper_rintd; + break; + case 0xe: /* FRINTX */ + gen_fpst = gen_helper_rintd_exact; + break; + case 0xf: /* FRINTI */ + gen_fpst = gen_helper_rintd; + break; + case 0x10: /* FRINT32Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint32_d; + break; + case 0x11: /* FRINT32X */ + gen_fpst = gen_helper_frint32_d; + break; + case 0x12: /* FRINT64Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint64_d; + break; + case 0x13: /* FRINT64X */ + gen_fpst = gen_helper_frint64_d; + break; + default: + g_assert_not_reached(); + } + + fpst = fpstatus_ptr(FPST_FPCR); + if (rmode >= 0) { + TCGv_i32 tcg_rmode = tcg_const_i32(rmode); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_fpst(tcg_res, tcg_op, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + } else { + gen_fpst(tcg_res, tcg_op, fpst); + } + tcg_temp_free_ptr(fpst); + + done: + write_fp_dreg(s, rd, tcg_res); + tcg_temp_free_i64(tcg_op); + 
tcg_temp_free_i64(tcg_res); +} + +static void handle_fp_fcvt(DisasContext *s, int opcode, + int rd, int rn, int dtype, int ntype) +{ + switch (ntype) { + case 0x0: + { + TCGv_i32 tcg_rn = read_fp_sreg(s, rn); + if (dtype == 1) { + /* Single to double */ + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + gen_helper_vfp_fcvtds(tcg_rd, tcg_rn, cpu_env); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + } else { + /* Single to half */ + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + TCGv_i32 ahp = get_ahp_flag(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + gen_helper_vfp_fcvt_f32_to_f16(tcg_rd, tcg_rn, fpst, ahp); + /* write_fp_sreg is OK here because top half of tcg_rd is zero */ + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + } + tcg_temp_free_i32(tcg_rn); + break; + } + case 0x1: + { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + if (dtype == 0) { + /* Double to single */ + gen_helper_vfp_fcvtsd(tcg_rd, tcg_rn, cpu_env); + } else { + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + /* Double to half */ + gen_helper_vfp_fcvt_f64_to_f16(tcg_rd, tcg_rn, fpst, ahp); + /* write_fp_sreg is OK here because top half of tcg_rd is zero */ + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i64(tcg_rn); + break; + } + case 0x3: + { + TCGv_i32 tcg_rn = read_fp_sreg(s, rn); + TCGv_ptr tcg_fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 tcg_ahp = get_ahp_flag(); + tcg_gen_ext16u_i32(tcg_rn, tcg_rn); + if (dtype == 0) { + /* Half to single */ + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + gen_helper_vfp_fcvt_f16_to_f32(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + } else { + /* Half to double */ + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + gen_helper_vfp_fcvt_f16_to_f64(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + } + tcg_temp_free_i32(tcg_rn); + tcg_temp_free_ptr(tcg_fpst); + tcg_temp_free_i32(tcg_ahp); + break; + } + default: + g_assert_not_reached(); + } +} + +/* Floating point data-processing (1 source) + * 31 30 29 28 24 23 22 21 20 15 14 10 9 5 4 0 + * +---+---+---+-----------+------+---+--------+-----------+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | opcode | 1 0 0 0 0 | Rn | Rd | + * +---+---+---+-----------+------+---+--------+-----------+------+------+ + */ +static void disas_fp_1src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int opcode = extract32(insn, 15, 6); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (mos) { + goto do_unallocated; + } + + switch (opcode) { + case 0x4: case 0x5: case 0x7: + { + /* FCVT between half, single and double precision */ + int dtype = extract32(opcode, 0, 2); + if (type == 2 || dtype == type) { + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + + handle_fp_fcvt(s, opcode, rd, rn, dtype, type); + break; + } + + case 0x10 ... 0x13: /* FRINT{32,64}{X,Z} */ + if (type > 1 || !dc_isar_feature(aa64_frint, s)) { + goto do_unallocated; + } + /* fall through */ + case 0x0 ... 0x3: + case 0x8 ... 0xc: + case 0xe ... 
0xf: + /* 32-to-32 and 64-to-64 ops */ + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_single(s, opcode, rd, rn); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_double(s, opcode, rd, rn); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_half(s, opcode, rd, rn); + break; + default: + goto do_unallocated; + } + break; + + case 0x6: + switch (type) { + case 1: /* BFCVT */ + if (!dc_isar_feature(aa64_bf16, s)) { + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_single(s, opcode, rd, rn); + break; + default: + goto do_unallocated; + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +/* Floating-point data-processing (2 source) - single precision */ +static void handle_fp_2src_single(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_op1 = read_fp_sreg(s, rn); + tcg_op2 = read_fp_sreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_negs(tcg_res, tcg_res); + break; + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (2 source) - double precision */ +static void handle_fp_2src_double(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i64 tcg_op1; + TCGv_i64 tcg_op2; + TCGv_i64 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i64(); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_op1 = read_fp_dreg(s, rn); + tcg_op2 = read_fp_dreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_negd(tcg_res, tcg_res); + break; + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + 
tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); +} + +/* Floating-point data-processing (2 source) - half precision */ +static void handle_fp_2src_half(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR_F16); + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); +} + +/* Floating point data-processing (2 source) + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_fp_2src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int opcode = extract32(insn, 12, 4); + + if (opcode > 8 || mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_single(s, opcode, rd, rn, rm); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_double(s, opcode, rd, rn, rm); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_half(s, opcode, rd, rn, rm); + break; + default: + unallocated_encoding(s); + } +} + +/* Floating-point data-processing (3 source) - single precision */ +static void handle_fp_3src_single(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; + TCGv_i32 tcg_res = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + tcg_op1 = read_fp_sreg(s, rn); + tcg_op2 = read_fp_sreg(s, rm); + tcg_op3 = read_fp_sreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct: an input NaN should come out with its sign bit + * flipped if it is a negated input.
+ */ + if (o1 == true) { + gen_helper_vfp_negs(tcg_op3, tcg_op3); + } + + if (o0 != o1) { + gen_helper_vfp_negs(tcg_op1, tcg_op1); + } + + gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (3 source) - double precision */ +static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i64 tcg_op1, tcg_op2, tcg_op3; + TCGv_i64 tcg_res = tcg_temp_new_i64(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + tcg_op1 = read_fp_dreg(s, rn); + tcg_op2 = read_fp_dreg(s, rm); + tcg_op3 = read_fp_dreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. + */ + if (o1 == true) { + gen_helper_vfp_negd(tcg_op3, tcg_op3); + } + + if (o0 != o1) { + gen_helper_vfp_negd(tcg_op1, tcg_op1); + } + + gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_op3); + tcg_temp_free_i64(tcg_res); +} + +/* Floating-point data-processing (3 source) - half precision */ +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; + TCGv_i32 tcg_res = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR_F16); + + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + tcg_op3 = read_fp_hreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. 
+ */ + if (o1 == true) { + tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000); + } + + if (o0 != o1) { + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); + } + + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); +} + +/* Floating point data-processing (3 source) + * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0 + * +---+---+---+-----------+------+----+------+----+------+------+------+ + * | M | 0 | S | 1 1 1 1 1 | type | o1 | Rm | o0 | Ra | Rn | Rd | + * +---+---+---+-----------+------+----+------+----+------+------+------+ + */ +static void disas_fp_3src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int ra = extract32(insn, 10, 5); + int rm = extract32(insn, 16, 5); + bool o0 = extract32(insn, 15, 1); + bool o1 = extract32(insn, 21, 1); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_single(s, o0, o1, rd, rn, rm, ra); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra); + break; + default: + unallocated_encoding(s); + } +} + +/* Floating point immediate + * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0 + * +---+---+---+-----------+------+---+------------+-------+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd | + * +---+---+---+-----------+------+---+------------+-------+------+------+ + */ +static void disas_fp_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int imm5 = extract32(insn, 5, 5); + int imm8 = extract32(insn, 13, 8); + int type = extract32(insn, 22, 2); + int mos = extract32(insn, 29, 3); + uint64_t imm; + MemOp sz; + + if (mos || imm5) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + sz = MO_32; + break; + case 1: + sz = MO_64; + break; + case 3: + sz = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + imm = vfp_expand_imm(sz, imm8); + write_fp_dreg(s, rd, tcg_constant_i64(imm)); +} + +/* Handle floating point <=> fixed point conversions. Note that we can + * also deal with fp <=> integer conversions as a special case (scale == 64) + * OPTME: consider handling that special case specially or at least skipping + * the call to scalbn in the helpers for zero shifts. + */ +static void handle_fpfpcvt(DisasContext *s, int rd, int rn, int opcode, + bool itof, int rmode, int scale, int sf, int type) +{ + bool is_signed = !(opcode & 1); + TCGv_ptr tcg_fpstatus; + TCGv_i32 tcg_shift, tcg_single; + TCGv_i64 tcg_double; + + tcg_fpstatus = fpstatus_ptr(type == 3 ? 
FPST_FPCR_F16 : FPST_FPCR); + + tcg_shift = tcg_constant_i32(64 - scale); + + if (itof) { + TCGv_i64 tcg_int = cpu_reg(s, rn); + if (!sf) { + TCGv_i64 tcg_extend = new_tmp_a64(s); + + if (is_signed) { + tcg_gen_ext32s_i64(tcg_extend, tcg_int); + } else { + tcg_gen_ext32u_i64(tcg_extend, tcg_int); + } + + tcg_int = tcg_extend; + } + + switch (type) { + case 1: /* float64 */ + tcg_double = tcg_temp_new_i64(); + if (is_signed) { + gen_helper_vfp_sqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_dreg(s, rd, tcg_double); + tcg_temp_free_i64(tcg_double); + break; + + case 0: /* float32 */ + tcg_single = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_sqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_sreg(s, rd, tcg_single); + tcg_temp_free_i32(tcg_single); + break; + + case 3: /* float16 */ + tcg_single = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_sqtoh(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtoh(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_sreg(s, rd, tcg_single); + tcg_temp_free_i32(tcg_single); + break; + + default: + g_assert_not_reached(); + } + } else { + TCGv_i64 tcg_int = cpu_reg(s, rd); + TCGv_i32 tcg_rmode; + + if (extract32(opcode, 2, 1)) { + /* There are too many rounding modes to all fit into rmode, + * so FCVTA[US] is a special case. + */ + rmode = FPROUNDING_TIEAWAY; + } + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + + switch (type) { + case 1: /* float64 */ + tcg_double = read_fp_dreg(s, rn); + if (is_signed) { + if (!sf) { + gen_helper_vfp_tosld(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosqd(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } + } else { + if (!sf) { + gen_helper_vfp_tould(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqd(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } + } + if (!sf) { + tcg_gen_ext32u_i64(tcg_int, tcg_int); + } + tcg_temp_free_i64(tcg_double); + break; + + case 0: /* float32 */ + tcg_single = read_fp_sreg(s, rn); + if (sf) { + if (is_signed) { + gen_helper_vfp_tosqs(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqs(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } + } else { + TCGv_i32 tcg_dest = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_tosls(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touls(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } + tcg_gen_extu_i32_i64(tcg_int, tcg_dest); + tcg_temp_free_i32(tcg_dest); + } + tcg_temp_free_i32(tcg_single); + break; + + case 3: /* float16 */ + tcg_single = read_fp_sreg(s, rn); + if (sf) { + if (is_signed) { + gen_helper_vfp_tosqh(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqh(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } + } else { + TCGv_i32 tcg_dest = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_toslh(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_toulh(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } + tcg_gen_extu_i32_i64(tcg_int, tcg_dest); + tcg_temp_free_i32(tcg_dest); + } + tcg_temp_free_i32(tcg_single); + break; + + default: + g_assert_not_reached(); + } + + gen_helper_set_rmode(tcg_rmode, 
tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + + tcg_temp_free_ptr(tcg_fpstatus); +} + +/* Floating point <-> fixed point conversions + * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 + * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ + * | sf | 0 | S | 1 1 1 1 0 | type | 0 | rmode | opcode | scale | Rn | Rd | + * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ + */ +static void disas_fp_fixed_conv(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int scale = extract32(insn, 10, 6); + int opcode = extract32(insn, 16, 3); + int rmode = extract32(insn, 19, 2); + int type = extract32(insn, 22, 2); + bool sbit = extract32(insn, 29, 1); + bool sf = extract32(insn, 31, 1); + bool itof; + + if (sbit || (!sf && scale < 32)) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: /* float32 */ + case 1: /* float64 */ + break; + case 3: /* float16 */ + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + switch ((rmode << 3) | opcode) { + case 0x2: /* SCVTF */ + case 0x3: /* UCVTF */ + itof = true; + break; + case 0x18: /* FCVTZS */ + case 0x19: /* FCVTZU */ + itof = false; + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_fpfpcvt(s, rd, rn, opcode, itof, FPROUNDING_ZERO, scale, sf, type); +} + +static void handle_fmov(DisasContext *s, int rd, int rn, int type, bool itof) +{ + /* FMOV: gpr to or from float, double, or top half of quad fp reg, + * without conversion. + */ + + if (itof) { + TCGv_i64 tcg_rn = cpu_reg(s, rn); + TCGv_i64 tmp; + + switch (type) { + case 0: + /* 32 bit */ + tmp = tcg_temp_new_i64(); + tcg_gen_ext32u_i64(tmp, tcg_rn); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); + break; + case 1: + /* 64 bit */ + write_fp_dreg(s, rd, tcg_rn); + break; + case 2: + /* 64 bit to top half. 
*/ + tcg_gen_st_i64(tcg_rn, cpu_env, fp_reg_hi_offset(s, rd)); + clear_vec_high(s, true, rd); + break; + case 3: + /* 16 bit */ + tmp = tcg_temp_new_i64(); + tcg_gen_ext16u_i64(tmp, tcg_rn); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); + break; + default: + g_assert_not_reached(); + } + } else { + TCGv_i64 tcg_rd = cpu_reg(s, rd); + + switch (type) { + case 0: + /* 32 bit */ + tcg_gen_ld32u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_32)); + break; + case 1: + /* 64 bit */ + tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_64)); + break; + case 2: + /* 64 bits from top half */ + tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_hi_offset(s, rn)); + break; + case 3: + /* 16 bit */ + tcg_gen_ld16u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_16)); + break; + default: + g_assert_not_reached(); + } + } +} + +static void handle_fjcvtzs(DisasContext *s, int rd, int rn) +{ + TCGv_i64 t = read_fp_dreg(s, rn); + TCGv_ptr fpstatus = fpstatus_ptr(FPST_FPCR); + + gen_helper_fjcvtzs(t, t, fpstatus); + + tcg_temp_free_ptr(fpstatus); + + tcg_gen_ext32u_i64(cpu_reg(s, rd), t); + tcg_gen_extrh_i64_i32(cpu_ZF, t); + tcg_gen_movi_i32(cpu_CF, 0); + tcg_gen_movi_i32(cpu_NF, 0); + tcg_gen_movi_i32(cpu_VF, 0); + + tcg_temp_free_i64(t); +} + +/* Floating point <-> integer conversions + * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 + * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ + * | sf | 0 | S | 1 1 1 1 0 | type | 1 | rmode | opc | 0 0 0 0 0 0 | Rn | Rd | + * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ + */ +static void disas_fp_int_conv(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 16, 3); + int rmode = extract32(insn, 19, 2); + int type = extract32(insn, 22, 2); + bool sbit = extract32(insn, 29, 1); + bool sf = extract32(insn, 31, 1); + bool itof = false; + + if (sbit) { + goto do_unallocated; + } + + switch (opcode) { + case 2: /* SCVTF */ + case 3: /* UCVTF */ + itof = true; + /* fallthru */ + case 4: /* FCVTAS */ + case 5: /* FCVTAU */ + if (rmode != 0) { + goto do_unallocated; + } + /* fallthru */ + case 0: /* FCVT[NPMZ]S */ + case 1: /* FCVT[NPMZ]U */ + switch (type) { + case 0: /* float32 */ + case 1: /* float64 */ + break; + case 3: /* float16 */ + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + break; + default: + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + handle_fpfpcvt(s, rd, rn, opcode, itof, rmode, 64, sf, type); + break; + + default: + switch (sf << 7 | type << 5 | rmode << 3 | opcode) { + case 0b01100110: /* FMOV half <-> 32-bit int */ + case 0b01100111: + case 0b11100110: /* FMOV half <-> 64-bit int */ + case 0b11100111: + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + /* fallthru */ + case 0b00000110: /* FMOV 32-bit */ + case 0b00000111: + case 0b10100110: /* FMOV 64-bit */ + case 0b10100111: + case 0b11001110: /* FMOV top half of 128-bit */ + case 0b11001111: + if (!fp_access_check(s)) { + return; + } + itof = opcode & 1; + handle_fmov(s, rd, rn, type, itof); + break; + + case 0b00111110: /* FJCVTZS */ + if (!dc_isar_feature(aa64_jscvt, s)) { + goto do_unallocated; + } else if (fp_access_check(s)) { + handle_fjcvtzs(s, rd, rn); + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + break; + } +} + +/* FP-specific subcases of table C3-6 (SIMD and FP data processing) + * 31 30 29 28 25 24 0 + * 
+---+---+---+---------+-----------------------------+ + * | | 0 | | 1 1 1 1 | | + * +---+---+---+---------+-----------------------------+ + */ +static void disas_data_proc_fp(DisasContext *s, uint32_t insn) +{ + if (extract32(insn, 24, 1)) { + /* Floating point data-processing (3 source) */ + disas_fp_3src(s, insn); + } else if (extract32(insn, 21, 1) == 0) { + /* Floating point to fixed point conversions */ + disas_fp_fixed_conv(s, insn); + } else { + switch (extract32(insn, 10, 2)) { + case 1: + /* Floating point conditional compare */ + disas_fp_ccomp(s, insn); + break; + case 2: + /* Floating point data-processing (2 source) */ + disas_fp_2src(s, insn); + break; + case 3: + /* Floating point conditional select */ + disas_fp_csel(s, insn); + break; + case 0: + switch (ctz32(extract32(insn, 12, 4))) { + case 0: /* [15:12] == xxx1 */ + /* Floating point immediate */ + disas_fp_imm(s, insn); + break; + case 1: /* [15:12] == xx10 */ + /* Floating point compare */ + disas_fp_compare(s, insn); + break; + case 2: /* [15:12] == x100 */ + /* Floating point data-processing (1 source) */ + disas_fp_1src(s, insn); + break; + case 3: /* [15:12] == 1000 */ + unallocated_encoding(s); + break; + default: /* [15:12] == 0000 */ + /* Floating point <-> integer conversions */ + disas_fp_int_conv(s, insn); + break; + } + break; + } + } +} + +static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right, + int pos) +{ + /* Extract 64 bits from the middle of two concatenated 64 bit + * vector register slices left:right. The extracted bits start + * at 'pos' bits into the right (least significant) side. + * We return the result in tcg_right, and guarantee not to + * trash tcg_left. + */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + assert(pos > 0 && pos < 64); + + tcg_gen_shri_i64(tcg_right, tcg_right, pos); + tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos); + tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp); + + tcg_temp_free_i64(tcg_tmp); +} + +/* EXT + * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+-------------+-----+---+------+---+------+---+------+------+ + * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd | + * +---+---+-------------+-----+---+------+---+------+---+------+------+ + */ +static void disas_simd_ext(DisasContext *s, uint32_t insn) +{ + int is_q = extract32(insn, 30, 1); + int op2 = extract32(insn, 22, 2); + int imm4 = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int pos = imm4 << 3; + TCGv_i64 tcg_resl, tcg_resh; + + if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_resh = tcg_temp_new_i64(); + tcg_resl = tcg_temp_new_i64(); + + /* Vd gets bits starting at pos bits into Vm:Vn. This is + * either extracting 128 bits from a 128:128 concatenation, or + * extracting 64 bits from a 64:64 concatenation. 
+ */ + if (!is_q) { + read_vec_element(s, tcg_resl, rn, 0, MO_64); + if (pos != 0) { + read_vec_element(s, tcg_resh, rm, 0, MO_64); + do_ext64(s, tcg_resh, tcg_resl, pos); + } + } else { + TCGv_i64 tcg_hh; + typedef struct { + int reg; + int elt; + } EltPosns; + EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} }; + EltPosns *elt = eltposns; + + if (pos >= 64) { + elt++; + pos -= 64; + } + + read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64); + elt++; + read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64); + elt++; + if (pos != 0) { + do_ext64(s, tcg_resh, tcg_resl, pos); + tcg_hh = tcg_temp_new_i64(); + read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64); + do_ext64(s, tcg_hh, tcg_resh, pos); + tcg_temp_free_i64(tcg_hh); + } + } + + write_vec_element(s, tcg_resl, rd, 0, MO_64); + tcg_temp_free_i64(tcg_resl); + if (is_q) { + write_vec_element(s, tcg_resh, rd, 1, MO_64); + } + tcg_temp_free_i64(tcg_resh); + clear_vec_high(s, is_q, rd); +} + +/* TBL/TBX + * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ + * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd | + * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ + */ +static void disas_simd_tb(DisasContext *s, uint32_t insn) +{ + int op2 = extract32(insn, 22, 2); + int is_q = extract32(insn, 30, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int is_tbx = extract32(insn, 12, 1); + int len = (extract32(insn, 13, 2) + 1) * 16; + + if (op2 != 0) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + (len << 6) | (is_tbx << 5) | rn, + gen_helper_simd_tblx); +} + +/* ZIP/UZP/TRN + * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 + * +---+---+-------------+------+---+------+---+------------------+------+ + * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd | + * +---+---+-------------+------+---+------+---+------------------+------+ + */ +static void disas_simd_zip_trn(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + /* opc field bits [1:0] indicate ZIP/UZP/TRN; + * bit 2 indicates 1 vs 2 variant of the insn. + */ + int opcode = extract32(insn, 12, 2); + bool part = extract32(insn, 14, 1); + bool is_q = extract32(insn, 30, 1); + int esize = 8 << size; + int i, ofs; + int datasize = is_q ? 128 : 64; + int elements = datasize / esize; + TCGv_i64 tcg_res, tcg_resl, tcg_resh; + + if (opcode == 0 || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_resl = tcg_const_i64(0); + tcg_resh = is_q ? 
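/*
 * A worked example of the element selection done by the loop below,
 * for size = 0, is_q = 0 (eight 8-bit elements) and part = 0:
 *   UZP1: Rd = Rn[0] Rn[2] Rn[4] Rn[6] Rm[0] Rm[2] Rm[4] Rm[6]
 *   TRN1: Rd = Rn[0] Rm[0] Rn[2] Rm[2] Rn[4] Rm[4] Rn[6] Rm[6]
 *   ZIP1: Rd = Rn[0] Rm[0] Rn[1] Rm[1] Rn[2] Rm[2] Rn[3] Rm[3]
 * With part = 1 the odd-numbered elements (UZP2/TRN2) or the upper
 * halves of the sources (ZIP2) are selected instead.
 */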
tcg_const_i64(0) : NULL; + tcg_res = tcg_temp_new_i64(); + + for (i = 0; i < elements; i++) { + switch (opcode) { + case 1: /* UZP1/2 */ + { + int midpoint = elements / 2; + if (i < midpoint) { + read_vec_element(s, tcg_res, rn, 2 * i + part, size); + } else { + read_vec_element(s, tcg_res, rm, + 2 * (i - midpoint) + part, size); + } + break; + } + case 2: /* TRN1/2 */ + if (i & 1) { + read_vec_element(s, tcg_res, rm, (i & ~1) + part, size); + } else { + read_vec_element(s, tcg_res, rn, (i & ~1) + part, size); + } + break; + case 3: /* ZIP1/2 */ + { + int base = part * elements / 2; + if (i & 1) { + read_vec_element(s, tcg_res, rm, base + (i >> 1), size); + } else { + read_vec_element(s, tcg_res, rn, base + (i >> 1), size); + } + break; + } + default: + g_assert_not_reached(); + } + + ofs = i * esize; + if (ofs < 64) { + tcg_gen_shli_i64(tcg_res, tcg_res, ofs); + tcg_gen_or_i64(tcg_resl, tcg_resl, tcg_res); + } else { + tcg_gen_shli_i64(tcg_res, tcg_res, ofs - 64); + tcg_gen_or_i64(tcg_resh, tcg_resh, tcg_res); + } + } + + tcg_temp_free_i64(tcg_res); + + write_vec_element(s, tcg_resl, rd, 0, MO_64); + tcg_temp_free_i64(tcg_resl); + + if (is_q) { + write_vec_element(s, tcg_resh, rd, 1, MO_64); + tcg_temp_free_i64(tcg_resh); + } + clear_vec_high(s, is_q, rd); +} + +/* + * do_reduction_op helper + * + * This mirrors the Reduce() pseudocode in the ARM ARM. It is + * important for correct NaN propagation that we do these + * operations in exactly the order specified by the pseudocode. + * + * This is a recursive function, TCG temps should be freed by the + * calling function once it is done with the values. + */ +static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn, + int esize, int size, int vmap, TCGv_ptr fpst) +{ + if (esize == size) { + int element; + MemOp msize = esize == 16 ? 
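/*
 * The vmap argument is a bitmap of the vector elements still taking
 * part at this level of the reduction tree.  For instance, reducing
 * four elements enters with vmap = 0b1111; the split below computes
 * shift = ctpop8(vmap) / 2 = 2, giving
 * vmap_lo = (0b1111 >> 2) & 0b1111 = 0b0011 and vmap_hi = 0b1100,
 * and each half recurses until only a single bit (one element)
 * remains.
 */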
MO_16 : MO_32; + TCGv_i32 tcg_elem; + + /* We should have one register left here */ + assert(ctpop8(vmap) == 1); + element = ctz32(vmap); + assert(element < 8); + + tcg_elem = tcg_temp_new_i32(); + read_vec_element_i32(s, tcg_elem, rn, element, msize); + return tcg_elem; + } else { + int bits = size / 2; + int shift = ctpop8(vmap) / 2; + int vmap_lo = (vmap >> shift) & vmap; + int vmap_hi = (vmap & ~vmap_lo); + TCGv_i32 tcg_hi, tcg_lo, tcg_res; + + tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst); + tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst); + tcg_res = tcg_temp_new_i32(); + + switch (fpopcode) { + case 0x0c: /* fmaxnmv half-precision */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x0f: /* fmaxv half-precision */ + gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x1c: /* fminnmv half-precision */ + gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x1f: /* fminv half-precision */ + gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x2c: /* fmaxnmv */ + gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x2f: /* fmaxv */ + gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x3c: /* fminnmv */ + gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x3f: /* fminv */ + gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst); + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i32(tcg_hi); + tcg_temp_free_i32(tcg_lo); + return tcg_res; + } +} + +/* AdvSIMD across lanes + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_across_lanes(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + bool is_q = extract32(insn, 30, 1); + bool is_u = extract32(insn, 29, 1); + bool is_fp = false; + bool is_min = false; + int esize; + int elements; + int i; + TCGv_i64 tcg_res, tcg_elt; + + switch (opcode) { + case 0x1b: /* ADDV */ + if (is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x3: /* SADDLV, UADDLV */ + case 0xa: /* SMAXV, UMAXV */ + case 0x1a: /* SMINV, UMINV */ + if (size == 3 || (size == 2 && !is_q)) { + unallocated_encoding(s); + return; + } + break; + case 0xc: /* FMAXNMV, FMINNMV */ + case 0xf: /* FMAXV, FMINV */ + /* Bit 1 of size field encodes min vs max and the actual size + * depends on the encoding of the U bit. If not set (and FP16 + * enabled) then we do half-precision float instead of single + * precision. + */ + is_min = extract32(size, 1, 1); + is_fp = true; + if (!is_u && dc_isar_feature(aa64_fp16, s)) { + size = 1; + } else if (!is_u || !is_q || extract32(size, 0, 1)) { + unallocated_encoding(s); + return; + } else { + size = 2; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + esize = 8 << size; + elements = (is_q ? 128 : 64) / esize; + + tcg_res = tcg_temp_new_i64(); + tcg_elt = tcg_temp_new_i64(); + + /* These instructions operate across all lanes of a vector + * to produce a single result. 
We can guarantee that a 64 + * bit intermediate is sufficient: + * + for [US]ADDLV the maximum element size is 32 bits, and + * the result type is 64 bits + * + for FMAX*V, FMIN*V, ADDV the intermediate type is the + * same as the element size, which is 32 bits at most + * For the integer operations we can choose to work at 64 + * or 32 bits and truncate at the end; for simplicity + * we use 64 bits always. The floating point + * ops do require 32 bit intermediates, though. + */ + if (!is_fp) { + read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN)); + + for (i = 1; i < elements; i++) { + read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN)); + + switch (opcode) { + case 0x03: /* SADDLV / UADDLV */ + case 0x1b: /* ADDV */ + tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt); + break; + case 0x0a: /* SMAXV / UMAXV */ + if (is_u) { + tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt); + } else { + tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt); + } + break; + case 0x1a: /* SMINV / UMINV */ + if (is_u) { + tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt); + } else { + tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt); + } + break; + default: + g_assert_not_reached(); + } + + } + } else { + /* Floating point vector reduction ops which work across 32 + * bit (single) or 16 bit (half-precision) intermediates. + * Note that correct NaN propagation requires that we do these + * operations in exactly the order specified by the pseudocode. + */ + TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + int fpopcode = opcode | is_min << 4 | is_u << 5; + int vmap = (1 << elements) - 1; + TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize, + (is_q ? 128 : 64), vmap, fpst); + tcg_gen_extu_i32_i64(tcg_res, tcg_res32); + tcg_temp_free_i32(tcg_res32); + tcg_temp_free_ptr(fpst); + } + + tcg_temp_free_i64(tcg_elt); + + /* Now truncate the result to the width required for the final output */ + if (opcode == 0x03) { + /* SADDLV, UADDLV: result is 2*esize */ + size++; + } + + switch (size) { + case 0: + tcg_gen_ext8u_i64(tcg_res, tcg_res); + break; + case 1: + tcg_gen_ext16u_i64(tcg_res, tcg_res); + break; + case 2: + tcg_gen_ext32u_i64(tcg_res, tcg_res); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + tcg_temp_free_i64(tcg_res); +} + +/* DUP (Element, Vector) + * + * 31 30 29 21 20 16 15 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + int index; + + if (size > 3 || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + index = imm5 >> (size + 1); + tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd), + vec_reg_offset(s, rn, index, size), + is_q ? 
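/*
 * Worked example of the imm5 encoding: imm5 = 0b10100 has its lowest
 * set bit at position 2, so size = MO_32 and the element index is
 * imm5 >> (size + 1) = 0b10100 >> 3 = 2; the DUP thus replicates
 * 32-bit element 2 of Vn across the whole destination.
 */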
16 : 8, vec_full_reg_size(s)); +} + +/* DUP (element, scalar) + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+--------+-------------+------+------+ + * | 0 1 0 1 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | + * +-----------------------+--------+-------------+------+------+ + */ +static void handle_simd_dupes(DisasContext *s, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + int index; + TCGv_i64 tmp; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + index = imm5 >> (size + 1); + + /* This instruction just extracts the specified element and + * zero-extends it into the bottom of the destination register. + */ + tmp = tcg_temp_new_i64(); + read_vec_element(s, tmp, rn, index, size); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); +} + +/* DUP (General) + * + * 31 30 29 21 20 16 15 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 1 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + uint32_t dofs, oprsz, maxsz; + + if (size > 3 || ((size == 3) && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + dofs = vec_full_reg_offset(s, rd); + oprsz = is_q ? 16 : 8; + maxsz = vec_full_reg_size(s); + + tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn)); +} + +/* INS (Element) + * + * 31 21 20 16 15 14 11 10 9 5 4 0 + * +-----------------------+--------+------------+---+------+------+ + * | 0 1 1 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +-----------------------+--------+------------+---+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + * index: encoded in imm5<4:size+1> + */ +static void handle_simd_inse(DisasContext *s, int rd, int rn, + int imm4, int imm5) +{ + int size = ctz32(imm5); + int src_index, dst_index; + TCGv_i64 tmp; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + dst_index = extract32(imm5, 1+size, 5); + src_index = extract32(imm4, size, 4); + + tmp = tcg_temp_new_i64(); + + read_vec_element(s, tmp, rn, src_index, size); + write_vec_element(s, tmp, rd, dst_index, size); + + tcg_temp_free_i64(tmp); + + /* INS is considered a 128-bit write for SVE. */ + clear_vec_high(s, true, rd); +} + + +/* INS (General) + * + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+--------+-------------+------+------+ + * | 0 1 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 1 1 1 | Rn | Rd | + * +-----------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + * index: encoded in imm5<4:size+1> + */ +static void handle_simd_insg(DisasContext *s, int rd, int rn, int imm5) +{ + int size = ctz32(imm5); + int idx; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + idx = extract32(imm5, 1 + size, 4 - size); + write_vec_element(s, cpu_reg(s, rn), rd, idx, size); + + /* INS is considered a 128-bit write for SVE. 
*/ + clear_vec_high(s, true, rd); +} + +/* + * UMOV (General) + * SMOV (General) + * + * 31 30 29 21 20 16 15 12 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 1 U 1 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * U: unsigned when set + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_umov_smov(DisasContext *s, int is_q, int is_signed, + int rn, int rd, int imm5) +{ + int size = ctz32(imm5); + int element; + TCGv_i64 tcg_rd; + + /* Check for UnallocatedEncodings */ + if (is_signed) { + if (size > 2 || (size == 2 && !is_q)) { + unallocated_encoding(s); + return; + } + } else { + if (size > 3 + || (size < 3 && is_q) + || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + element = extract32(imm5, 1+size, 4); + + tcg_rd = cpu_reg(s, rd); + read_vec_element(s, tcg_rd, rn, element, size | (is_signed ? MO_SIGN : 0)); + if (is_signed && !is_q) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* AdvSIMD copy + * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+----+-----------------+------+---+------+---+------+------+ + * | 0 | Q | op | 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +---+---+----+-----------------+------+---+------+---+------+------+ + */ +static void disas_simd_copy(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm4 = extract32(insn, 11, 4); + int op = extract32(insn, 29, 1); + int is_q = extract32(insn, 30, 1); + int imm5 = extract32(insn, 16, 5); + + if (op) { + if (is_q) { + /* INS (element) */ + handle_simd_inse(s, rd, rn, imm4, imm5); + } else { + unallocated_encoding(s); + } + } else { + switch (imm4) { + case 0: + /* DUP (element - vector) */ + handle_simd_dupe(s, is_q, rd, rn, imm5); + break; + case 1: + /* DUP (general) */ + handle_simd_dupg(s, is_q, rd, rn, imm5); + break; + case 3: + if (is_q) { + /* INS (general) */ + handle_simd_insg(s, rd, rn, imm5); + } else { + unallocated_encoding(s); + } + break; + case 5: + case 7: + /* UMOV/SMOV (is_q indicates 32/64; imm4 indicates signedness) */ + handle_simd_umov_smov(s, is_q, (imm4 == 5), rn, rd, imm5); + break; + default: + unallocated_encoding(s); + break; + } + } +} + +/* AdvSIMD modified immediate + * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0 + * +---+---+----+---------------------+-----+-------+----+---+-------+------+ + * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd | + * +---+---+----+---------------------+-----+-------+----+---+-------+------+ + * + * There are a number of operations that can be carried out here: + * MOVI - move (shifted) imm into register + * MVNI - move inverted (shifted) imm into register + * ORR - bitwise OR of (shifted) imm with register + * BIC - bitwise clear of (shifted) imm with register + * With ARMv8.2 we also have: + * FMOV half-precision + */ +static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int cmode = extract32(insn, 12, 4); + int o2 = extract32(insn, 11, 1); + uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5); + bool is_neg = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + uint64_t imm = 0; + + if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) { + /* Check for FMOV (vector, immediate) - half-precision */ + if (!(dc_isar_feature(aa64_fp16, s) && o2 && cmode == 0xf)) { 
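/*
 * The outer condition has gathered every encoding with o2 == 1 plus
 * the cmode == 0xf, op == 1, Q == 0 hole; of those the only
 * allocated form is FMOV (vector, immediate) half-precision, which
 * requires o2 == 1, cmode == 0xf and FEAT_FP16.  Everything else
 * falls through to here and UNDEFs.
 */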
+ unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + if (cmode == 15 && o2 && !is_neg) { + /* FMOV (vector, immediate) - half-precision */ + imm = vfp_expand_imm(MO_16, abcdefgh); + /* now duplicate across the lanes */ + imm = dup_const(MO_16, imm); + } else { + imm = asimd_imm_const(abcdefgh, cmode, is_neg); + } + + if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { + /* MOVI or MVNI, with MVNI negation handled above. */ + tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8, + vec_full_reg_size(s), imm); + } else { + /* ORR or BIC, with BIC negation to AND handled above. */ + if (is_neg) { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); + } else { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); + } + } +} + +/* AdvSIMD scalar copy + * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 + * +-----+----+-----------------+------+---+------+---+------+------+ + * | 0 1 | op | 1 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +-----+----+-----------------+------+---+------+---+------+------+ + */ +static void disas_simd_scalar_copy(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm4 = extract32(insn, 11, 4); + int imm5 = extract32(insn, 16, 5); + int op = extract32(insn, 29, 1); + + if (op != 0 || imm4 != 0) { + unallocated_encoding(s); + return; + } + + /* DUP (element, scalar) */ + handle_simd_dupes(s, rd, rn, imm5); +} + +/* AdvSIMD scalar pairwise + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn) +{ + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + TCGv_ptr fpst; + + /* For some ops (the FP ones), size[1] is part of the encoding. + * For ADDP strictly it is not but size[1] is always 1 for valid + * encodings. + */ + opcode |= (extract32(size, 1, 1) << 5); + + switch (opcode) { + case 0x3b: /* ADDP */ + if (u || size != 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + fpst = NULL; + break; + case 0xc: /* FMAXNMP */ + case 0xd: /* FADDP */ + case 0xf: /* FMAXP */ + case 0x2c: /* FMINNMP */ + case 0x2f: /* FMINP */ + /* FP op, size[0] is 32 or 64 bit*/ + if (!u) { + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } else { + size = MO_16; + } + } else { + size = extract32(size, 0, 1) ? MO_64 : MO_32; + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(size == MO_16 ? 
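/*
 * Half-precision ops use a distinct float_status because the
 * architecture gives FP16 its own flush-to-zero control (FPCR.FZ16),
 * so the FP16 helpers cannot share the single/double status word.
 */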
FPST_FPCR_F16 : FPST_FPCR); + break; + default: + unallocated_encoding(s); + return; + } + + if (size == MO_64) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, 0, MO_64); + read_vec_element(s, tcg_op2, rn, 1, MO_64); + + switch (opcode) { + case 0x3b: /* ADDP */ + tcg_gen_add_i64(tcg_res, tcg_op1, tcg_op2); + break; + case 0xc: /* FMAXNMP */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, 0, size); + read_vec_element_i32(s, tcg_op2, rn, 1, size); + + if (size == MO_16) { + switch (opcode) { + case 0xc: /* FMAXNMP */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0xc: /* FMAXNMP */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* + * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate) + * + * This handles the common shifting logic and is used by both + * the vector and scalar code.
+ */ +static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, + TCGv_i64 tcg_rnd, bool accumulate, + bool is_u, int size, int shift) +{ + bool extended_result = false; + bool round = tcg_rnd != NULL; + int ext_lshift = 0; + TCGv_i64 tcg_src_hi; + + if (round && size == 3) { + extended_result = true; + ext_lshift = 64 - shift; + tcg_src_hi = tcg_temp_new_i64(); + } else if (shift == 64) { + if (!accumulate && is_u) { + /* result is zero */ + tcg_gen_movi_i64(tcg_res, 0); + return; + } + } + + /* Deal with the rounding step */ + if (round) { + if (extended_result) { + TCGv_i64 tcg_zero = tcg_constant_i64(0); + if (!is_u) { + /* take care of sign extending tcg_res */ + tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63); + tcg_gen_add2_i64(tcg_src, tcg_src_hi, + tcg_src, tcg_src_hi, + tcg_rnd, tcg_zero); + } else { + tcg_gen_add2_i64(tcg_src, tcg_src_hi, + tcg_src, tcg_zero, + tcg_rnd, tcg_zero); + } + } else { + tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd); + } + } + + /* Now do the shift right */ + if (round && extended_result) { + /* extended case, >64 bit precision required */ + if (ext_lshift == 0) { + /* special case, only high bits matter */ + tcg_gen_mov_i64(tcg_src, tcg_src_hi); + } else { + tcg_gen_shri_i64(tcg_src, tcg_src, shift); + tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift); + tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi); + } + } else { + if (is_u) { + if (shift == 64) { + /* essentially shifting in 64 zeros */ + tcg_gen_movi_i64(tcg_src, 0); + } else { + tcg_gen_shri_i64(tcg_src, tcg_src, shift); + } + } else { + if (shift == 64) { + /* effectively extending the sign-bit */ + tcg_gen_sari_i64(tcg_src, tcg_src, 63); + } else { + tcg_gen_sari_i64(tcg_src, tcg_src, shift); + } + } + } + + if (accumulate) { + tcg_gen_add_i64(tcg_res, tcg_res, tcg_src); + } else { + tcg_gen_mov_i64(tcg_res, tcg_src); + } + + if (extended_result) { + tcg_temp_free_i64(tcg_src_hi); + } +} + +/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ +static void handle_scalar_simd_shri(DisasContext *s, + bool is_u, int immh, int immb, + int opcode, int rn, int rd) +{ + const int size = 3; + int immhb = immh << 3 | immb; + int shift = 2 * (8 << size) - immhb; + bool accumulate = false; + bool round = false; + bool insert = false; + TCGv_i64 tcg_rn; + TCGv_i64 tcg_rd; + TCGv_i64 tcg_round; + + if (!extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x02: /* SSRA / USRA (accumulate) */ + accumulate = true; + break; + case 0x04: /* SRSHR / URSHR (rounding) */ + round = true; + break; + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + accumulate = round = true; + break; + case 0x08: /* SRI */ + insert = true; + break; + } + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + tcg_rn = read_fp_dreg(s, rn); + tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); + + if (insert) { + /* shift count same as element size is valid but does nothing; + * special case to avoid potential shift by 64. 
+ */ + int esize = 8 << size; + if (shift != esize) { + tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); + } + } else { + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); +} + +/* SHL/SLI - Scalar shift left */ +static void handle_scalar_simd_shli(DisasContext *s, bool insert, + int immh, int immb, int opcode, + int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + TCGv_i64 tcg_rn; + TCGv_i64 tcg_rd; + + if (!extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rn = read_fp_dreg(s, rn); + tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); + + if (insert) { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); + } else { + tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); +} + +/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with + * (signed/unsigned) narrowing */ +static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, + bool is_u_shift, bool is_u_narrow, + int immh, int immb, int opcode, + int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int esize = 8 << size; + int shift = (2 * esize) - immhb; + int elements = is_scalar ? 1 : (64 / esize); + bool round = extract32(opcode, 0, 1); + MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN); + TCGv_i64 tcg_rn, tcg_rd, tcg_round; + TCGv_i32 tcg_rd_narrowed; + TCGv_i64 tcg_final; + + static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_unarrow_sat8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_unarrow_sat16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_unarrow_sat32 }, + { NULL, NULL }, + }; + static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = { + gen_helper_neon_narrow_sat_u8, + gen_helper_neon_narrow_sat_u16, + gen_helper_neon_narrow_sat_u32, + NULL + }; + NeonGenNarrowEnvFn *narrowfn; + + int i; + + assert(size < 4); + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (is_u_shift) { + narrowfn = unsigned_narrow_fns[size]; + } else { + narrowfn = signed_narrow_fns[size][is_u_narrow ? 
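/*
 * The table lookup selects among the three architected narrowing
 * variants:
 *   SQSHRN:  signed shift, signed saturating narrow
 *   SQSHRUN: signed shift, unsigned saturating narrow (the
 *            "unarrow" helpers)
 *   UQSHRN:  unsigned shift, unsigned saturating narrow
 * No unsigned-shift/signed-narrow form exists; the NULL entries
 * cover size == 3, since there is no narrowing from 128-bit
 * elements.
 */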
1 : 0]; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_rd_narrowed = tcg_temp_new_i32(); + tcg_final = tcg_const_i64(0); + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, ldop); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, is_u_shift, size+1, shift); + narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd); + tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i32(tcg_rd_narrowed); + tcg_temp_free_i64(tcg_final); + + clear_vec_high(s, is_q, rd); +} + +/* SQSHLU, UQSHL, SQSHL: saturating left shifts */ +static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q, + bool src_unsigned, bool dst_unsigned, + int immh, int immb, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int shift = immhb - (8 << size); + int pass; + + assert(immh != 0); + assert(!(scalar && is_q)); + + if (!scalar) { + if (!is_q && extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + /* Since we use the variable-shift helpers we must + * replicate the shift count into each element of + * the tcg_shift value. + */ + switch (size) { + case 0: + shift |= shift << 8; + /* fall through */ + case 1: + shift |= shift << 16; + break; + case 2: + case 3: + break; + default: + g_assert_not_reached(); + } + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 3) { + TCGv_i64 tcg_shift = tcg_constant_i64(shift); + static NeonGenTwo64OpEnvFn * const fns[2][2] = { + { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 }, + { NULL, gen_helper_neon_qshl_u64 }, + }; + NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned]; + int maxpass = is_q ? 2 : 1; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + write_vec_element(s, tcg_op, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_op); + } + clear_vec_high(s, is_q, rd); + } else { + TCGv_i32 tcg_shift = tcg_constant_i32(shift); + static NeonGenTwoOpEnvFn * const fns[2][2][3] = { + { + { gen_helper_neon_qshl_s8, + gen_helper_neon_qshl_s16, + gen_helper_neon_qshl_s32 }, + { gen_helper_neon_qshlu_s8, + gen_helper_neon_qshlu_s16, + gen_helper_neon_qshlu_s32 } + }, { + { NULL, NULL, NULL }, + { gen_helper_neon_qshl_u8, + gen_helper_neon_qshl_u16, + gen_helper_neon_qshl_u32 } + } + }; + NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size]; + MemOp memop = scalar ? size : MO_32; + int maxpass = scalar ? 1 : is_q ? 
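/*
 * Worked example of the replication above: for size == 0 (bytes)
 * with shift == 3, the two ORs build up 0x03030303, so when the
 * 32-bit variable-shift helper splits its operand into four byte
 * lanes, every lane sees the same shift count of 3.
 */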
4 : 2; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, memop); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + if (scalar) { + switch (size) { + case 0: + tcg_gen_ext8u_i32(tcg_op, tcg_op); + break; + case 1: + tcg_gen_ext16u_i32(tcg_op, tcg_op); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_op); + } + + if (!scalar) { + clear_vec_high(s, is_q, rd); + } + } +} + +/* Common vector code for handling integer to FP conversion */ +static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn, + int elements, int is_signed, + int fracbits, int size) +{ + TCGv_ptr tcg_fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + TCGv_i32 tcg_shift = NULL; + + MemOp mop = size | (is_signed ? MO_SIGN : 0); + int pass; + + if (fracbits || size == MO_64) { + tcg_shift = tcg_constant_i32(fracbits); + } + + if (size == MO_64) { + TCGv_i64 tcg_int64 = tcg_temp_new_i64(); + TCGv_i64 tcg_double = tcg_temp_new_i64(); + + for (pass = 0; pass < elements; pass++) { + read_vec_element(s, tcg_int64, rn, pass, mop); + + if (is_signed) { + gen_helper_vfp_sqtod(tcg_double, tcg_int64, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_uqtod(tcg_double, tcg_int64, + tcg_shift, tcg_fpst); + } + if (elements == 1) { + write_fp_dreg(s, rd, tcg_double); + } else { + write_vec_element(s, tcg_double, rd, pass, MO_64); + } + } + + tcg_temp_free_i64(tcg_int64); + tcg_temp_free_i64(tcg_double); + + } else { + TCGv_i32 tcg_int32 = tcg_temp_new_i32(); + TCGv_i32 tcg_float = tcg_temp_new_i32(); + + for (pass = 0; pass < elements; pass++) { + read_vec_element_i32(s, tcg_int32, rn, pass, mop); + + switch (size) { + case MO_32: + if (fracbits) { + if (is_signed) { + gen_helper_vfp_sltos(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_ultos(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } + } else { + if (is_signed) { + gen_helper_vfp_sitos(tcg_float, tcg_int32, tcg_fpst); + } else { + gen_helper_vfp_uitos(tcg_float, tcg_int32, tcg_fpst); + } + } + break; + case MO_16: + if (fracbits) { + if (is_signed) { + gen_helper_vfp_sltoh(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_ultoh(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } + } else { + if (is_signed) { + gen_helper_vfp_sitoh(tcg_float, tcg_int32, tcg_fpst); + } else { + gen_helper_vfp_uitoh(tcg_float, tcg_int32, tcg_fpst); + } + } + break; + default: + g_assert_not_reached(); + } + + if (elements == 1) { + write_fp_sreg(s, rd, tcg_float); + } else { + write_vec_element_i32(s, tcg_float, rd, pass, size); + } + } + + tcg_temp_free_i32(tcg_int32); + tcg_temp_free_i32(tcg_float); + } + + tcg_temp_free_ptr(tcg_fpst); + + clear_vec_high(s, elements << size == 16, rd); +} + +/* UCVTF/SCVTF - Integer to FP conversion */ +static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int opcode, + int rn, int rd) +{ + int size, elements, fracbits; + int immhb = immh << 3 | immb; + + if (immh & 8) { + size = MO_64; + if (!is_scalar && !is_q) { + unallocated_encoding(s); + return; + } + } else if (immh & 4) { + size = MO_32; + } else if (immh & 2) { + size = MO_16; + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + } else { + /* immh == 0 would be a failure of the decode logic */ + g_assert(immh == 1); + 
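/*
 * immh encodes the element size in its highest set bit (8 selects
 * 64-bit, 4..7 select 32-bit, 2..3 select 16-bit); immh == 1 would
 * mean 8-bit elements, for which no FP type exists, hence the
 * UNDEF below.  The fractional-bits count then falls out as
 * fracbits = (16 << size) - immh:immb; e.g. immh = 0b0101 with
 * immb = 0b000 gives immhb = 40 and fracbits = 64 - 40 = 24 for a
 * 32-bit SCVTF.
 */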
unallocated_encoding(s); + return; + } + + if (is_scalar) { + elements = 1; + } else { + elements = (8 << is_q) >> size; + } + fracbits = (16 << size) - immhb; + + if (!fp_access_check(s)) { + return; + } + + handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size); +} + +/* FCVTZS, FCVTZU - FP to fixed-point conversion */ +static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int pass, size, fracbits; + TCGv_ptr tcg_fpstatus; + TCGv_i32 tcg_rmode, tcg_shift; + + if (immh & 0x8) { + size = MO_64; + if (!is_scalar && !is_q) { + unallocated_encoding(s); + return; + } + } else if (immh & 0x4) { + size = MO_32; + } else if (immh & 0x2) { + size = MO_16; + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + } else { + /* Should have split out AdvSIMD modified immediate earlier. */ + assert(immh == 1); + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + assert(!(is_scalar && is_q)); + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO)); + tcg_fpstatus = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + fracbits = (16 << size) - immhb; + tcg_shift = tcg_constant_i32(fracbits); + + if (size == MO_64) { + int maxpass = is_scalar ? 1 : 2; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + if (is_u) { + gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } + write_vec_element(s, tcg_op, rd, pass, MO_64); + tcg_temp_free_i64(tcg_op); + } + clear_vec_high(s, is_q, rd); + } else { + void (*fn)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); + int maxpass = is_scalar ?
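/*
 * gen_helper_set_rmode is a swap: it installs the rounding mode it
 * is given and returns the previous one in the same temporary.  The
 * call above forces round-towards-zero for FCVTZ*; the matching
 * call at the end of this function feeds the saved mode back in to
 * restore the guest's FPCR rounding setting.
 */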
1 : ((8 << is_q) >> size); + + switch (size) { + case MO_16: + if (is_u) { + fn = gen_helper_vfp_touhh; + } else { + fn = gen_helper_vfp_toshh; + } + break; + case MO_32: + if (is_u) { + fn = gen_helper_vfp_touls; + } else { + fn = gen_helper_vfp_tosls; + } + break; + default: + g_assert_not_reached(); + } + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, size); + fn(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + if (is_scalar) { + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, size); + } + tcg_temp_free_i32(tcg_op); + } + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_ptr(tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); +} + +/* AdvSIMD scalar shift by immediate + * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 + * +-----+---+-------------+------+------+--------+---+------+------+ + * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | + * +-----+---+-------------+------+------+--------+---+------+------+ + * + * This is the scalar version so it works on fixed-size registers + */ +static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int immb = extract32(insn, 16, 3); + int immh = extract32(insn, 19, 4); + bool is_u = extract32(insn, 29, 1); + + if (immh == 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x00: /* SSHR / USHR */ + case 0x02: /* SSRA / USRA */ + case 0x04: /* SRSHR / URSHR */ + case 0x06: /* SRSRA / URSRA */ + handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd); + break; + case 0x0a: /* SHL / SLI */ + handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd); + break; + case 0x1c: /* SCVTF, UCVTF */ + handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x10: /* SQSHRUN, SQSHRUN2 */ + case 0x11: /* SQRSHRUN, SQRSHRUN2 */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_vec_simd_sqshrn(s, true, false, false, true, + immh, immb, opcode, rn, rd); + break; + case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */ + case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */ + handle_vec_simd_sqshrn(s, true, false, is_u, is_u, + immh, immb, opcode, rn, rd); + break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS, FCVTZU */ + handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* AdvSIMD scalar three different + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | + * +-----+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn) +{ + bool is_u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); +
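/*
 * Note on the doubling done below: SQDMULL is implemented as a
 * plain widening multiply followed by a saturating add of the
 * product to itself.  That handles the one overflow corner
 * correctly: 0x80000000 * 0x80000000 = 2^62, and doubling it to
 * 2^63 overflows int64, so the saturating add clamps the result
 * to INT64_MAX and sets QC.
 */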
int rd = extract32(insn, 0, 5); + + if (is_u) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x9: /* SQDMLAL, SQDMLAL2 */ + case 0xb: /* SQDMLSL, SQDMLSL2 */ + case 0xd: /* SQDMULL, SQDMULL2 */ + if (size == 0 || size == 3) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 2) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN); + read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN); + + tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, tcg_res, tcg_res); + + switch (opcode) { + case 0xd: /* SQDMULL, SQDMULL2 */ + break; + case 0xb: /* SQDMLSL, SQDMLSL2 */ + tcg_gen_neg_i64(tcg_res, tcg_res); + /* fall through */ + case 0x9: /* SQDMLAL, SQDMLAL2 */ + read_vec_element(s, tcg_op1, rd, 0, MO_64); + gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, + tcg_res, tcg_op1); + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op1 = read_fp_hreg(s, rn); + TCGv_i32 tcg_op2 = read_fp_hreg(s, rm); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, tcg_res, tcg_res); + + switch (opcode) { + case 0xd: /* SQDMULL, SQDMULL2 */ + break; + case 0xb: /* SQDMLSL, SQDMLSL2 */ + gen_helper_neon_negl_u32(tcg_res, tcg_res); + /* fall through */ + case 0x9: /* SQDMLAL, SQDMLAL2 */ + { + TCGv_i64 tcg_op3 = tcg_temp_new_i64(); + read_vec_element(s, tcg_op3, rd, 0, MO_32); + gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, + tcg_res, tcg_op3); + tcg_temp_free_i64(tcg_op3); + break; + } + default: + g_assert_not_reached(); + } + + tcg_gen_ext32u_i64(tcg_res, tcg_res); + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i64(tcg_res); + } +} + +static void handle_3same_64(DisasContext *s, int opcode, bool u, + TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm) +{ + /* Handle 64x64->64 opcodes which are shared between the scalar + * and vector 3-same groups. We cover every opcode where size == 3 + * is valid in either the three-reg-same (integer, not pairwise) + * or scalar-three-reg-same groups. + */ + TCGCond cond; + + switch (opcode) { + case 0x1: /* SQADD */ + if (u) { + gen_helper_neon_qadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x5: /* SQSUB */ + if (u) { + gen_helper_neon_qsub_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qsub_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x6: /* CMGT, CMHI */ + /* 64 bit integer comparison, result = test ? (2^64 - 1) : 0. + * We implement this using setcond (test) and then negating. + */ + cond = u ? TCG_COND_GTU : TCG_COND_GT; + do_cmop: + tcg_gen_setcond_i64(cond, tcg_rd, tcg_rn, tcg_rm); + tcg_gen_neg_i64(tcg_rd, tcg_rd); + break; + case 0x7: /* CMGE, CMHS */ + cond = u ? 
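/*
 * A concrete instance of the setcond/neg idiom shared via do_cmop:
 * for CMGT with Rn = 5, Rm = 3, setcond(GT) produces 1, and the
 * negation turns that into 0xffffffffffffffff, the all-ones true
 * result the SIMD compares define; a false test stays all-zeroes.
 */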
TCG_COND_GEU : TCG_COND_GE; + goto do_cmop; + case 0x11: /* CMTST, CMEQ */ + if (u) { + cond = TCG_COND_EQ; + goto do_cmop; + } + gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 0x8: /* SSHL, USHL */ + if (u) { + gen_ushl_i64(tcg_rd, tcg_rn, tcg_rm); + } else { + gen_sshl_i64(tcg_rd, tcg_rn, tcg_rm); + } + break; + case 0x9: /* SQSHL, UQSHL */ + if (u) { + gen_helper_neon_qshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0xa: /* SRSHL, URSHL */ + if (u) { + gen_helper_neon_rshl_u64(tcg_rd, tcg_rn, tcg_rm); + } else { + gen_helper_neon_rshl_s64(tcg_rd, tcg_rn, tcg_rm); + } + break; + case 0xb: /* SQRSHL, UQRSHL */ + if (u) { + gen_helper_neon_qrshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qrshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x10: /* ADD, SUB */ + if (u) { + tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm); + } + break; + default: + g_assert_not_reached(); + } +} + +/* Handle the 3-same-operands float operations; shared by the scalar + * and vector encodings. The caller must filter out any encodings + * not allocated for the encoding it is dealing with. + */ +static void handle_3same_float(DisasContext *s, int size, int elements, + int fpopcode, int rd, int rn, int rm) +{ + int pass; + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + for (pass = 0; pass < elements; pass++) { + if (size) { + /* Double */ + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + switch (fpopcode) { + case 0x39: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negd(tcg_op1, tcg_op1); + /* fall through */ + case 0x19: /* FMLA */ + read_vec_element(s, tcg_res, rd, pass, MO_64); + gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, + tcg_res, fpst); + break; + case 0x18: /* FMAXNM */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FADD */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1b: /* FMULX */ + gen_helper_vfp_mulxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1c: /* FCMEQ */ + gen_helper_neon_ceq_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1e: /* FMAX */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1f: /* FRECPS */ + gen_helper_recpsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x38: /* FMINNM */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3a: /* FSUB */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3e: /* FMIN */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3f: /* FRSQRTS */ + gen_helper_rsqrtsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5b: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5c: /* FCMGE */ + gen_helper_neon_cge_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5d: /* FACGE */ + gen_helper_neon_acge_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5f: /* FDIV */ + gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7a: /* FABD */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_absd(tcg_res, tcg_res); + break; + case 0x7c: /* FCMGT */ + gen_helper_neon_cgt_f64(tcg_res, tcg_op1, tcg_op2, 
fpst); + break; + case 0x7d: /* FACGT */ + gen_helper_neon_acgt_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } else { + /* Single */ + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); + + switch (fpopcode) { + case 0x39: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negs(tcg_op1, tcg_op1); + /* fall through */ + case 0x19: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, MO_32); + gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, + tcg_res, fpst); + break; + case 0x1a: /* FADD */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1b: /* FMULX */ + gen_helper_vfp_mulxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1c: /* FCMEQ */ + gen_helper_neon_ceq_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1e: /* FMAX */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1f: /* FRECPS */ + gen_helper_recpsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x18: /* FMAXNM */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x38: /* FMINNM */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3a: /* FSUB */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3e: /* FMIN */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3f: /* FRSQRTS */ + gen_helper_rsqrtsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5b: /* FMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5c: /* FCMGE */ + gen_helper_neon_cge_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5d: /* FACGE */ + gen_helper_neon_acge_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5f: /* FDIV */ + gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7a: /* FABD */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_abss(tcg_res, tcg_res); + break; + case 0x7c: /* FCMGT */ + gen_helper_neon_cgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7d: /* FACGT */ + gen_helper_neon_acgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + if (elements == 1) { + /* scalar single so clear high part */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(tcg_tmp, tcg_res); + write_vec_element(s, tcg_tmp, rd, pass, MO_64); + tcg_temp_free_i64(tcg_tmp); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + + tcg_temp_free_ptr(fpst); + + clear_vec_high(s, elements * (size ? 
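/*
 * The expression here asks "did this op write more than 8 bytes?":
 * four single-precision lanes cover 16 bytes and count as a 128-bit
 * write, while a scalar or 64-bit vector op covers at most 8, in
 * which case clear_vec_high also zeroes bits [127:64] of Vd.
 */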
8 : 4) > 8, rd); +} + +/* AdvSIMD scalar three same + * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+--------+---+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 1 | Rn | Rd | + * +-----+---+-----------+------+---+------+--------+---+------+------+ + */ +static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + TCGv_i64 tcg_rd; + + if (opcode >= 0x18) { + /* Floating point: U, size[1] and opcode indicate operation */ + int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6); + switch (fpopcode) { + case 0x1b: /* FMULX */ + case 0x1f: /* FRECPS */ + case 0x3f: /* FRSQRTS */ + case 0x5d: /* FACGE */ + case 0x7d: /* FACGT */ + case 0x1c: /* FCMEQ */ + case 0x5c: /* FCMGE */ + case 0x7c: /* FCMGT */ + case 0x7a: /* FABD */ + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode, rd, rn, rm); + return; + } + + switch (opcode) { + case 0x1: /* SQADD, UQADD */ + case 0x5: /* SQSUB, UQSUB */ + case 0x9: /* SQSHL, UQSHL */ + case 0xb: /* SQRSHL, UQRSHL */ + break; + case 0x8: /* SSHL, USHL */ + case 0xa: /* SRSHL, URSHL */ + case 0x6: /* CMGT, CMHI */ + case 0x7: /* CMGE, CMHS */ + case 0x11: /* CMTST, CMEQ */ + case 0x10: /* ADD, SUB (vector) */ + if (size != 3) { + unallocated_encoding(s); + return; + } + break; + case 0x16: /* SQDMULH, SQRDMULH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rd = tcg_temp_new_i64(); + + if (size == 3) { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i64 tcg_rm = read_fp_dreg(s, rm); + + handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm); + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rm); + } else { + /* Do a single operation on the lowest element in the vector. + * We use the standard Neon helpers and rely on 0 OP 0 == 0 with + * no side effects for all these operations. + * OPTME: special-purpose helpers would avoid doing some + * unnecessary work in the helper for the 8 and 16 bit cases. 
+ */ + NeonGenTwoOpEnvFn *genenvfn; + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rm = tcg_temp_new_i32(); + TCGv_i32 tcg_rd32 = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_rn, rn, 0, size); + read_vec_element_i32(s, tcg_rm, rm, 0, size); + + switch (opcode) { + case 0x1: /* SQADD, UQADD */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qadd_s8, gen_helper_neon_qadd_u8 }, + { gen_helper_neon_qadd_s16, gen_helper_neon_qadd_u16 }, + { gen_helper_neon_qadd_s32, gen_helper_neon_qadd_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x5: /* SQSUB, UQSUB */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qsub_s8, gen_helper_neon_qsub_u8 }, + { gen_helper_neon_qsub_s16, gen_helper_neon_qsub_u16 }, + { gen_helper_neon_qsub_s32, gen_helper_neon_qsub_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x9: /* SQSHL, UQSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, + { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, + { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0xb: /* SQRSHL, UQRSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, + { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, + { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x16: /* SQDMULH, SQRDMULH */ + { + static NeonGenTwoOpEnvFn * const fns[2][2] = { + { gen_helper_neon_qdmulh_s16, gen_helper_neon_qrdmulh_s16 }, + { gen_helper_neon_qdmulh_s32, gen_helper_neon_qrdmulh_s32 }, + }; + assert(size == 1 || size == 2); + genenvfn = fns[size - 1][u]; + break; + } + default: + g_assert_not_reached(); + } + + genenvfn(tcg_rd32, cpu_env, tcg_rn, tcg_rm); + tcg_gen_extu_i32_i64(tcg_rd, tcg_rd32); + tcg_temp_free_i32(tcg_rd32); + tcg_temp_free_i32(tcg_rn); + tcg_temp_free_i32(tcg_rm); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rd); +} + +/* AdvSIMD scalar three same FP16 + * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 + * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ + * | 0 1 | U | 1 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | + * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ + * v: 0101 1110 0100 0000 0000 0100 0000 0000 => 5e400400 + * m: 1101 1111 0110 0000 1100 0100 0000 0000 => df60c400 + */ +static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s, + uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 3); + int rm = extract32(insn, 16, 5); + bool u = extract32(insn, 29, 1); + bool a = extract32(insn, 23, 1); + int fpopcode = opcode | (a << 3) | (u << 4); + TCGv_ptr fpst; + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + + switch (fpopcode) { + case 0x03: /* FMULX */ + case 0x04: /* FCMEQ (reg) */ + case 0x07: /* FRECPS */ + case 0x0f: /* FRSQRTS */ + case 0x14: /* FCMGE (reg) */ + case 0x15: /* FACGE */ + case 0x1a: /* FABD */ + case 0x1c: /* FCMGT (reg) */ + case 0x1d: /* FACGT */ + break; + default: + unallocated_encoding(s); + return; + } + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + tcg_res = tcg_temp_new_i32(); + + switch (fpopcode) {
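/*
 * fpopcode is opcode | a << 3 | u << 4, so for example FABD is
 * u = 1, a = 1, opcode = 0b010, i.e. 0b11010 = 0x1a.  Its
 * half-precision absolute difference is computed below as a
 * subtract followed by clearing the sign bit with the 0x7fff mask.
 */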
+ case 0x03: /* FMULX */ + gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x04: /* FCMEQ (reg) */ + gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x07: /* FRECPS */ + gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x0f: /* FRSQRTS */ + gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x14: /* FCMGE (reg) */ + gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x15: /* FACGE */ + gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FABD */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); + break; + case 0x1c: /* FCMGT (reg) */ + gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1d: /* FACGT */ + gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_ptr(fpst); +} + +/* AdvSIMD scalar three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_scalar_three_reg_same_extra(DisasContext *s, + uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + TCGv_i32 ele1, ele2, ele3; + TCGv_i64 res; + bool feature; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_rdm, s); + break; + default: + unallocated_encoding(s); + return; + } + if (!feature) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + /* Do a single operation on the lowest element in the vector. + * We use the standard Neon helpers and rely on 0 OP 0 == 0 + * with no side effects for all these operations. + * OPTME: special-purpose helpers would avoid doing some + * unnecessary work in the helper for the 16 bit cases. 
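+     * (Editor's note, not part of the original commit: for the 16-bit
+     * case the _s16 helpers operate on two packed 16-bit lanes at once;
+     * the unused upper lane reads as zero, computes zero, and leaves QC
+     * untouched, which is what makes this safe.)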
+ */ + ele1 = tcg_temp_new_i32(); + ele2 = tcg_temp_new_i32(); + ele3 = tcg_temp_new_i32(); + + read_vec_element_i32(s, ele1, rn, 0, size); + read_vec_element_i32(s, ele2, rm, 0, size); + read_vec_element_i32(s, ele3, rd, 0, size); + + switch (opcode) { + case 0x0: /* SQRDMLAH */ + if (size == 1) { + gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + case 0x1: /* SQRDMLSH */ + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(ele1); + tcg_temp_free_i32(ele2); + + res = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(res, ele3); + tcg_temp_free_i32(ele3); + + write_fp_dreg(s, rd, res); + tcg_temp_free_i64(res); +} + +static void handle_2misc_64(DisasContext *s, int opcode, bool u, + TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, + TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus) +{ + /* Handle 64->64 opcodes which are shared between the scalar and + * vector 2-reg-misc groups. We cover every integer opcode where size == 3 + * is valid in either group and also the double-precision fp ops. + * The caller only need provide tcg_rmode and tcg_fpstatus if the op + * requires them. + */ + TCGCond cond; + + switch (opcode) { + case 0x4: /* CLS, CLZ */ + if (u) { + tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); + } else { + tcg_gen_clrsb_i64(tcg_rd, tcg_rn); + } + break; + case 0x5: /* NOT */ + /* This opcode is shared with CNT and RBIT but we have earlier + * enforced that size == 3 if and only if this is the NOT insn. + */ + tcg_gen_not_i64(tcg_rd, tcg_rn); + break; + case 0x7: /* SQABS, SQNEG */ + if (u) { + gen_helper_neon_qneg_s64(tcg_rd, cpu_env, tcg_rn); + } else { + gen_helper_neon_qabs_s64(tcg_rd, cpu_env, tcg_rn); + } + break; + case 0xa: /* CMLT */ + /* 64 bit integer comparison against zero, result is + * test ? (2^64 - 1) : 0. We implement via setcond(!test) and + * subtracting 1. + */ + cond = TCG_COND_LT; + do_cmop: + tcg_gen_setcondi_i64(cond, tcg_rd, tcg_rn, 0); + tcg_gen_neg_i64(tcg_rd, tcg_rd); + break; + case 0x8: /* CMGT, CMGE */ + cond = u ? TCG_COND_GE : TCG_COND_GT; + goto do_cmop; + case 0x9: /* CMEQ, CMLE */ + cond = u ? 
TCG_COND_LE : TCG_COND_EQ; + goto do_cmop; + case 0xb: /* ABS, NEG */ + if (u) { + tcg_gen_neg_i64(tcg_rd, tcg_rn); + } else { + tcg_gen_abs_i64(tcg_rd, tcg_rn); + } + break; + case 0x2f: /* FABS */ + gen_helper_vfp_absd(tcg_rd, tcg_rn); + break; + case 0x6f: /* FNEG */ + gen_helper_vfp_negd(tcg_rd, tcg_rn); + break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x1e: /* FRINT32Z */ + case 0x5e: /* FRINT32X */ + gen_helper_frint32_d(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x1f: /* FRINT64Z */ + case 0x5f: /* FRINT64X */ + gen_helper_frint64_d(tcg_rd, tcg_rn, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } +} + +static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, + bool is_scalar, bool is_u, bool is_q, + int size, int rn, int rd) +{ + bool is_double = (size == MO_64); + TCGv_ptr fpst; + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + + if (is_double) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_zero = tcg_constant_i64(0); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + NeonGenTwoDoubleOpFn *genfn; + bool swap = false; + int pass; + + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fallthrough */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_neon_cgt_f64; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_neon_ceq_f64; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_neon_cge_f64; + break; + default: + g_assert_not_reached(); + } + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + read_vec_element(s, tcg_op, rn, pass, MO_64); + if (swap) { + genfn(tcg_res, tcg_zero, tcg_op, fpst); + } else { + genfn(tcg_res, tcg_op, tcg_zero, fpst); + } + write_vec_element(s, tcg_res, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_zero = tcg_constant_i32(0); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + NeonGenTwoSingleOpFn *genfn; + bool swap = false; + int pass, maxpasses; + + if (size == MO_16) { + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fall through */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_advsimd_cgt_f16; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_advsimd_ceq_f16; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_advsimd_cge_f16; + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fall through */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_neon_cgt_f32; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_neon_ceq_f32; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_neon_cge_f32; + break; + default: + g_assert_not_reached(); + } + } + + if (is_scalar) { + maxpasses = 1; + } else { + int vector_size = 8 << is_q; + maxpasses = vector_size >> size; + } + + for (pass = 0; pass < maxpasses; pass++) { + read_vec_element_i32(s, tcg_op, rn, pass, size); + if (swap) { + genfn(tcg_res, tcg_zero, tcg_op, fpst); + } else { + genfn(tcg_res, tcg_op, tcg_zero, fpst); + } + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, size); + } + } + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + + tcg_temp_free_ptr(fpst); +} + +static void handle_2misc_reciprocal(DisasContext *s, int opcode, + bool is_scalar, bool is_u, bool is_q, + int size, int rn, int rd) +{ + bool is_double = (size == 3); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + if (is_double) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + int pass; + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + read_vec_element(s, tcg_op, rn, pass, MO_64); + switch (opcode) { + case 0x3d: /* FRECPE */ + gen_helper_recpe_f64(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f64(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + write_vec_element(s, tcg_res, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 
4 : 2; + } + + for (pass = 0; pass < maxpasses; pass++) { + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + + switch (opcode) { + case 0x3c: /* URECPE */ + gen_helper_recpe_u32(tcg_res, tcg_op); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f32(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f32(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + } + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + tcg_temp_free_ptr(fpst); +} + +static void handle_2misc_narrow(DisasContext *s, bool scalar, + int opcode, bool u, bool is_q, + int size, int rn, int rd) +{ + /* Handle 2-reg-misc ops which are narrowing (so each 2*size element + * in the source becomes a size element in the destination). + */ + int pass; + TCGv_i32 tcg_res[2]; + int destelt = is_q ? 2 : 0; + int passes = scalar ? 1 : 2; + + if (scalar) { + tcg_res[1] = tcg_constant_i32(0); + } + + for (pass = 0; pass < passes; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenNarrowFn *genfn = NULL; + NeonGenNarrowEnvFn *genenvfn = NULL; + + if (scalar) { + read_vec_element(s, tcg_op, rn, pass, size + 1); + } else { + read_vec_element(s, tcg_op, rn, pass, MO_64); + } + tcg_res[pass] = tcg_temp_new_i32(); + + switch (opcode) { + case 0x12: /* XTN, SQXTUN */ + { + static NeonGenNarrowFn * const xtnfns[3] = { + gen_helper_neon_narrow_u8, + gen_helper_neon_narrow_u16, + tcg_gen_extrl_i64_i32, + }; + static NeonGenNarrowEnvFn * const sqxtunfns[3] = { + gen_helper_neon_unarrow_sat8, + gen_helper_neon_unarrow_sat16, + gen_helper_neon_unarrow_sat32, + }; + if (u) { + genenvfn = sqxtunfns[size]; + } else { + genfn = xtnfns[size]; + } + break; + } + case 0x14: /* SQXTN, UQXTN */ + { + static NeonGenNarrowEnvFn * const fns[3][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_narrow_sat_u8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_narrow_sat_u16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_narrow_sat_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x16: /* FCVTN, FCVTN2 */ + /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */ + if (size == 2) { + gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env); + } else { + TCGv_i32 tcg_lo = tcg_temp_new_i32(); + TCGv_i32 tcg_hi = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + + tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op); + gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp); + gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp); + tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16); + tcg_temp_free_i32(tcg_lo); + tcg_temp_free_i32(tcg_hi); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } + break; + case 0x36: /* BFCVTN, BFCVTN2 */ + { + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst); + tcg_temp_free_ptr(fpst); + } + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + /* 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + assert(size == 2); + gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env); + break; + default: + g_assert_not_reached(); + } + + if (genfn) { + genfn(tcg_res[pass], tcg_op); + } else if (genenvfn) { + genenvfn(tcg_res[pass], cpu_env, tcg_op); 
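+            /*
+             * Editor's note, not part of the original commit: the "von
+             * Neumann rounding" used by the FCVTXN case above (round to
+             * odd) truncates and then sets the result's low bit whenever
+             * any discarded bits were non-zero. Because float32 carries
+             * more than two extra bits of precision over float16, a
+             * later round-to-nearest narrowing of the result cannot
+             * suffer double rounding. Schematically, with illustrative
+             * function names:
+             *
+             *     f64_to_f16_rne(x) == f32_to_f16_rne(f64_to_f32_rodd(x))
+             */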
+ } + + tcg_temp_free_i64(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); +} + +/* Remaining saturating accumulating ops */ +static void handle_2misc_satacc(DisasContext *s, bool is_scalar, bool is_u, + bool is_q, int size, int rn, int rd) +{ + bool is_double = (size == 3); + + if (is_double) { + TCGv_i64 tcg_rn = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + int pass; + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + read_vec_element(s, tcg_rn, rn, pass, MO_64); + read_vec_element(s, tcg_rd, rd, pass, MO_64); + + if (is_u) { /* USQADD */ + gen_helper_neon_uqadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rd); + } else { /* SUQADD */ + gen_helper_neon_sqadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rd); + } + write_vec_element(s, tcg_rd, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 4 : 2; + } + + for (pass = 0; pass < maxpasses; pass++) { + if (is_scalar) { + read_vec_element_i32(s, tcg_rn, rn, pass, size); + read_vec_element_i32(s, tcg_rd, rd, pass, size); + } else { + read_vec_element_i32(s, tcg_rn, rn, pass, MO_32); + read_vec_element_i32(s, tcg_rd, rd, pass, MO_32); + } + + if (is_u) { /* USQADD */ + switch (size) { + case 0: + gen_helper_neon_uqadd_s8(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 1: + gen_helper_neon_uqadd_s16(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 2: + gen_helper_neon_uqadd_s32(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + default: + g_assert_not_reached(); + } + } else { /* SUQADD */ + switch (size) { + case 0: + gen_helper_neon_sqadd_u8(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 1: + gen_helper_neon_sqadd_u16(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 2: + gen_helper_neon_sqadd_u32(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + default: + g_assert_not_reached(); + } + } + + if (is_scalar) { + write_vec_element(s, tcg_constant_i64(0), rd, 0, MO_64); + } + write_vec_element_i32(s, tcg_rd, rd, pass, MO_32); + } + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(tcg_rn); + clear_vec_high(s, is_q, rd); + } +} + +/* AdvSIMD scalar two reg misc + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 12, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + bool is_fcvt = false; + int rmode; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; + + switch (opcode) { + case 0x3: /* USQADD / SUQADD*/ + if (!fp_access_check(s)) { + return; + } + handle_2misc_satacc(s, true, u, false, size, rn, rd); + return; + case 0x7: /* SQABS / SQNEG */ + break; + case 0xa: /* CMLT */ + if (u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x8: /* CMGT, CMGE */ + case 0x9: /* CMEQ, CMLE */ + case 0xb: /* ABS, NEG */ + if (size != 3) { + unallocated_encoding(s); + return; + } + break; + case 0x12: /* SQXTUN */ + if (!u) { + 
unallocated_encoding(s); + return; + } + /* fall through */ + case 0x14: /* SQXTN, UQXTN */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd); + return; + case 0xc ... 0xf: + case 0x16 ... 0x1d: + case 0x1f: + /* Floating point: U, size[1] and opcode indicate operation; + * size[0] indicates single or double precision. + */ + opcode |= (extract32(size, 1, 1) << 5) | (u << 6); + size = extract32(size, 0, 1) ? 3 : 2; + switch (opcode) { + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd); + return; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d); + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size); + return; + } + case 0x3d: /* FRECPE */ + case 0x3f: /* FRECPX */ + case 0x7d: /* FRSQRTE */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd); + return; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + is_fcvt = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + break; + case 0x1c: /* FCVTAS */ + case 0x5c: /* FCVTAU */ + /* TIEAWAY doesn't fit in the usual rounding mode encoding */ + is_fcvt = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd); + return; + default: + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (is_fcvt) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + tcg_fpstatus = fpstatus_ptr(FPST_FPCR); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } else { + tcg_rmode = NULL; + tcg_fpstatus = NULL; + } + + if (size == 3) { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + + handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + } else { + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_rn, rn, 0, size); + + switch (opcode) { + case 0x7: /* SQABS, SQNEG */ + { + NeonGenOneOpEnvFn *genfn; + static NeonGenOneOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, + { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, + { gen_helper_neon_qabs_s32, gen_helper_neon_qneg_s32 }, + }; + genfn = fns[size][u]; + genfn(tcg_rd, cpu_env, tcg_rn); + break; + } + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_constant_i32(0), + tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_constant_i32(0), + tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + + 
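+        /*
+         * Editor's note, not part of the original commit:
+         * gen_helper_set_rmode() writes the *previous* rounding mode back
+         * into its destination, so at this point tcg_rmode holds the
+         * original FPCR setting and the second set_rmode call below
+         * restores it: the pair is a swap, not two sets.
+         */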
write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(tcg_rn); + } + + if (is_fcvt) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ +static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = 2 * (8 << size) - immhb; + GVecGen2iFn *gvec_fn; + + if (extract32(immh, 3, 1) && !is_q) { + unallocated_encoding(s); + return; + } + tcg_debug_assert(size <= 3); + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x02: /* SSRA / USRA (accumulate) */ + gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra; + break; + + case 0x08: /* SRI */ + gvec_fn = gen_gvec_sri; + break; + + case 0x00: /* SSHR / USHR */ + if (is_u) { + if (shift == 8 << size) { + /* Shift count the same size as element size produces zero. */ + tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), 0); + return; + } + gvec_fn = tcg_gen_gvec_shri; + } else { + /* Shift count the same size as element size produces all sign. */ + if (shift == 8 << size) { + shift -= 1; + } + gvec_fn = tcg_gen_gvec_sari; + } + break; + + case 0x04: /* SRSHR / URSHR (rounding) */ + gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr; + break; + + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra; + break; + + default: + g_assert_not_reached(); + } + + gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size); +} + +/* SHL/SLI - Vector shift left */ +static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + + /* Range of size is limited by decode: immh is a non-zero 4 bit field */ + assert(size >= 0 && size <= 3); + + if (extract32(immh, 3, 1) && !is_q) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (insert) { + gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); + } +} + +/* USHLL/SHLL - Vector shift left with widening */ +static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + int dsize = 64; + int esize = 8 << size; + int elements = dsize/esize; + TCGv_i64 tcg_rn = new_tmp_a64(s); + TCGv_i64 tcg_rd = new_tmp_a64(s); + int i; + + if (size >= 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + /* For the LL variants the store is larger than the load, + * so if rd == rn we would overwrite parts of our input. + * So load everything right now and use shifts in the main loop. + */ + read_vec_element(s, tcg_rn, rn, is_q ? 
1 : 0, MO_64); + + for (i = 0; i < elements; i++) { + tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize); + ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0); + tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); + write_vec_element(s, tcg_rd, rd, i, size + 1); + } +} + +/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ +static void handle_vec_simd_shrn(DisasContext *s, bool is_q, + int immh, int immb, int opcode, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int dsize = 64; + int esize = 8 << size; + int elements = dsize/esize; + int shift = (2 * esize) - immhb; + bool round = extract32(opcode, 0, 1); + TCGv_i64 tcg_rn, tcg_rd, tcg_final; + TCGv_i64 tcg_round; + int i; + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_final = tcg_temp_new_i64(); + read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64); + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, size+1); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, true, size+1, shift); + + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_final); + + clear_vec_high(s, is_q, rd); +} + + +/* AdvSIMD shift by immediate + * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 + * +---+---+---+-------------+------+------+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | + * +---+---+---+-------------+------+------+--------+---+------+------+ + */ +static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int immb = extract32(insn, 16, 3); + int immh = extract32(insn, 19, 4); + bool is_u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + + /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. 
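+     * For non-zero immh the position of its most significant set bit
+     * encodes the element size (the handlers above compute
+     * size = 32 - clz32(immh) - 1), and the remaining low bits of
+     * immh:immb give the shift amount.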
*/ + assert(immh != 0); + + switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x00: /* SSHR / USHR */ + case 0x02: /* SSRA / USRA (accumulate) */ + case 0x04: /* SRSHR / URSHR (rounding) */ + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x0a: /* SHL / SLI */ + handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x10: /* SHRN */ + case 0x11: /* RSHRN / SQRSHRUN */ + if (is_u) { + handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb, + opcode, rn, rd); + } else { + handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd); + } + break; + case 0x12: /* SQSHRN / UQSHRN */ + case 0x13: /* SQRSHRN / UQRSHRN */ + handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x14: /* SSHLL / USHLL */ + handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x1c: /* SCVTF / UCVTF */ + handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, + opcode, rn, rd); + break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS/ FCVTZU */ + handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); + return; + default: + unallocated_encoding(s); + return; + } +} + +/* Generate code to do a "long" addition or subtraction, ie one done in + * TCGv_i64 on vector lanes twice the width specified by size. + */ +static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res, + TCGv_i64 tcg_op1, TCGv_i64 tcg_op2) +{ + static NeonGenTwo64OpFn * const fns[3][2] = { + { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 }, + { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 }, + { tcg_gen_add_i64, tcg_gen_sub_i64 }, + }; + NeonGenTwo64OpFn *genfn; + assert(size < 3); + + genfn = fns[size][is_sub]; + genfn(tcg_res, tcg_op1, tcg_op2); +} + +static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + /* 3-reg-different widening insns: 64 x 64 -> 128 */ + TCGv_i64 tcg_res[2]; + int pass, accop; + + tcg_res[0] = tcg_temp_new_i64(); + tcg_res[1] = tcg_temp_new_i64(); + + /* Does this op do an adding accumulate, a subtracting accumulate, + * or no accumulate at all? + */ + switch (opcode) { + case 5: + case 8: + case 9: + accop = 1; + break; + case 10: + case 11: + accop = -1; + break; + default: + accop = 0; + break; + } + + if (accop != 0) { + read_vec_element(s, tcg_res[0], rd, 0, MO_64); + read_vec_element(s, tcg_res[1], rd, 1, MO_64); + } + + /* size == 2 means two 32x32->64 operations; this is worth special + * casing because we can generally handle it inline. + */ + if (size == 2) { + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_passres; + MemOp memop = MO_32 | (is_u ? 
0 : MO_SIGN); + + int elt = pass + is_q * 2; + + read_vec_element(s, tcg_op1, rn, elt, memop); + read_vec_element(s, tcg_op2, rm, elt, memop); + + if (accop == 0) { + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + switch (opcode) { + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + { + TCGv_i64 tcg_tmp1 = tcg_temp_new_i64(); + TCGv_i64 tcg_tmp2 = tcg_temp_new_i64(); + + tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2); + tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1); + tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE, + tcg_passres, + tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2); + tcg_temp_free_i64(tcg_tmp1); + tcg_temp_free_i64(tcg_tmp2); + break; + } + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ + tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + break; + default: + g_assert_not_reached(); + } + + if (opcode == 9 || opcode == 11) { + /* saturating accumulate ops */ + if (accop < 0) { + tcg_gen_neg_i64(tcg_passres, tcg_passres); + } + gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, + tcg_res[pass], tcg_passres); + } else if (accop > 0) { + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + } else if (accop < 0) { + tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + } + + if (accop != 0) { + tcg_temp_free_i64(tcg_passres); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + /* size 0 or 1, generally helper functions */ + for (pass = 0; pass < 2; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i64 tcg_passres; + int elt = pass + is_q * 2; + + read_vec_element_i32(s, tcg_op1, rn, elt, MO_32); + read_vec_element_i32(s, tcg_op2, rm, elt, MO_32); + + if (accop == 0) { + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + switch (opcode) { + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + { + TCGv_i64 tcg_op2_64 = tcg_temp_new_i64(); + static NeonGenWidenFn * const widenfns[2][2] = { + { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, + { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, + }; + NeonGenWidenFn *widenfn = widenfns[size][is_u]; + + widenfn(tcg_op2_64, tcg_op2); + widenfn(tcg_passres, tcg_op1); + gen_neon_addl(size, (opcode == 2), tcg_passres, + tcg_passres, tcg_op2_64); + tcg_temp_free_i64(tcg_op2_64); + break; + } + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + if (size == 0) { + if (is_u) { + gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2); + } + } else { + if (is_u) { + gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2); + } + } + break; + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ + if (size 
== 0) { + if (is_u) { + gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2); + } + } else { + if (is_u) { + gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); + } + } + break; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + assert(size == 1); + gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + + if (accop != 0) { + if (opcode == 9 || opcode == 11) { + /* saturating accumulate ops */ + if (accop < 0) { + gen_helper_neon_negl_u32(tcg_passres, tcg_passres); + } + gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + } else { + gen_neon_addl(size, (accop < 0), tcg_res[pass], + tcg_res[pass], tcg_passres); + } + tcg_temp_free_i64(tcg_passres); + } + } + } + + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); +} + +static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + TCGv_i64 tcg_res[2]; + int part = is_q ? 2 : 0; + int pass; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i64 tcg_op2_wide = tcg_temp_new_i64(); + static NeonGenWidenFn * const widenfns[3][2] = { + { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, + { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, + { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 }, + }; + NeonGenWidenFn *widenfn = widenfns[size][is_u]; + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32); + widenfn(tcg_op2_wide, tcg_op2); + tcg_temp_free_i32(tcg_op2); + tcg_res[pass] = tcg_temp_new_i64(); + gen_neon_addl(size, (opcode == 3), + tcg_res[pass], tcg_op1, tcg_op2_wide); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2_wide); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in) +{ + tcg_gen_addi_i64(in, in, 1U << 31); + tcg_gen_extrh_i64_i32(res, in); +} + +static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + TCGv_i32 tcg_res[2]; + int part = is_q ? 
2 : 0; + int pass; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_wideres = tcg_temp_new_i64(); + static NeonGenNarrowFn * const narrowfns[3][2] = { + { gen_helper_neon_narrow_high_u8, + gen_helper_neon_narrow_round_high_u8 }, + { gen_helper_neon_narrow_high_u16, + gen_helper_neon_narrow_round_high_u16 }, + { tcg_gen_extrh_i64_i32, do_narrow_round_high_u32 }, + }; + NeonGenNarrowFn *gennarrow = narrowfns[size][is_u]; + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + + tcg_res[pass] = tcg_temp_new_i32(); + gennarrow(tcg_res[pass], tcg_wideres); + tcg_temp_free_i64(tcg_wideres); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three different + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) +{ + /* Instructions in this group fall into three basic classes + * (in each case with the operation working on each element in + * the input vectors): + * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra + * 128 bit input) + * (2) wide 64 x 128 -> 128 + * (3) narrowing 128 x 128 -> 64 + * Here we do initial decode, catch unallocated cases and + * dispatch to separate functions for each class. + */ + int is_q = extract32(insn, 30, 1); + int is_u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + switch (opcode) { + case 1: /* SADDW, SADDW2, UADDW, UADDW2 */ + case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */ + /* 64 x 128 -> 128 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */ + case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */ + /* 128 x 128 -> 64 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + case 14: /* PMULL, PMULL2 */ + if (is_u) { + unallocated_encoding(s); + return; + } + switch (size) { + case 0: /* PMULL.P8 */ + if (!fp_access_check(s)) { + return; + } + /* The Q field specifies lo/hi half input for this insn. */ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_neon_pmull_h); + break; + + case 3: /* PMULL.P64 */ + if (!dc_isar_feature(aa64_pmull, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + /* The Q field specifies lo/hi half input for this insn. 
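+             * (Q == 0 is PMULL and takes the low 64 bits of each source
+             * register; Q == 1 is PMULL2 and takes the high 64 bits.)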
*/ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_gvec_pmull_q); + break; + + default: + unallocated_encoding(s); + break; + } + return; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + if (is_u || size == 0) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* SMULL, SMULL2, UMULL, UMULL2 */ + /* 64 x 64 -> 128 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + default: + /* opcode 15 not allocated */ + unallocated_encoding(s); + break; + } +} + +/* Logic op (opcode == 3) subgroup of C3.6.16. */ +static void disas_simd_3same_logic(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool is_u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + + if (!fp_access_check(s)) { + return; + } + + switch (size + 4 * is_u) { + case 0: /* AND */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0); + return; + case 1: /* BIC */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0); + return; + case 2: /* ORR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0); + return; + case 3: /* ORN */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0); + return; + case 4: /* EOR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0); + return; + + case 5: /* BSL bitwise select */ + gen_gvec_fn4(s, is_q, rd, rd, rn, rm, tcg_gen_gvec_bitsel, 0); + return; + case 6: /* BIT, bitwise insert if true */ + gen_gvec_fn4(s, is_q, rd, rm, rn, rd, tcg_gen_gvec_bitsel, 0); + return; + case 7: /* BIF, bitwise insert if false */ + gen_gvec_fn4(s, is_q, rd, rm, rd, rn, tcg_gen_gvec_bitsel, 0); + return; + + default: + g_assert_not_reached(); + } +} + +/* Pairwise op subgroup of C3.6.16. + * + * This is called directly or via the handle_3same_float for float pairwise + * operations where the opcode and size are calculated differently. + */ +static void handle_simd_3same_pair(DisasContext *s, int is_q, int u, int opcode, + int size, int rn, int rm, int rd) +{ + TCGv_ptr fpst; + int pass; + + /* Floating point operations need fpst */ + if (opcode >= 0x58) { + fpst = fpstatus_ptr(FPST_FPCR); + } else { + fpst = NULL; + } + + if (!fp_access_check(s)) { + return; + } + + /* These operations work on the concatenated rm:rn, with each pair of + * adjacent elements being operated on to produce an element in the result. + */ + if (size == 3) { + TCGv_i64 tcg_res[2]; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + int passreg = (pass == 0) ? 
rn : rm; + + read_vec_element(s, tcg_op1, passreg, 0, MO_64); + read_vec_element(s, tcg_op2, passreg, 1, MO_64); + tcg_res[pass] = tcg_temp_new_i64(); + + switch (opcode) { + case 0x17: /* ADDP */ + tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); + break; + case 0x58: /* FMAXNMP */ + gen_helper_vfp_maxnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5a: /* FADDP */ + gen_helper_vfp_addd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5e: /* FMAXP */ + gen_helper_vfp_maxd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x78: /* FMINNMP */ + gen_helper_vfp_minnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x7e: /* FMINP */ + gen_helper_vfp_mind(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } else { + int maxpass = is_q ? 4 : 2; + TCGv_i32 tcg_res[4]; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + NeonGenTwoOpFn *genfn = NULL; + int passreg = pass < (maxpass / 2) ? rn : rm; + int passelt = (is_q && (pass & 1)) ? 2 : 0; + + read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_32); + read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_32); + tcg_res[pass] = tcg_temp_new_i32(); + + switch (opcode) { + case 0x17: /* ADDP */ + { + static NeonGenTwoOpFn * const fns[3] = { + gen_helper_neon_padd_u8, + gen_helper_neon_padd_u16, + tcg_gen_add_i32, + }; + genfn = fns[size]; + break; + } + case 0x14: /* SMAXP, UMAXP */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_pmax_s8, gen_helper_neon_pmax_u8 }, + { gen_helper_neon_pmax_s16, gen_helper_neon_pmax_u16 }, + { tcg_gen_smax_i32, tcg_gen_umax_i32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x15: /* SMINP, UMINP */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_pmin_s8, gen_helper_neon_pmin_u8 }, + { gen_helper_neon_pmin_s16, gen_helper_neon_pmin_u16 }, + { tcg_gen_smin_i32, tcg_gen_umin_i32 }, + }; + genfn = fns[size][u]; + break; + } + /* The FP operations are all on single floats (32 bit) */ + case 0x58: /* FMAXNMP */ + gen_helper_vfp_maxnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5a: /* FADDP */ + gen_helper_vfp_adds(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5e: /* FMAXP */ + gen_helper_vfp_maxs(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x78: /* FMINNMP */ + gen_helper_vfp_minnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x7e: /* FMINP */ + gen_helper_vfp_mins(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + /* FP ops called directly, otherwise call now */ + if (genfn) { + genfn(tcg_res[pass], tcg_op1, tcg_op2); + } + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + + for (pass = 0; pass < maxpass; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* Floating point op subgroup of C3.6.16. */ +static void disas_simd_3same_float(DisasContext *s, uint32_t insn) +{ + /* For floating point ops, the U, size[1] and opcode bits + * together indicate the operation. size[0] indicates single + * or double. 
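+     * Concretely, fpopcode = opcode | (size[1] << 5) | (U << 6), so the
+     * 0x20 and 0x40 bits in the case labels below come from size[1] and
+     * U respectively.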
+ */ + int fpopcode = extract32(insn, 11, 5) + | (extract32(insn, 23, 1) << 5) + | (extract32(insn, 29, 1) << 6); + int is_q = extract32(insn, 30, 1); + int size = extract32(insn, 22, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + int datasize = is_q ? 128 : 64; + int esize = 32 << size; + int elements = datasize / esize; + + if (size == 1 && !is_q) { + unallocated_encoding(s); + return; + } + + switch (fpopcode) { + case 0x58: /* FMAXNMP */ + case 0x5a: /* FADDP */ + case 0x5e: /* FMAXP */ + case 0x78: /* FMINNMP */ + case 0x7e: /* FMINP */ + if (size && !is_q) { + unallocated_encoding(s); + return; + } + handle_simd_3same_pair(s, is_q, 0, fpopcode, size ? MO_64 : MO_32, + rn, rm, rd); + return; + case 0x1b: /* FMULX */ + case 0x1f: /* FRECPS */ + case 0x3f: /* FRSQRTS */ + case 0x5d: /* FACGE */ + case 0x7d: /* FACGT */ + case 0x19: /* FMLA */ + case 0x39: /* FMLS */ + case 0x18: /* FMAXNM */ + case 0x1a: /* FADD */ + case 0x1c: /* FCMEQ */ + case 0x1e: /* FMAX */ + case 0x38: /* FMINNM */ + case 0x3a: /* FSUB */ + case 0x3e: /* FMIN */ + case 0x5b: /* FMUL */ + case 0x5c: /* FCMGE */ + case 0x5f: /* FDIV */ + case 0x7a: /* FABD */ + case 0x7c: /* FCMGT */ + if (!fp_access_check(s)) { + return; + } + handle_3same_float(s, size, elements, fpopcode, rd, rn, rm); + return; + + case 0x1d: /* FMLAL */ + case 0x3d: /* FMLSL */ + case 0x59: /* FMLAL2 */ + case 0x79: /* FMLSL2 */ + if (size & 1 || !dc_isar_feature(aa64_fhm, s)) { + unallocated_encoding(s); + return; + } + if (fp_access_check(s)) { + int is_s = extract32(insn, 23, 1); + int is_2 = extract32(insn, 29, 1); + int data = (is_2 << 1) | is_s; + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + data, gen_helper_gvec_fmlal_a64); + } + return; + + default: + unallocated_encoding(s); + return; + } +} + +/* Integer op subgroup of C3.6.16. 
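+ * Most cases below expand directly to generic vector (gvec) operations
+ * and return early; the remainder fall through to a per-element loop
+ * over the Neon helpers.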
*/ +static void disas_simd_3same_int(DisasContext *s, uint32_t insn) +{ + int is_q = extract32(insn, 30, 1); + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 11, 5); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int pass; + TCGCond cond; + + switch (opcode) { + case 0x13: /* MUL, PMUL */ + if (u && size != 0) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x0: /* SHADD, UHADD */ + case 0x2: /* SRHADD, URHADD */ + case 0x4: /* SHSUB, UHSUB */ + case 0xc: /* SMAX, UMAX */ + case 0xd: /* SMIN, UMIN */ + case 0xe: /* SABD, UABD */ + case 0xf: /* SABA, UABA */ + case 0x12: /* MLA, MLS */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x16: /* SQDMULH, SQRDMULH */ + if (size == 0 || size == 3) { + unallocated_encoding(s); + return; + } + break; + default: + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x01: /* SQADD, UQADD */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqadd_qc, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqadd_qc, size); + } + return; + case 0x05: /* SQSUB, UQSUB */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqsub_qc, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqsub_qc, size); + } + return; + case 0x08: /* SSHL, USHL */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_ushl, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sshl, size); + } + return; + case 0x0c: /* SMAX, UMAX */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umax, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smax, size); + } + return; + case 0x0d: /* SMIN, UMIN */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umin, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smin, size); + } + return; + case 0xe: /* SABD, UABD */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uabd, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sabd, size); + } + return; + case 0xf: /* SABA, UABA */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uaba, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_saba, size); + } + return; + case 0x10: /* ADD, SUB */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size); + } + return; + case 0x13: /* MUL, PMUL */ + if (!u) { /* MUL */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size); + } else { /* PMUL */ + gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0, gen_helper_gvec_pmul_b); + } + return; + case 0x12: /* MLA, MLS */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mls, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mla, size); + } + return; + case 0x16: /* SQDMULH, SQRDMULH */ + { + static gen_helper_gvec_3_ptr * const fns[2][2] = { + { gen_helper_neon_sqdmulh_h, gen_helper_neon_sqrdmulh_h }, + { gen_helper_neon_sqdmulh_s, gen_helper_neon_sqrdmulh_s }, + }; + gen_gvec_op3_qc(s, is_q, rd, rn, rm, fns[size - 1][u]); + } + return; + case 0x11: + if (!u) { /* CMTST */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_cmtst, size); + return; + } + /* else CMEQ */ + cond = TCG_COND_EQ; + goto do_gvec_cmp; + case 0x06: /* CMGT, CMHI */ + cond = u ? 
TCG_COND_GTU : TCG_COND_GT; + goto do_gvec_cmp; + case 0x07: /* CMGE, CMHS */ + cond = u ? TCG_COND_GEU : TCG_COND_GE; + do_gvec_cmp: + tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s)); + return; + } + + if (size == 3) { + assert(is_q); + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2); + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + for (pass = 0; pass < (is_q ? 4 : 2); pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + NeonGenTwoOpFn *genfn = NULL; + NeonGenTwoOpEnvFn *genenvfn = NULL; + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); + + switch (opcode) { + case 0x0: /* SHADD, UHADD */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_hadd_s8, gen_helper_neon_hadd_u8 }, + { gen_helper_neon_hadd_s16, gen_helper_neon_hadd_u16 }, + { gen_helper_neon_hadd_s32, gen_helper_neon_hadd_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x2: /* SRHADD, URHADD */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_rhadd_s8, gen_helper_neon_rhadd_u8 }, + { gen_helper_neon_rhadd_s16, gen_helper_neon_rhadd_u16 }, + { gen_helper_neon_rhadd_s32, gen_helper_neon_rhadd_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x4: /* SHSUB, UHSUB */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_hsub_s8, gen_helper_neon_hsub_u8 }, + { gen_helper_neon_hsub_s16, gen_helper_neon_hsub_u16 }, + { gen_helper_neon_hsub_s32, gen_helper_neon_hsub_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x9: /* SQSHL, UQSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, + { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, + { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0xa: /* SRSHL, URSHL */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_rshl_s8, gen_helper_neon_rshl_u8 }, + { gen_helper_neon_rshl_s16, gen_helper_neon_rshl_u16 }, + { gen_helper_neon_rshl_s32, gen_helper_neon_rshl_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0xb: /* SQRSHL, UQRSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, + { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, + { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + default: + g_assert_not_reached(); + } + + if (genenvfn) { + genenvfn(tcg_res, cpu_env, tcg_op1, tcg_op2); + } else { + genfn(tcg_res, tcg_op1, tcg_op2); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three same + * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 1 
| Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+---+------+------+ + */ +static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 11, 5); + + switch (opcode) { + case 0x3: /* logic ops */ + disas_simd_3same_logic(s, insn); + break; + case 0x17: /* ADDP */ + case 0x14: /* SMAXP, UMAXP */ + case 0x15: /* SMINP, UMINP */ + { + /* Pairwise operations */ + int is_q = extract32(insn, 30, 1); + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + if (opcode == 0x17) { + if (u || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + } else { + if (size == 3) { + unallocated_encoding(s); + return; + } + } + handle_simd_3same_pair(s, is_q, u, opcode, size, rn, rm, rd); + break; + } + case 0x18 ... 0x31: + /* floating point ops, sz[1] and U are part of opcode */ + disas_simd_3same_float(s, insn); + break; + default: + disas_simd_3same_int(s, insn); + break; + } +} + +/* + * Advanced SIMD three same (ARMv8.2 FP16 variants) + * + * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 + * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | + * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ + * + * This includes FMULX, FCMEQ (register), FRECPS, FRSQRTS, FCMGE + * (register), FACGE, FABD, FCMGT (register) and FACGT. + * + */ +static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 11, 3); + int u = extract32(insn, 29, 1); + int a = extract32(insn, 23, 1); + int is_q = extract32(insn, 30, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + /* + * For these floating point ops, the U, a and opcode bits + * together indicate the operation. + */ + int fpopcode = opcode | (a << 3) | (u << 4); + int datasize = is_q ? 128 : 64; + int elements = datasize / 16; + bool pairwise; + TCGv_ptr fpst; + int pass; + + switch (fpopcode) { + case 0x0: /* FMAXNM */ + case 0x1: /* FMLA */ + case 0x2: /* FADD */ + case 0x3: /* FMULX */ + case 0x4: /* FCMEQ */ + case 0x6: /* FMAX */ + case 0x7: /* FRECPS */ + case 0x8: /* FMINNM */ + case 0x9: /* FMLS */ + case 0xa: /* FSUB */ + case 0xe: /* FMIN */ + case 0xf: /* FRSQRTS */ + case 0x13: /* FMUL */ + case 0x14: /* FCMGE */ + case 0x15: /* FACGE */ + case 0x17: /* FDIV */ + case 0x1a: /* FABD */ + case 0x1c: /* FCMGT */ + case 0x1d: /* FACGT */ + pairwise = false; + break; + case 0x10: /* FMAXNMP */ + case 0x12: /* FADDP */ + case 0x16: /* FMAXP */ + case 0x18: /* FMINNMP */ + case 0x1e: /* FMINP */ + pairwise = true; + break; + default: + unallocated_encoding(s); + return; + } + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + + if (pairwise) { + int maxpass = is_q ? 8 : 4; + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res[8]; + + for (pass = 0; pass < maxpass; pass++) { + int passreg = pass < (maxpass / 2) ? 
rn : rm; + int passelt = (pass << 1) & (maxpass - 1); + + read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_16); + read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_16); + tcg_res[pass] = tcg_temp_new_i32(); + + switch (fpopcode) { + case 0x10: /* FMAXNMP */ + gen_helper_advsimd_maxnumh(tcg_res[pass], tcg_op1, tcg_op2, + fpst); + break; + case 0x12: /* FADDP */ + gen_helper_advsimd_addh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x16: /* FMAXP */ + gen_helper_advsimd_maxh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x18: /* FMINNMP */ + gen_helper_advsimd_minnumh(tcg_res[pass], tcg_op1, tcg_op2, + fpst); + break; + case 0x1e: /* FMINP */ + gen_helper_advsimd_minh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } + + for (pass = 0; pass < maxpass; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_16); + tcg_temp_free_i32(tcg_res[pass]); + } + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + + } else { + for (pass = 0; pass < elements; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_16); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_16); + + switch (fpopcode) { + case 0x0: /* FMAXNM */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, MO_16); + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, + fpst); + break; + case 0x2: /* FADD */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FMULX */ + gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FCMEQ */ + gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAX */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FRECPS */ + gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FMINNM */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x9: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); + read_vec_element_i32(s, tcg_res, rd, pass, MO_16); + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, + fpst); + break; + case 0xa: /* FSUB */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xe: /* FMIN */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FRSQRTS */ + gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x13: /* FMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x14: /* FCMGE */ + gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x15: /* FACGE */ + gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x17: /* FDIV */ + gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FABD */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); + break; + case 0x1c: /* FCMGT */ + gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1d: /* FACGT */ + gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_16); + tcg_temp_free_i32(tcg_res); + 
tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + + tcg_temp_free_ptr(fpst); + + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + bool feature; + int rot; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_rdm, s); + break; + case 0x02: /* SDOT (vector) */ + case 0x12: /* UDOT (vector) */ + if (size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_dp, s); + break; + case 0x03: /* USDOT */ + if (size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_i8mm, s); + break; + case 0x04: /* SMMLA */ + case 0x14: /* UMMLA */ + case 0x05: /* USMMLA */ + if (!is_q || size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_i8mm, s); + break; + case 0x18: /* FCMLA, #0 */ + case 0x19: /* FCMLA, #90 */ + case 0x1a: /* FCMLA, #180 */ + case 0x1b: /* FCMLA, #270 */ + case 0x1c: /* FCADD, #90 */ + case 0x1e: /* FCADD, #270 */ + if (size == 0 + || (size == 1 && !dc_isar_feature(aa64_fp16, s)) + || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_fcma, s); + break; + case 0x1d: /* BFMMLA */ + if (size != MO_16 || !is_q) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_bf16, s); + break; + case 0x1f: + switch (size) { + case 1: /* BFDOT */ + case 3: /* BFMLAL{B,T} */ + feature = dc_isar_feature(aa64_bf16, s); + break; + default: + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + if (!feature) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x0: /* SQRDMLAH (vector) */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlah_qc, size); + return; + + case 0x1: /* SQRDMLSH (vector) */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlsh_qc, size); + return; + + case 0x2: /* SDOT / UDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, + u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b); + return; + + case 0x3: /* USDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_usdot_b); + return; + + case 0x04: /* SMMLA, UMMLA */ + gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, + u ? 
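/* the U bit selects the unsigned matrix multiply */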
gen_helper_gvec_ummla_b + : gen_helper_gvec_smmla_b); + return; + case 0x05: /* USMMLA */ + gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, gen_helper_gvec_usmmla_b); + return; + + case 0x8: /* FCMLA, #0 */ + case 0x9: /* FCMLA, #90 */ + case 0xa: /* FCMLA, #180 */ + case 0xb: /* FCMLA, #270 */ + rot = extract32(opcode, 0, 2); + switch (size) { + case 1: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, true, rot, + gen_helper_gvec_fcmlah); + break; + case 2: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, + gen_helper_gvec_fcmlas); + break; + case 3: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, + gen_helper_gvec_fcmlad); + break; + default: + g_assert_not_reached(); + } + return; + + case 0xc: /* FCADD, #90 */ + case 0xe: /* FCADD, #270 */ + rot = extract32(opcode, 1, 1); + switch (size) { + case 1: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddh); + break; + case 2: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcadds); + break; + case 3: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddd); + break; + default: + g_assert_not_reached(); + } + return; + + case 0xd: /* BFMMLA */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla); + return; + case 0xf: + switch (size) { + case 1: /* BFDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot); + break; + case 3: /* BFMLAL{B,T} */ + gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, false, is_q, + gen_helper_gvec_bfmlal); + break; + default: + g_assert_not_reached(); + } + return; + + default: + g_assert_not_reached(); + } +} + +static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q, + int size, int rn, int rd) +{ + /* Handle 2-reg-misc ops which are widening (so each size element + * in the source becomes a 2*size element in the destination. + * The only instruction like this is FCVTL. + */ + int pass; + + if (size == 3) { + /* 32 -> 64 bit fp conversion */ + TCGv_i64 tcg_res[2]; + int srcelt = is_q ? 2 : 0; + + for (pass = 0; pass < 2; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32); + gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env); + tcg_temp_free_i32(tcg_op); + } + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } else { + /* 16 -> 32 bit fp conversion */ + int srcelt = is_q ? 4 : 0; + TCGv_i32 tcg_res[4]; + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + + for (pass = 0; pass < 4; pass++) { + tcg_res[pass] = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16); + gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass], + fpst, ahp); + } + for (pass = 0; pass < 4; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } +} + +static void handle_rev(DisasContext *s, int opcode, bool u, + bool is_q, int size, int rn, int rd) +{ + int op = (opcode << 1) | u; + int opsz = op + size; + int grp_size = 3 - opsz; + int dsize = is_q ? 
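/* vector data size in bits */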
128 : 64; + int i; + + if (opsz >= 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 0) { + /* Special case bytes, use bswap op on each group of elements */ + int groups = dsize / (8 << grp_size); + + for (i = 0; i < groups; i++) { + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + read_vec_element(s, tcg_tmp, rn, i, grp_size); + switch (grp_size) { + case MO_16: + tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); + break; + case MO_32: + tcg_gen_bswap32_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); + break; + case MO_64: + tcg_gen_bswap64_i64(tcg_tmp, tcg_tmp); + break; + default: + g_assert_not_reached(); + } + write_vec_element(s, tcg_tmp, rd, i, grp_size); + tcg_temp_free_i64(tcg_tmp); + } + clear_vec_high(s, is_q, rd); + } else { + int revmask = (1 << grp_size) - 1; + int esize = 8 << size; + int elements = dsize / esize; + TCGv_i64 tcg_rn = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = tcg_const_i64(0); + TCGv_i64 tcg_rd_hi = tcg_const_i64(0); + + for (i = 0; i < elements; i++) { + int e_rev = (i & 0xf) ^ revmask; + int off = e_rev * esize; + read_vec_element(s, tcg_rn, rn, i, size); + if (off >= 64) { + tcg_gen_deposit_i64(tcg_rd_hi, tcg_rd_hi, + tcg_rn, off - 64, esize); + } else { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, off, esize); + } + } + write_vec_element(s, tcg_rd, rd, 0, MO_64); + write_vec_element(s, tcg_rd_hi, rd, 1, MO_64); + + tcg_temp_free_i64(tcg_rd_hi); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + } +} + +static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u, + bool is_q, int size, int rn, int rd) +{ + /* Implement the pairwise operations from 2-misc: + * SADDLP, UADDLP, SADALP, UADALP. + * These all add pairs of elements in the input to produce a + * double-width result element in the output (possibly accumulating). + */ + bool accum = (opcode == 0x6); + int maxpass = is_q ? 2 : 1; + int pass; + TCGv_i64 tcg_res[2]; + + if (size == 2) { + /* 32 + 32 -> 64 op */ + MemOp memop = size + (u ? 
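/* unsigned forms zero-extend the inputs, signed forms sign-extend */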
0 : MO_SIGN); + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass * 2, memop); + read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop); + tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); + if (accum) { + read_vec_element(s, tcg_op1, rd, pass, MO_64); + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenOne64OpFn *genfn; + static NeonGenOne64OpFn * const fns[2][2] = { + { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 }, + { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 }, + }; + + genfn = fns[size][u]; + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_res[pass], tcg_op); + + if (accum) { + read_vec_element(s, tcg_op, rd, pass, MO_64); + if (size == 0) { + gen_helper_neon_addl_u16(tcg_res[pass], + tcg_res[pass], tcg_op); + } else { + gen_helper_neon_addl_u32(tcg_res[pass], + tcg_res[pass], tcg_op); + } + } + tcg_temp_free_i64(tcg_op); + } + } + if (!is_q) { + tcg_res[1] = tcg_constant_i64(0); + } + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd) +{ + /* Implement SHLL and SHLL2 */ + int pass; + int part = is_q ? 2 : 0; + TCGv_i64 tcg_res[2]; + + for (pass = 0; pass < 2; pass++) { + static NeonGenWidenFn * const widenfns[3] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + }; + NeonGenWidenFn *widenfn = widenfns[size]; + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32); + tcg_res[pass] = tcg_temp_new_i64(); + widenfn(tcg_res[pass], tcg_op); + tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size); + + tcg_temp_free_i32(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +/* AdvSIMD two reg misc + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + bool u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool need_fpstatus = false; + bool need_rmode = false; + int rmode = -1; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; + + switch (opcode) { + case 0x0: /* REV64, REV32 */ + case 0x1: /* REV16 */ + handle_rev(s, opcode, u, is_q, size, rn, rd); + return; + case 0x5: /* CNT, NOT, RBIT */ + if (u && size == 0) { + /* NOT */ + break; + } else if (u && size == 1) { + /* RBIT */ + break; + } else if (!u && size == 0) { + /* CNT */ + break; + } + unallocated_encoding(s); + return; + case 0x12: /* XTN, XTN2, SQXTUN, SQXTUN2 */ + case 0x14: /* SQXTN, SQXTN2, UQXTN, UQXTN2 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + handle_2misc_narrow(s, 
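/* scalar = */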
false, opcode, u, is_q, size, rn, rd); + return; + case 0x4: /* CLS, CLZ */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x2: /* SADDLP, UADDLP */ + case 0x6: /* SADALP, UADALP */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd); + return; + case 0x13: /* SHLL, SHLL2 */ + if (u == 0 || size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_shll(s, is_q, size, rn, rd); + return; + case 0xa: /* CMLT */ + if (u == 1) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x8: /* CMGT, CMGE */ + case 0x9: /* CMEQ, CMLE */ + case 0xb: /* ABS, NEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x3: /* SUQADD, USQADD */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_satacc(s, false, u, is_q, size, rn, rd); + return; + case 0x7: /* SQABS, SQNEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0xc ... 0xf: + case 0x16 ... 0x1f: + { + /* Floating point: U, size[1] and opcode indicate operation; + * size[0] indicates single or double precision. + */ + int is_double = extract32(size, 0, 1); + opcode |= (extract32(size, 1, 1) << 5) | (u << 6); + size = is_double ? 3 : 2; + switch (opcode) { + case 0x2f: /* FABS */ + case 0x6f: /* FNEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d) ? true : false; + int elements = is_double ? 2 : is_q ? 4 : 2; + if (is_double && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size); + return; + } + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd); + return; + case 0x7f: /* FSQRT */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + need_fpstatus = true; + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x5c: /* FCVTAU */ + case 0x1c: /* FCVTAS */ + need_fpstatus = true; + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x3c: /* URECPE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x3d: /* FRECPE */ + case 0x7d: /* FRSQRTE */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd); + return; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x16: /* FCVTN, FCVTN2 */ + /* handle_2misc_narrow does a 2*size -> size operation, but these + * instructions encode the source 
size rather than dest size. + */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); + return; + case 0x36: /* BFCVTN, BFCVTN2 */ + if (!dc_isar_feature(aa64_bf16, s) || size != 2) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); + return; + case 0x17: /* FCVTL, FCVTL2 */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_widening(s, opcode, is_q, size, rn, rd); + return; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + /* fall through */ + case 0x59: /* FRINTX */ + case 0x79: /* FRINTI */ + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x58: /* FRINTA */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x7c: /* URSQRTE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x1e: /* FRINT32Z */ + case 0x1f: /* FRINT64Z */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + /* fall through */ + case 0x5e: /* FRINT32X */ + case 0x5f: /* FRINT64X */ + need_fpstatus = true; + if ((size == 3 && !is_q) || !dc_isar_feature(aa64_frint, s)) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + break; + } + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (need_fpstatus || need_rmode) { + tcg_fpstatus = fpstatus_ptr(FPST_FPCR); + } else { + tcg_fpstatus = NULL; + } + if (need_rmode) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } else { + tcg_rmode = NULL; + } + + switch (opcode) { + case 0x5: + if (u && size == 0) { /* NOT */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0); + return; + } + break; + case 0x8: /* CMGT, CMGE */ + if (u) { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cge0, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cgt0, size); + } + return; + case 0x9: /* CMEQ, CMLE */ + if (u) { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cle0, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_ceq0, size); + } + return; + case 0xa: /* CMLT */ + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_clt0, size); + return; + case 0xb: + if (u) { /* ABS, NEG */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_abs, size); + } + return; + } + + if (size == 3) { + /* All 64-bit element operations can be shared with scalar 2misc */ + int pass; + + /* Coverity claims (size == 3 && !is_q) has been eliminated + * from all paths leading to here. + */ + tcg_debug_assert(is_q); + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + + handle_2misc_64(s, opcode, u, tcg_res, tcg_op, + tcg_rmode, tcg_fpstatus); + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + } + } else { + int pass; + + for (pass = 0; pass < (is_q ? 
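/* one pass per 32-bit chunk of the vector */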
4 : 2); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + + if (size == 2) { + /* Special cases for 32 bit elements */ + switch (opcode) { + case 0x4: /* CLS */ + if (u) { + tcg_gen_clzi_i32(tcg_res, tcg_op, 32); + } else { + tcg_gen_clrsb_i32(tcg_res, tcg_op); + } + break; + case 0x7: /* SQABS, SQNEG */ + if (u) { + gen_helper_neon_qneg_s32(tcg_res, cpu_env, tcg_op); + } else { + gen_helper_neon_qabs_s32(tcg_res, cpu_env, tcg_op); + } + break; + case 0x2f: /* FABS */ + gen_helper_vfp_abss(tcg_res, tcg_op); + break; + case 0x6f: /* FNEG */ + gen_helper_vfp_negs(tcg_res, tcg_op); + break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosls(tcg_res, tcg_op, + tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touls(tcg_res, tcg_op, + tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x7c: /* URSQRTE */ + gen_helper_rsqrte_u32(tcg_res, tcg_op); + break; + case 0x1e: /* FRINT32Z */ + case 0x5e: /* FRINT32X */ + gen_helper_frint32_s(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x1f: /* FRINT64Z */ + case 0x5f: /* FRINT64X */ + gen_helper_frint64_s(tcg_res, tcg_op, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + } else { + /* Use helpers for 8 and 16 bit elements */ + switch (opcode) { + case 0x5: /* CNT, RBIT */ + /* For these two insns size is part of the opcode specifier + * (handled earlier); they always operate on byte elements. 
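+ * CNT counts the set bits in each byte; RBIT reverses the bit order within each byte.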
+ */ + if (u) { + gen_helper_neon_rbit_u8(tcg_res, tcg_op); + } else { + gen_helper_neon_cnt_u8(tcg_res, tcg_op); + } + break; + case 0x7: /* SQABS, SQNEG */ + { + NeonGenOneOpEnvFn *genfn; + static NeonGenOneOpEnvFn * const fns[2][2] = { + { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, + { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, + }; + genfn = fns[size][u]; + genfn(tcg_res, cpu_env, tcg_op); + break; + } + case 0x4: /* CLS, CLZ */ + if (u) { + if (size == 0) { + gen_helper_neon_clz_u8(tcg_res, tcg_op); + } else { + gen_helper_neon_clz_u16(tcg_res, tcg_op); + } + } else { + if (size == 0) { + gen_helper_neon_cls_s8(tcg_res, tcg_op); + } else { + gen_helper_neon_cls_s16(tcg_res, tcg_op); + } + } + break; + default: + g_assert_not_reached(); + } + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } + } + clear_vec_high(s, is_q, rd); + + if (need_rmode) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + if (need_fpstatus) { + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* AdvSIMD [scalar] two register miscellaneous (FP16) + * + * 31 30 29 28 27 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ + * | 0 | Q | U | S | 1 1 1 0 | a | 1 1 1 1 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ + * mask: 1000 1111 0111 1110 0000 1100 0000 0000 0x8f7e 0c00 + * val: 0000 1110 0111 1000 0000 1000 0000 0000 0x0e78 0800 + * + * This actually covers two groups where scalar access is governed by + * bit 28. A bunch of the instructions (float to integral) only exist + * in the vector form and are un-allocated for the scalar decode. Also + * in the scalar decode Q is always 1. + */ +static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn) +{ + int fpop, opcode, a, u; + int rn, rd; + bool is_q; + bool is_scalar; + bool only_in_vector = false; + + int pass; + TCGv_i32 tcg_rmode = NULL; + TCGv_ptr tcg_fpstatus = NULL; + bool need_rmode = false; + bool need_fpst = true; + int rmode; + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + rd = extract32(insn, 0, 5); + rn = extract32(insn, 5, 5); + + a = extract32(insn, 23, 1); + u = extract32(insn, 29, 1); + is_scalar = extract32(insn, 28, 1); + is_q = extract32(insn, 30, 1); + + opcode = extract32(insn, 12, 5); + fpop = deposit32(opcode, 5, 1, a); + fpop = deposit32(fpop, 6, 1, u); + + switch (fpop) { + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + int elements; + + if (is_scalar) { + elements = 1; + } else { + elements = (is_q ? 
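/* number of 16-bit lanes */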
8 : 4); + } + + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, elements, !u, 0, MO_16); + return; + } + break; + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + handle_2misc_fcmp_zero(s, fpop, is_scalar, 0, is_q, MO_16, rn, rd); + return; + case 0x3d: /* FRECPE */ + case 0x3f: /* FRECPX */ + break; + case 0x18: /* FRINTN */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x19: /* FRINTM */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x38: /* FRINTP */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_POSINF; + break; + case 0x39: /* FRINTZ */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_ZERO; + break; + case 0x58: /* FRINTA */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x59: /* FRINTX */ + case 0x79: /* FRINTI */ + only_in_vector = true; + /* current rounding mode */ + break; + case 0x1a: /* FCVTNS */ + need_rmode = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x1b: /* FCVTMS */ + need_rmode = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x1c: /* FCVTAS */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x3a: /* FCVTPS */ + need_rmode = true; + rmode = FPROUNDING_POSINF; + break; + case 0x3b: /* FCVTZS */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + break; + case 0x5a: /* FCVTNU */ + need_rmode = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x5b: /* FCVTMU */ + need_rmode = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x5c: /* FCVTAU */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x7a: /* FCVTPU */ + need_rmode = true; + rmode = FPROUNDING_POSINF; + break; + case 0x7b: /* FCVTZU */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + break; + case 0x2f: /* FABS */ + case 0x6f: /* FNEG */ + need_fpst = false; + break; + case 0x7d: /* FRSQRTE */ + case 0x7f: /* FSQRT (vector) */ + break; + default: + unallocated_encoding(s); + return; + } + + + /* Check additional constraints for the scalar encoding */ + if (is_scalar) { + if (!is_q) { + unallocated_encoding(s); + return; + } + /* FRINTxx is only in the vector form */ + if (only_in_vector) { + unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + if (need_rmode || need_fpst) { + tcg_fpstatus = fpstatus_ptr(FPST_FPCR_F16); + } + + if (need_rmode) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } + + if (is_scalar) { + TCGv_i32 tcg_op = read_fp_hreg(s, rn); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + switch (fpop) { + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x6f: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); + 
break; + default: + g_assert_not_reached(); + } + + /* limit any sign extension going on */ + tcg_gen_andi_i32(tcg_res, tcg_res, 0xffff); + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } else { + for (pass = 0; pass < (is_q ? 8 : 4); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, MO_16); + + switch (fpop) { + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_advsimd_rinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x2f: /* FABS */ + tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); + break; + case 0x6f: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x7f: /* FSQRT */ + gen_helper_sqrt_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_16); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } + + clear_vec_high(s, is_q, rd); + } + + if (tcg_rmode) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + + if (tcg_fpstatus) { + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* AdvSIMD scalar x indexed element + * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 + * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ + * | 0 1 | U | 1 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | + * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ + * AdvSIMD vector x indexed element + * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ + * | 0 | Q | U | 0 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | + * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ + */ +static void disas_simd_indexed(DisasContext *s, uint32_t insn) +{ + /* This encoding has two kinds of instruction: + * normal, where we perform elt x idxelt => elt for each + * element in the vector + * long, where we perform elt x idxelt and generate a result of + * double the width of the input element + * The long ops have a 'part' specifier (ie come in INSN, INSN2 pairs). 
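+ * The INSN2 forms take their inputs from the high half of the source vectors.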
+ */ + bool is_scalar = extract32(insn, 28, 1); + bool is_q = extract32(insn, 30, 1); + bool u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int l = extract32(insn, 21, 1); + int m = extract32(insn, 20, 1); + /* Note that the Rm field here is only 4 bits, not 5 as it usually is */ + int rm = extract32(insn, 16, 4); + int opcode = extract32(insn, 12, 4); + int h = extract32(insn, 11, 1); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool is_long = false; + int is_fp = 0; + bool is_fp16 = false; + int index; + TCGv_ptr fpst; + + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ + if (is_scalar) { + unallocated_encoding(s); + return; + } + break; + case 0x02: /* SMLAL, SMLAL2 */ + case 0x12: /* UMLAL, UMLAL2 */ + case 0x06: /* SMLSL, SMLSL2 */ + case 0x16: /* UMLSL, UMLSL2 */ + case 0x0a: /* SMULL, SMULL2 */ + case 0x1a: /* UMULL, UMULL2 */ + if (is_scalar) { + unallocated_encoding(s); + return; + } + is_long = true; + break; + case 0x03: /* SQDMLAL, SQDMLAL2 */ + case 0x07: /* SQDMLSL, SQDMLSL2 */ + case 0x0b: /* SQDMULL, SQDMULL2 */ + is_long = true; + break; + case 0x0c: /* SQDMULH */ + case 0x0d: /* SQRDMULH */ + break; + case 0x01: /* FMLA */ + case 0x05: /* FMLS */ + case 0x09: /* FMUL */ + case 0x19: /* FMULX */ + is_fp = 1; + break; + case 0x1d: /* SQRDMLAH */ + case 0x1f: /* SQRDMLSH */ + if (!dc_isar_feature(aa64_rdm, s)) { + unallocated_encoding(s); + return; + } + break; + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_dp, s)) { + unallocated_encoding(s); + return; + } + break; + case 0x0f: + switch (size) { + case 0: /* SUDOT */ + case 2: /* USDOT */ + if (is_scalar || !dc_isar_feature(aa64_i8mm, s)) { + unallocated_encoding(s); + return; + } + size = MO_32; + break; + case 1: /* BFDOT */ + if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { + unallocated_encoding(s); + return; + } + size = MO_32; + break; + case 3: /* BFMLAL{B,T} */ + if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { + unallocated_encoding(s); + return; + } + /* can't set is_fp without other incorrect size checks */ + size = MO_16; + break; + default: + unallocated_encoding(s); + return; + } + break; + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + if (is_scalar || !dc_isar_feature(aa64_fcma, s)) { + unallocated_encoding(s); + return; + } + is_fp = 2; + break; + case 0x00: /* FMLAL */ + case 0x04: /* FMLSL */ + case 0x18: /* FMLAL2 */ + case 0x1c: /* FMLSL2 */ + if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) { + unallocated_encoding(s); + return; + } + size = MO_16; + /* is_fp, but we pass cpu_env not fp_status. */ + break; + default: + unallocated_encoding(s); + return; + } + + switch (is_fp) { + case 1: /* normal fp */ + /* convert insn encoded size to MemOp size */ + switch (size) { + case 0: /* half-precision */ + size = MO_16; + is_fp16 = true; + break; + case MO_32: /* single precision */ + case MO_64: /* double precision */ + break; + default: + unallocated_encoding(s); + return; + } + break; + + case 2: /* complex fp */ + /* Each indexable element is a complex pair. 
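The size is bumped below so the index selects a whole (real, imag) pair.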
*/ + size += 1; + switch (size) { + case MO_32: + if (h && !is_q) { + unallocated_encoding(s); + return; + } + is_fp16 = true; + break; + case MO_64: + break; + default: + unallocated_encoding(s); + return; + } + break; + + default: /* integer */ + switch (size) { + case MO_8: + case MO_64: + unallocated_encoding(s); + return; + } + break; + } + if (is_fp16 && !dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + /* Given MemOp size, adjust register and indexing. */ + switch (size) { + case MO_16: + index = h << 2 | l << 1 | m; + break; + case MO_32: + index = h << 1 | l; + rm |= m << 4; + break; + case MO_64: + if (l || !is_q) { + unallocated_encoding(s); + return; + } + index = h; + rm |= m << 4; + break; + default: + g_assert_not_reached(); + } + + if (!fp_access_check(s)) { + return; + } + + if (is_fp) { + fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + } else { + fpst = NULL; + } + + switch (16 * u + opcode) { + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + u ? gen_helper_gvec_udot_idx_b + : gen_helper_gvec_sdot_idx_b); + return; + case 0x0f: + switch (extract32(insn, 22, 2)) { + case 0: /* SUDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_sudot_idx_b); + return; + case 1: /* BFDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_bfdot_idx); + return; + case 2: /* USDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_usdot_idx_b); + return; + case 3: /* BFMLAL{B,T} */ + gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, 0, (index << 1) | is_q, + gen_helper_gvec_bfmlal_idx); + return; + } + g_assert_not_reached(); + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + { + int rot = extract32(insn, 13, 2); + int data = (index << 2) | rot; + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, + size == MO_64 + ? gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + tcg_temp_free_ptr(fpst); + } + return; + + case 0x00: /* FMLAL */ + case 0x04: /* FMLSL */ + case 0x18: /* FMLAL2 */ + case 0x1c: /* FMLSL2 */ + { + int is_s = extract32(opcode, 2, 1); + int is_2 = u; + int data = (index << 2) | (is_2 << 1) | is_s; + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + data, gen_helper_gvec_fmlal_idx_a64); + } + return; + + case 0x08: /* MUL */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_3 * const fns[3] = { + gen_helper_gvec_mul_idx_h, + gen_helper_gvec_mul_idx_s, + gen_helper_gvec_mul_idx_d, + }; + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + + case 0x10: /* MLA */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_4 * const fns[3] = { + gen_helper_gvec_mla_idx_h, + gen_helper_gvec_mla_idx_s, + gen_helper_gvec_mla_idx_d, + }; + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), + is_q ? 
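/* oprsz in bytes */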
16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + + case 0x14: /* MLS */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_4 * const fns[3] = { + gen_helper_gvec_mls_idx_h, + gen_helper_gvec_mls_idx_s, + gen_helper_gvec_mls_idx_d, + }; + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + } + + if (size == 3) { + TCGv_i64 tcg_idx = tcg_temp_new_i64(); + int pass; + + assert(is_fp && is_q && !is_long); + + read_vec_element(s, tcg_idx, rm, index, MO_64); + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + + switch (16 * u + opcode) { + case 0x05: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negd(tcg_op, tcg_op); + /* fall through */ + case 0x01: /* FMLA */ + read_vec_element(s, tcg_res, rd, pass, MO_64); + gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst); + break; + case 0x09: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst); + break; + case 0x19: /* FMULX */ + gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element(s, tcg_res, rd, pass, MO_64); + tcg_temp_free_i64(tcg_op); + tcg_temp_free_i64(tcg_res); + } + + tcg_temp_free_i64(tcg_idx); + clear_vec_high(s, !is_scalar, rd); + } else if (!is_long) { + /* 32 bit floating point, or 16 or 32 bit integer. + * For the 16 bit scalar case we use the usual Neon helpers and + * rely on the fact that 0 op 0 == 0 with no side effects. + */ + TCGv_i32 tcg_idx = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 4 : 2; + } + + read_vec_element_i32(s, tcg_idx, rm, index, size); + + if (size == 1 && !is_scalar) { + /* The simplest way to handle the 16x16 indexed ops is to duplicate + * the index into both halves of the 32 bit tcg_idx and then use + * the usual Neon helpers. + */ + tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); + } + + for (pass = 0; pass < maxpasses; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32); + + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ + { + static NeonGenTwoOpFn * const fns[2][2] = { + { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, + { tcg_gen_add_i32, tcg_gen_sub_i32 }, + }; + NeonGenTwoOpFn *genfn; + bool is_sub = opcode == 0x4; + + if (size == 1) { + gen_helper_neon_mul_u16(tcg_res, tcg_op, tcg_idx); + } else { + tcg_gen_mul_i32(tcg_res, tcg_op, tcg_idx); + } + if (opcode == 0x8) { + break; + } + read_vec_element_i32(s, tcg_op, rd, pass, MO_32); + genfn = fns[size - 1][is_sub]; + genfn(tcg_res, tcg_op, tcg_res); + break; + } + case 0x05: /* FMLS */ + case 0x01: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? 
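/* the scalar form reads the accumulator at the true element size */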
size : MO_32); + switch (size) { + case 1: + if (opcode == 0x5) { + /* As usual for ARM, separate negation for fused + * multiply-add */ + tcg_gen_xori_i32(tcg_op, tcg_op, 0x80008000); + } + if (is_scalar) { + gen_helper_advsimd_muladdh(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } else { + gen_helper_advsimd_muladd2h(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } + break; + case 2: + if (opcode == 0x5) { + /* As usual for ARM, separate negation for + * fused multiply-add */ + tcg_gen_xori_i32(tcg_op, tcg_op, 0x80000000); + } + gen_helper_vfp_muladds(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x09: /* FMUL */ + switch (size) { + case 1: + if (is_scalar) { + gen_helper_advsimd_mulh(tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mul2h(tcg_res, tcg_op, + tcg_idx, fpst); + } + break; + case 2: + gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x19: /* FMULX */ + switch (size) { + case 1: + if (is_scalar) { + gen_helper_advsimd_mulxh(tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mulx2h(tcg_res, tcg_op, + tcg_idx, fpst); + } + break; + case 2: + gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x0c: /* SQDMULH */ + if (size == 1) { + gen_helper_neon_qdmulh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx); + } else { + gen_helper_neon_qdmulh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx); + } + break; + case 0x0d: /* SQRDMULH */ + if (size == 1) { + gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx); + } else { + gen_helper_neon_qrdmulh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx); + } + break; + case 0x1d: /* SQRDMLAH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; + case 0x1f: /* SQRDMLSH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; + default: + g_assert_not_reached(); + } + + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); + } + + tcg_temp_free_i32(tcg_idx); + clear_vec_high(s, is_q, rd); + } else { + /* long ops: 16x16->32 or 32x32->64 */ + TCGv_i64 tcg_res[2]; + int pass; + bool satop = extract32(opcode, 0, 1); + MemOp memop = MO_32; + + if (satop || !u) { + memop |= MO_SIGN; + } + + if (size == 2) { + TCGv_i64 tcg_idx = tcg_temp_new_i64(); + + read_vec_element(s, tcg_idx, rm, index, memop); + + for (pass = 0; pass < (is_scalar ? 
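/* the scalar form produces a single result */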
1 : 2); pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_passres; + int passelt; + + if (is_scalar) { + passelt = 0; + } else { + passelt = pass + (is_q * 2); + } + + read_vec_element(s, tcg_op, rn, passelt, memop); + + tcg_res[pass] = tcg_temp_new_i64(); + + if (opcode == 0xa || opcode == 0xb) { + /* Non-accumulating ops */ + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx); + tcg_temp_free_i64(tcg_op); + + if (satop) { + /* saturating, doubling */ + gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + } + + if (opcode == 0xa || opcode == 0xb) { + continue; + } + + /* Accumulating op: handle accumulate step */ + read_vec_element(s, tcg_res[pass], rd, pass, MO_64); + + switch (opcode) { + case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + break; + case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + break; + case 0x7: /* SQDMLSL, SQDMLSL2 */ + tcg_gen_neg_i64(tcg_passres, tcg_passres); + /* fall through */ + case 0x3: /* SQDMLAL, SQDMLAL2 */ + gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(tcg_passres); + } + tcg_temp_free_i64(tcg_idx); + + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_idx = tcg_temp_new_i32(); + + assert(size == 1); + read_vec_element_i32(s, tcg_idx, rm, index, size); + + if (!is_scalar) { + /* The simplest way to handle the 16x16 indexed ops is to + * duplicate the index into both halves of the 32 bit tcg_idx + * and then use the usual Neon helpers. + */ + tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); + } + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i64 tcg_passres; + + if (is_scalar) { + read_vec_element_i32(s, tcg_op, rn, pass, size); + } else { + read_vec_element_i32(s, tcg_op, rn, + pass + (is_q * 2), MO_32); + } + + tcg_res[pass] = tcg_temp_new_i64(); + + if (opcode == 0xa || opcode == 0xb) { + /* Non-accumulating ops */ + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + if (memop & MO_SIGN) { + gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx); + } else { + gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx); + } + if (satop) { + gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + } + tcg_temp_free_i32(tcg_op); + + if (opcode == 0xa || opcode == 0xb) { + continue; + } + + /* Accumulating op: handle accumulate step */ + read_vec_element(s, tcg_res[pass], rd, pass, MO_64); + + switch (opcode) { + case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass], + tcg_passres); + break; + case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass], + tcg_passres); + break; + case 0x7: /* SQDMLSL, SQDMLSL2 */ + gen_helper_neon_negl_u32(tcg_passres, tcg_passres); + /* fall through */ + case 0x3: /* SQDMLAL, SQDMLAL2 */ + gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(tcg_passres); + } + tcg_temp_free_i32(tcg_idx); + + if (is_scalar) { + tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]); + } + } + + if (is_scalar) { + tcg_res[1] = tcg_constant_i64(0); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* Crypto AES + * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----------------+------+-----------+--------+-----+------+------+ + * | 0 1 0 0 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | + * +-----------------+------+-----------+--------+-----+------+------+ + */ +static void disas_crypto_aes(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int decrypt; + gen_helper_gvec_2 *genfn2 = NULL; + gen_helper_gvec_3 *genfn3 = NULL; + + if (!dc_isar_feature(aa64_aes, s) || size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x4: /* AESE */ + decrypt = 0; + genfn3 = gen_helper_crypto_aese; + break; + case 0x6: /* AESMC */ + decrypt = 0; + genfn2 = gen_helper_crypto_aesmc; + break; + case 0x5: /* AESD */ + decrypt = 1; + genfn3 = gen_helper_crypto_aese; + break; + case 0x7: /* AESIMC */ + decrypt = 1; + genfn2 = gen_helper_crypto_aesmc; + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + if (genfn2) { + gen_gvec_op2_ool(s, true, rd, rn, decrypt, genfn2); + } else { + gen_gvec_op3_ool(s, true, rd, rd, rn, decrypt, genfn3); + } +} + +/* Crypto three-reg SHA + * 31 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 + * +-----------------+------+---+------+---+--------+-----+------+------+ + * | 0 1 0 1 1 1 1 0 | size | 0 | Rm | 0 | opcode | 0 0 | Rn | Rd | + * +-----------------+------+---+------+---+--------+-----+------+------+ + */ +static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 
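/* insn[14:12] */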
12, 3); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + gen_helper_gvec_3 *genfn; + bool feature; + + if (size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SHA1C */ + genfn = gen_helper_crypto_sha1c; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 1: /* SHA1P */ + genfn = gen_helper_crypto_sha1p; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 2: /* SHA1M */ + genfn = gen_helper_crypto_sha1m; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 3: /* SHA1SU0 */ + genfn = gen_helper_crypto_sha1su0; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 4: /* SHA256H */ + genfn = gen_helper_crypto_sha256h; + feature = dc_isar_feature(aa64_sha256, s); + break; + case 5: /* SHA256H2 */ + genfn = gen_helper_crypto_sha256h2; + feature = dc_isar_feature(aa64_sha256, s); + break; + case 6: /* SHA256SU1 */ + genfn = gen_helper_crypto_sha256su1; + feature = dc_isar_feature(aa64_sha256, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + gen_gvec_op3_ool(s, true, rd, rn, rm, 0, genfn); +} + +/* Crypto two-reg SHA + * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----------------+------+-----------+--------+-----+------+------+ + * | 0 1 0 1 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | + * +-----------------+------+-----------+--------+-----+------+------+ + */ +static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + gen_helper_gvec_2 *genfn; + bool feature; + + if (size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SHA1H */ + feature = dc_isar_feature(aa64_sha1, s); + genfn = gen_helper_crypto_sha1h; + break; + case 1: /* SHA1SU1 */ + feature = dc_isar_feature(aa64_sha1, s); + genfn = gen_helper_crypto_sha1su1; + break; + case 2: /* SHA256SU0 */ + feature = dc_isar_feature(aa64_sha256, s); + genfn = gen_helper_crypto_sha256su0; + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + gen_gvec_op2_ool(s, true, rd, rn, 0, genfn); +} + +static void gen_rax1_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) +{ + tcg_gen_rotli_i64(d, m, 1); + tcg_gen_xor_i64(d, d, n); +} + +static void gen_rax1_vec(unsigned vece, TCGv_vec d, TCGv_vec n, TCGv_vec m) +{ + tcg_gen_rotli_vec(vece, d, m, 1); + tcg_gen_xor_vec(vece, d, d, n); +} + +void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; + static const GVecGen3 op = { + .fni8 = gen_rax1_i64, + .fniv = gen_rax1_vec, + .opt_opc = vecop_list, + .fno = gen_helper_crypto_rax1, + .vece = MO_64, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &op); +} + +/* Crypto three-reg SHA512 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+---+---+-----+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 1 | Rm | 1 | O | 0 0 | opcode | Rn | Rd | + * +-----------------------+------+---+---+-----+--------+------+------+ + */ +static void disas_crypto_three_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int o = 
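/* O == 0 selects the SHA512/RAX1 group, O == 1 the SM3/SM4 group */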
extract32(insn, 14, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + gen_helper_gvec_3 *oolfn = NULL; + GVecGen3Fn *gvecfn = NULL; + + if (o == 0) { + switch (opcode) { + case 0: /* SHA512H */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512h; + break; + case 1: /* SHA512H2 */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512h2; + break; + case 2: /* SHA512SU1 */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512su1; + break; + case 3: /* RAX1 */ + feature = dc_isar_feature(aa64_sha3, s); + gvecfn = gen_gvec_rax1; + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0: /* SM3PARTW1 */ + feature = dc_isar_feature(aa64_sm3, s); + oolfn = gen_helper_crypto_sm3partw1; + break; + case 1: /* SM3PARTW2 */ + feature = dc_isar_feature(aa64_sm3, s); + oolfn = gen_helper_crypto_sm3partw2; + break; + case 2: /* SM4EKEY */ + feature = dc_isar_feature(aa64_sm4, s); + oolfn = gen_helper_crypto_sm4ekey; + break; + default: + unallocated_encoding(s); + return; + } + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (oolfn) { + gen_gvec_op3_ool(s, true, rd, rn, rm, 0, oolfn); + } else { + gen_gvec_fn3(s, true, rd, rn, rm, gvecfn, MO_64); + } +} + +/* Crypto two-reg SHA512 + * 31 12 11 10 9 5 4 0 + * +-----------------------------------------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 | opcode | Rn | Rd | + * +-----------------------------------------+--------+------+------+ + */ +static void disas_crypto_two_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + + switch (opcode) { + case 0: /* SHA512SU0 */ + feature = dc_isar_feature(aa64_sha512, s); + break; + case 1: /* SM4E */ + feature = dc_isar_feature(aa64_sm4, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0: /* SHA512SU0 */ + gen_gvec_op2_ool(s, true, rd, rn, 0, gen_helper_crypto_sha512su0); + break; + case 1: /* SM4E */ + gen_gvec_op3_ool(s, true, rd, rd, rn, 0, gen_helper_crypto_sm4e); + break; + default: + g_assert_not_reached(); + } +} + +/* Crypto four-register + * 31 23 22 21 20 16 15 14 10 9 5 4 0 + * +-------------------+-----+------+---+------+------+------+ + * | 1 1 0 0 1 1 1 0 0 | Op0 | Rm | 0 | Ra | Rn | Rd | + * +-------------------+-----+------+---+------+------+------+ + */ +static void disas_crypto_four_reg(DisasContext *s, uint32_t insn) +{ + int op0 = extract32(insn, 21, 2); + int rm = extract32(insn, 16, 5); + int ra = extract32(insn, 10, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + + switch (op0) { + case 0: /* EOR3 */ + case 1: /* BCAX */ + feature = dc_isar_feature(aa64_sha3, s); + break; + case 2: /* SM3SS1 */ + feature = dc_isar_feature(aa64_sm3, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (op0 < 2) { + TCGv_i64 tcg_op1, tcg_op2, tcg_op3, tcg_res[2]; + int pass; + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_op3 = tcg_temp_new_i64(); + tcg_res[0] = tcg_temp_new_i64(); + 
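/* one temporary per 64-bit half of the 128-bit result */ +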
tcg_res[1] = tcg_temp_new_i64(); + + for (pass = 0; pass < 2; pass++) { + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + read_vec_element(s, tcg_op3, ra, pass, MO_64); + + if (op0 == 0) { + /* EOR3 */ + tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op3); + } else { + /* BCAX */ + tcg_gen_andc_i64(tcg_res[pass], tcg_op2, tcg_op3); + } + tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_op3); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); + } else { + TCGv_i32 tcg_op1, tcg_op2, tcg_op3, tcg_res, tcg_zero; + + tcg_op1 = tcg_temp_new_i32(); + tcg_op2 = tcg_temp_new_i32(); + tcg_op3 = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + tcg_zero = tcg_constant_i32(0); + + read_vec_element_i32(s, tcg_op1, rn, 3, MO_32); + read_vec_element_i32(s, tcg_op2, rm, 3, MO_32); + read_vec_element_i32(s, tcg_op3, ra, 3, MO_32); + + tcg_gen_rotri_i32(tcg_res, tcg_op1, 20); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op2); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op3); + tcg_gen_rotri_i32(tcg_res, tcg_res, 25); + + write_vec_element_i32(s, tcg_zero, rd, 0, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 1, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 2, MO_32); + write_vec_element_i32(s, tcg_res, rd, 3, MO_32); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); + } +} + +/* Crypto XAR + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 0 0 | Rm | imm6 | Rn | Rd | + * +-----------------------+------+--------+------+------+ + */ +static void disas_crypto_xar(DisasContext *s, uint32_t insn) +{ + int rm = extract32(insn, 16, 5); + int imm6 = extract32(insn, 10, 6); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (!dc_isar_feature(aa64_sha3, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + gen_gvec_xar(MO_64, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), imm6, 16, + vec_full_reg_size(s)); +} + +/* Crypto three-reg imm2 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+-----+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 0 | Rm | 1 0 | imm2 | opcode | Rn | Rd | + * +-----------------------+------+-----+------+--------+------+------+ + */ +static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_crypto_sm3tt1a, gen_helper_crypto_sm3tt1b, + gen_helper_crypto_sm3tt2a, gen_helper_crypto_sm3tt2b, + }; + int opcode = extract32(insn, 10, 2); + int imm2 = extract32(insn, 12, 2); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (!dc_isar_feature(aa64_sm3, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + gen_gvec_op3_ool(s, true, rd, rn, rm, imm2, fns[opcode]); +} + +/* C3.6 Data processing - SIMD, inc Crypto + * + * As the decode gets a little complex we are using a table based + * approach for this part of the decode. 
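+ * Each entry is { pattern, mask, fn }: lookup_disas_fn() scans the
+ * table in order and uses the first entry for which
+ * (insn & mask) == pattern, so where encodings overlap the more
+ * specific pattern must come first (see the simd_mod_imm /
+ * simd_shift_imm note below).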
+ */ +static const AArch64DecodeTable data_proc_simd[] = { + /* pattern , mask , fn */ + { 0x0e200400, 0x9f200400, disas_simd_three_reg_same }, + { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra }, + { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff }, + { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, + { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, + { 0x0e000400, 0x9fe08400, disas_simd_copy }, + { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */ + /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */ + { 0x0f000400, 0x9ff80400, disas_simd_mod_imm }, + { 0x0f000400, 0x9f800400, disas_simd_shift_imm }, + { 0x0e000000, 0xbf208c00, disas_simd_tb }, + { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, + { 0x2e000000, 0xbf208400, disas_simd_ext }, + { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same }, + { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra }, + { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff }, + { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, + { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise }, + { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy }, + { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */ + { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm }, + { 0x4e280800, 0xff3e0c00, disas_crypto_aes }, + { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha }, + { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha }, + { 0xce608000, 0xffe0b000, disas_crypto_three_reg_sha512 }, + { 0xcec08000, 0xfffff000, disas_crypto_two_reg_sha512 }, + { 0xce000000, 0xff808000, disas_crypto_four_reg }, + { 0xce800000, 0xffe00000, disas_crypto_xar }, + { 0xce408000, 0xffe0c000, disas_crypto_three_reg_imm2 }, + { 0x0e400400, 0x9f60c400, disas_simd_three_reg_same_fp16 }, + { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 }, + { 0x5e400400, 0xdf60c400, disas_simd_scalar_three_reg_same_fp16 }, + { 0x00000000, 0x00000000, NULL } +}; + +static void disas_data_proc_simd(DisasContext *s, uint32_t insn) +{ + /* Note that this is called with all non-FP cases from + * table C3-6 so it must UNDEF for entries not specifically + * allocated to instructions in that table. + */ + AArch64DecodeFn *fn = lookup_disas_fn(&data_proc_simd[0], insn); + if (fn) { + fn(s, insn); + } else { + unallocated_encoding(s); + } +} + +/* C3.6 Data processing - SIMD and floating point */ +static void disas_data_proc_simd_fp(DisasContext *s, uint32_t insn) +{ + if (extract32(insn, 28, 1) == 1 && extract32(insn, 30, 1) == 0) { + disas_data_proc_fp(s, insn); + } else { + /* SIMD, including crypto */ + disas_data_proc_simd(s, insn); + } +} + +/* + * Include the generated SME FA64 decoder. + */ + +#include "decode-sme-fa64.c.inc" + +static bool trans_OK(DisasContext *s, arg_OK *a) +{ + return true; +} + +static bool trans_FAIL(DisasContext *s, arg_OK *a) +{ + s->is_nonstreaming = true; + return true; +} + +/** + * is_guarded_page: + * @env: The cpu environment + * @s: The DisasContext + * + * Return true if the page is guarded. + */ +static bool is_guarded_page(CPUARMState *env, DisasContext *s) +{ + uint64_t addr = s->base.pc_first; +#ifdef CONFIG_USER_ONLY + return page_get_flags(addr) & PAGE_BTI; +#else + CPUTLBEntryFull *full; + void *host; + int mmu_idx = arm_to_core_mmu_idx(s->mmu_idx); + int flags; + + /* + * We test this immediately after reading an insn, which means + * that the TLB entry must be present and valid, and thus this + * access will never raise an exception. 
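+     * (The GP bit we read back here drives the BTI checks: on a
+     * guarded page, an indirect branch may only land on one of the
+     * few insns accepted by btype_destination_ok() below.)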
+ */ + flags = probe_access_full(env, addr, MMU_INST_FETCH, mmu_idx, + false, &host, &full, 0); + assert(!(flags & TLB_INVALID_MASK)); + + return full->guarded; +#endif +} + +/** + * btype_destination_ok: + * @insn: The instruction at the branch destination + * @bt: SCTLR_ELx.BT + * @btype: PSTATE.BTYPE, and is non-zero + * + * On a guarded page, there are a limited number of insns + * that may be present at the branch target: + * - branch target identifiers, + * - paciasp, pacibsp, + * - BRK insn + * - HLT insn + * Anything else causes a Branch Target Exception. + * + * Return true if the branch is compatible, false to raise BTITRAP. + */ +static bool btype_destination_ok(uint32_t insn, bool bt, int btype) +{ + if ((insn & 0xfffff01fu) == 0xd503201fu) { + /* HINT space */ + switch (extract32(insn, 5, 7)) { + case 0b011001: /* PACIASP */ + case 0b011011: /* PACIBSP */ + /* + * If SCTLR_ELx.BT, then PACI*SP are not compatible + * with btype == 3. Otherwise all btype are ok. + */ + return !bt || btype != 3; + case 0b100000: /* BTI */ + /* Not compatible with any btype. */ + return false; + case 0b100010: /* BTI c */ + /* Not compatible with btype == 3 */ + return btype != 3; + case 0b100100: /* BTI j */ + /* Not compatible with btype == 2 */ + return btype != 2; + case 0b100110: /* BTI jc */ + /* Compatible with any btype. */ + return true; + } + } else { + switch (insn & 0xffe0001fu) { + case 0xd4200000u: /* BRK */ + case 0xd4400000u: /* HLT */ + /* Give priority to the breakpoint exception. */ + return true; + } + } + return false; +} + +static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, + CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + ARMCPU *arm_cpu = env_archcpu(env); + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); + int bound, core_mmu_idx; + + dc->isar = &arm_cpu->isar; + dc->condjmp = 0; + dc->pc_save = dc->base.pc_first; + dc->aarch64 = true; + dc->thumb = false; + dc->sctlr_b = 0; + dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; + dc->condexec_mask = 0; + dc->condexec_cond = 0; + core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); + dc->mmu_idx = core_to_aa64_mmu_idx(core_mmu_idx); + dc->tbii = EX_TBFLAG_A64(tb_flags, TBII); + dc->tbid = EX_TBFLAG_A64(tb_flags, TBID); + dc->tcma = EX_TBFLAG_A64(tb_flags, TCMA); + dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); +#if !defined(CONFIG_USER_ONLY) + dc->user = (dc->current_el == 0); +#endif + dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); + dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); + dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); + dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); + dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); + dc->fgt_eret = EX_TBFLAG_A64(tb_flags, FGT_ERET); + dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL); + dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL); + dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16; + dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16; + dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE); + dc->bt = EX_TBFLAG_A64(tb_flags, BT); + dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE); + dc->unpriv = EX_TBFLAG_A64(tb_flags, UNPRIV); + dc->ata = EX_TBFLAG_A64(tb_flags, ATA); + dc->mte_active[0] = EX_TBFLAG_A64(tb_flags, MTE_ACTIVE); + dc->mte_active[1] = EX_TBFLAG_A64(tb_flags, MTE0_ACTIVE); + dc->pstate_sm = EX_TBFLAG_A64(tb_flags, PSTATE_SM); + dc->pstate_za = EX_TBFLAG_A64(tb_flags, PSTATE_ZA); + dc->sme_trap_nonstreaming = EX_TBFLAG_A64(tb_flags, SME_TRAP_NONSTREAMING); + dc->vec_len = 0; + dc->vec_stride = 0; + dc->cp_regs = arm_cpu->cp_regs; + dc->features = env->features; + dc->dcz_blocksize = arm_cpu->dcz_blocksize; + +#ifdef CONFIG_USER_ONLY + /* In sve_probe_page, we assume TBI is enabled. */ + tcg_debug_assert(dc->tbid & 1); +#endif + + /* Single step state. The code-generation logic here is: + * SS_ACTIVE == 0: + * generate code with no special handling for single-stepping (except + * that anything that can make us go to SS_ACTIVE == 1 must end the TB; + * this happens anyway because those changes are all system register or + * PSTATE writes). + * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) + * emit code for one insn + * emit code to clear PSTATE.SS + * emit code to generate software step exception for completed step + * end TB (as usual for having generated an exception) + * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) + * emit code to generate a software step exception + * end the TB + */ + dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); + dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); + dc->is_ldex = false; + + /* Bound the number of insns to execute to those left on the page. */ + bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; + + /* If architectural single step active, limit to 1. 
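+     * The software-step exception is emitted at tb_stop, so the TB
+     * must not contain more than the single insn being stepped.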
*/ + if (dc->ss_active) { + bound = 1; + } + dc->base.max_insns = MIN(dc->base.max_insns, bound); + + init_tmp_a64_array(dc); +} + +static void aarch64_tr_tb_start(DisasContextBase *db, CPUState *cpu) +{ +} + +static void aarch64_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + target_ulong pc_arg = dc->base.pc_next; + + if (TARGET_TB_PCREL) { + pc_arg &= ~TARGET_PAGE_MASK; + } + tcg_gen_insn_start(pc_arg, 0, 0); + dc->insn_start = tcg_last_op(); +} + +static void aarch64_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *s = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint64_t pc = s->base.pc_next; + uint32_t insn; + + /* Singlestep exceptions have the highest priority. */ + if (s->ss_active && !s->pstate_ss) { + /* Singlestep state is Active-pending. + * If we're in this state at the start of a TB then either + * a) we just took an exception to an EL which is being debugged + * and this is the first insn in the exception handler + * b) debug exceptions were masked and we just unmasked them + * without changing EL (eg by clearing PSTATE.D) + * In either case we're going to take a swstep exception in the + * "did not step an insn" case, and so the syndrome ISV and EX + * bits should be zero. + */ + assert(s->base.num_insns == 1); + gen_swstep_exception(s, 0, 0); + s->base.is_jmp = DISAS_NORETURN; + s->base.pc_next = pc + 4; + return; + } + + if (pc & 3) { + /* + * PC alignment fault. This has priority over the instruction abort + * that we would receive from a translation fault via arm_ldl_code. + * This should only be possible after an indirect branch, at the + * start of the TB. + */ + assert(s->base.num_insns == 1); + gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); + s->base.is_jmp = DISAS_NORETURN; + s->base.pc_next = QEMU_ALIGN_UP(pc, 4); + return; + } + + s->pc_curr = pc; + insn = arm_ldl_code(env, &s->base, pc, s->sctlr_b); + s->insn = insn; + s->base.pc_next = pc + 4; + + s->fp_access_checked = false; + s->sve_access_checked = false; + + if (s->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (dc_isar_feature(aa64_bti, s)) { + if (s->base.num_insns == 1) { + /* + * At the first insn of the TB, compute s->guarded_page. + * We delayed computing this until successfully reading + * the first insn of the TB, above. This (mostly) ensures + * that the softmmu tlb entry has been populated, and the + * page table GP bit is available. + * + * Note that we need to compute this even if btype == 0, + * because this value is used for BR instructions later + * where ENV is not available. + */ + s->guarded_page = is_guarded_page(env, s); + + /* First insn can have btype set to non-zero. */ + tcg_debug_assert(s->btype >= 0); + + /* + * Note that the Branch Target Exception has fairly high + * priority -- below debugging exceptions but above most + * everything else. This allows us to handle this now + * instead of waiting until the insn is otherwise decoded. + */ + if (s->btype != 0 + && s->guarded_page + && !btype_destination_ok(insn, s->bt, s->btype)) { + gen_exception_insn(s, 0, EXCP_UDEF, syn_btitrap(s->btype)); + return; + } + } else { + /* Not the first insn: btype must be 0. 
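+             * (reset_btype() at the end of this function clears it
+             * after any insn that does not itself set btype)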
*/ + tcg_debug_assert(s->btype == 0); + } + } + + s->is_nonstreaming = false; + if (s->sme_trap_nonstreaming) { + disas_sme_fa64(s, insn); + } + + switch (extract32(insn, 25, 4)) { + case 0x0: + if (!extract32(insn, 31, 1) || !disas_sme(s, insn)) { + unallocated_encoding(s); + } + break; + case 0x1: case 0x3: /* UNALLOCATED */ + unallocated_encoding(s); + break; + case 0x2: + if (!disas_sve(s, insn)) { + unallocated_encoding(s); + } + break; + case 0x8: case 0x9: /* Data processing - immediate */ + disas_data_proc_imm(s, insn); + break; + case 0xa: case 0xb: /* Branch, exception generation and system insns */ + disas_b_exc_sys(s, insn); + break; + case 0x4: + case 0x6: + case 0xc: + case 0xe: /* Loads and stores */ + disas_ldst(s, insn); + break; + case 0x5: + case 0xd: /* Data processing - register */ + disas_data_proc_reg(s, insn); + break; + case 0x7: + case 0xf: /* Data processing - SIMD and floating point */ + disas_data_proc_simd_fp(s, insn); + break; + default: + assert(FALSE); /* all 15 cases should be handled above */ + break; + } + + /* if we allocated any temporaries, free them here */ + free_tmp_a64(s); + + /* + * After execution of most insns, btype is reset to 0. + * Note that we set btype == -1 when the insn sets btype. + */ + if (s->btype > 0 && s->base.is_jmp != DISAS_NORETURN) { + reset_btype(s); + } + + translator_loop_temp_check(&s->base); +} + +static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + if (unlikely(dc->ss_active)) { + /* Note that this means single stepping WFI doesn't halt the CPU. + * For conditional branch insns this is harmless unreachable code as + * gen_goto_tb() has already handled emitting the debug exception + * (and thus a tb-jump is not possible when singlestepping). + */ + switch (dc->base.is_jmp) { + default: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_EXIT: + case DISAS_JUMP: + gen_step_complete_exception(dc); + break; + case DISAS_NORETURN: + break; + } + } else { + switch (dc->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + gen_goto_tb(dc, 1, 4); + break; + default: + case DISAS_UPDATE_EXIT: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_EXIT: + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_UPDATE_NOCHAIN: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_JUMP: + tcg_gen_lookup_and_goto_ptr(); + break; + case DISAS_NORETURN: + case DISAS_SWI: + break; + case DISAS_WFE: + gen_a64_update_pc(dc, 4); + gen_helper_wfe(cpu_env); + break; + case DISAS_YIELD: + gen_a64_update_pc(dc, 4); + gen_helper_yield(cpu_env); + break; + case DISAS_WFI: + /* + * This is a special case because we don't want to just halt + * the CPU if trying to debug across a WFI. + */ + gen_a64_update_pc(dc, 4); + gen_helper_wfi(cpu_env, tcg_constant_i32(4)); + /* + * The helper doesn't necessarily throw an exception, but we + * must go back to the main loop to check for interrupts anyway. 
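+             * Hence the tcg_gen_exit_tb(NULL, 0) below rather than a
+             * chained goto_tb: control must return to the main loop,
+             * not go straight to another TB.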
+ */ + tcg_gen_exit_tb(NULL, 0); + break; + } + } +} + +static void aarch64_tr_disas_log(const DisasContextBase *dcbase, + CPUState *cpu, FILE *logfile) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); + target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); +} + +const TranslatorOps aarch64_translator_ops = { + .init_disas_context = aarch64_tr_init_disas_context, + .tb_start = aarch64_tr_tb_start, + .insn_start = aarch64_tr_insn_start, + .translate_insn = aarch64_tr_translate_insn, + .tb_stop = aarch64_tr_tb_stop, + .disas_log = aarch64_tr_disas_log, +}; diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h new file mode 100644 index 0000000..ad3762d --- /dev/null +++ b/target/arm/tcg/translate-a64.h @@ -0,0 +1,201 @@ +/* + * AArch64 translation, common definitions. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef TARGET_ARM_TRANSLATE_A64_H +#define TARGET_ARM_TRANSLATE_A64_H + +TCGv_i64 new_tmp_a64(DisasContext *s); +TCGv_i64 new_tmp_a64_local(DisasContext *s); +TCGv_i64 new_tmp_a64_zero(DisasContext *s); +TCGv_i64 cpu_reg(DisasContext *s, int reg); +TCGv_i64 cpu_reg_sp(DisasContext *s, int reg); +TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf); +TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf); +void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v); +bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, + unsigned int imms, unsigned int immr); +bool sve_access_check(DisasContext *s); +bool sme_enabled_check(DisasContext *s); +bool sme_enabled_check_with_svcr(DisasContext *s, unsigned); + +/* This function corresponds to CheckStreamingSVEEnabled. */ +static inline bool sme_sm_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK); +} + +/* This function corresponds to CheckSMEAndZAEnabled. */ +static inline bool sme_za_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_ZA_MASK); +} + +/* Note that this function corresponds to CheckStreamingSVEAndZAEnabled. */ +static inline bool sme_smza_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK | R_SVCR_ZA_MASK); +} + +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr); +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size); +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int size); + +/* We should have at some point before trying to access an FP register + * done the necessary access check, so assert that + * (a) we did the check and + * (b) we didn't then just plough ahead anyway if it failed. + * Print the instruction pattern in the abort message so we can figure + * out what we need to fix if a user encounters this problem in the wild. 
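+ * The whole check is compiled in only under CONFIG_DEBUG_TCG, so it
+ * costs nothing in normal builds.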
+ */
+static inline void assert_fp_access_checked(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+    if (unlikely(!s->fp_access_checked || s->fp_excp_el)) {
+        fprintf(stderr, "target-arm: FP access check missing for "
+                "instruction 0x%08x\n", s->insn);
+        abort();
+    }
+#endif
+}
+
+/* Return the offset into CPUARMState of an element of specified
+ * size, 'element' places in from the least significant end of
+ * the FP/vector register Qn.
+ */
+static inline int vec_reg_offset(DisasContext *s, int regno,
+                                 int element, MemOp size)
+{
+    int element_size = 1 << size;
+    int offs = element * element_size;
+#if HOST_BIG_ENDIAN
+    /* This is complicated slightly because vfp.zregs[n].d[0] is
+     * still the lowest and vfp.zregs[n].d[15] the highest of the
+     * 256 byte vector, even on big endian systems.
+     *
+     * Calculate the offset assuming fully little-endian,
+     * then XOR to account for the order of the 8-byte units.
+     *
+     * For 16 byte elements, the two 8 byte halves will not form a
+     * host int128 if the host is bigendian, since they're in the
+     * wrong order. However the only 16 byte operation we have is
+     * a move, so we can ignore this for the moment. More complicated
+     * operations will have to special case loading and storing from
+     * the zregs array.
+     */
+    if (element_size < 8) {
+        offs ^= 8 - element_size;
+    }
+#endif
+    offs += offsetof(CPUARMState, vfp.zregs[regno]);
+    assert_fp_access_checked(s);
+    return offs;
+}
+
+/* Return the offset into CPUARMState of the "whole" vector register Qn. */
+static inline int vec_full_reg_offset(DisasContext *s, int regno)
+{
+    assert_fp_access_checked(s);
+    return offsetof(CPUARMState, vfp.zregs[regno]);
+}
+
+/* Return a newly allocated pointer to the vector register. */
+static inline TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno)
+{
+    TCGv_ptr ret = tcg_temp_new_ptr();
+    tcg_gen_addi_ptr(ret, cpu_env, vec_full_reg_offset(s, regno));
+    return ret;
+}
+
+/* Return the byte size of the "whole" vector register, VL / 8. */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+    return s->vl;
+}
+
+/* Return the byte size of the streaming vector register, SVL / 8. */
+static inline int streaming_vec_reg_size(DisasContext *s)
+{
+    return s->svl;
+}
+
+/*
+ * Return the offset into CPUARMState of the predicate vector register Pn.
+ * Note for this purpose, FFR is P16.
+ */
+static inline int pred_full_reg_offset(DisasContext *s, int regno)
+{
+    return offsetof(CPUARMState, vfp.pregs[regno]);
+}
+
+/* Return the byte size of the whole predicate register, VL / 64. */
+static inline int pred_full_reg_size(DisasContext *s)
+{
+    return s->vl >> 3;
+}
+
+/* Return the byte size of the streaming predicate register, SVL / 64. */
+static inline int streaming_pred_reg_size(DisasContext *s)
+{
+    return s->svl >> 3;
+}
+
+/*
+ * Round up the size of a register to a size allowed by
+ * the tcg vector infrastructure. Any operation which uses this
+ * size may assume that the bits above pred_full_reg_size are zero,
+ * and must leave them the same way.
+ *
+ * Note that this is not needed for the vector registers as they
+ * are always properly sized for tcg vectors.
+ */
+static inline int size_for_gvec(int size)
+{
+    if (size <= 8) {
+        return 8;
+    } else {
+        return QEMU_ALIGN_UP(size, 16);
+    }
+}
+
+static inline int pred_gvec_reg_size(DisasContext *s)
+{
+    return size_for_gvec(pred_full_reg_size(s));
+}
+
+/* Return a newly allocated pointer to the predicate register.
*/ +static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, pred_full_reg_offset(s, regno)); + return ret; +} + +bool disas_sve(DisasContext *, uint32_t); +bool disas_sme(DisasContext *, uint32_t); + +void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, int64_t shift, + uint32_t opr_sz, uint32_t max_sz); + +void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); +void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); + +#endif /* TARGET_ARM_TRANSLATE_A64_H */ diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c new file mode 100644 index 0000000..5df7d46 --- /dev/null +++ b/target/arm/tcg/translate-m-nocp.c @@ -0,0 +1,788 @@ +/* + * ARM translation: M-profile NOCP special-case instructions + * + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "translate.h" +#include "translate-a32.h" + +#include "decode-m-nocp.c.inc" + +/* + * Decode VLLDM and VLSTM are nonstandard because: + * * if there is no FPU then these insns must NOP in + * Secure state and UNDEF in Nonsecure state + * * if there is an FPU then these insns do not have + * the usual behaviour that vfp_access_check() provides of + * being controlled by CPACR/NSACR enable bits or the + * lazy-stacking logic. + */ +static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) +{ + TCGv_i32 fptr; + + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->op) { + /* + * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not + * to take the IMPDEF option to make memory accesses to the stack + * slots that correspond to the D16-D31 registers (discarding + * read data and writing UNKNOWN values), so for us the T2 + * encoding behaves identically to the T1 encoding. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + } else { + /* + * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs. + * This is currently architecturally impossible, but we add the + * check to stay in line with the pseudocode. Note that we must + * emit code for the UNDEF so it takes precedence over the NOCP. + */ + if (dc_isar_feature(aa32_simd_r32, s)) { + unallocated_encoding(s); + return true; + } + } + + /* + * If not secure, UNDEF. We must emit code for this + * rather than returning false so that this takes + * precedence over the m-nocp.decode NOCP fallback. + */ + if (!s->v8m_secure) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + /* If no fpu, NOP. 
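+     * (the architected behaviour for Secure state without an FPU,
+     * as noted at the top of this function)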
*/ + if (!dc_isar_feature(aa32_vfp, s)) { + clear_eci_state(s); + return true; + } + + fptr = load_reg(s, a->rn); + if (a->l) { + gen_helper_v7m_vlldm(cpu_env, fptr); + } else { + gen_helper_v7m_vlstm(cpu_env, fptr); + } + tcg_temp_free_i32(fptr); + + clear_eci_state(s); + + /* + * End the TB, because we have updated FP control bits, + * and possibly VPR or LTPSIZE. + */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + return true; +} + +static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) +{ + int btmreg, topreg; + TCGv_i64 zero; + TCGv_i32 aspen, sfpa; + + if (!dc_isar_feature(aa32_m_sec_state, s)) { + /* Before v8.1M, fall through in decode to NOCP check */ + return false; + } + + /* Explicitly UNDEF because this takes precedence over NOCP */ + if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + if (!dc_isar_feature(aa32_vfp_simd, s)) { + /* NOP if we have neither FP nor MVE */ + clear_eci_state(s); + return true; + } + + /* + * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no + * active floating point context so we must NOP (without doing + * any lazy state preservation or the NOCP check). + */ + aspen = load_cpu_field(v7m.fpccr[M_REG_S]); + sfpa = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_or_i32(sfpa, sfpa, aspen); + arm_gen_condlabel(s); + tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel.label); + + if (s->fp_excp_el != 0) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + + topreg = a->vd + a->imm - 1; + btmreg = a->vd; + + /* Convert to Sreg numbers if the insn specified in Dregs */ + if (a->size == 3) { + topreg = topreg * 2 + 1; + btmreg *= 2; + } + + if (topreg > 63 || (topreg > 31 && !(topreg & 1))) { + /* UNPREDICTABLE: we choose to undef */ + unallocated_encoding(s); + return true; + } + + /* Silently ignore requests to clear D16-D31 if they don't exist */ + if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) { + topreg = 31; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Zero the Sregs from btmreg to topreg inclusive. */ + zero = tcg_constant_i64(0); + if (btmreg & 1) { + write_neon_element64(zero, btmreg >> 1, 1, MO_32); + btmreg++; + } + for (; btmreg + 1 <= topreg; btmreg += 2) { + write_neon_element64(zero, btmreg >> 1, 0, MO_64); + } + if (btmreg == topreg) { + write_neon_element64(zero, btmreg >> 1, 0, MO_32); + btmreg++; + } + assert(btmreg == topreg + 1); + if (dc_isar_feature(aa32_mve, s)) { + store_cpu_field(tcg_constant_i32(0), v7m.vpr); + } + + clear_eci_state(s); + return true; +} + +/* + * M-profile provides two different sets of instructions that can + * access floating point system registers: VMSR/VMRS (which move + * to/from a general purpose register) and VLDR/VSTR sysreg (which + * move directly to/from memory). In some cases there are also side + * effects which must happen after any write to memory (which could + * cause an exception). So we implement the common logic for the + * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), + * which take pointers to callback functions which will perform the + * actual "read/write general purpose register" and "read/write + * memory" operations. 
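+ * The do_access flag on those callbacks covers the cases (such as an
+ * fpInactive FPCXT_NS access) where the register access itself must
+ * be skipped but side-effects like base register writeback must
+ * still be performed.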
+ */
+
+/*
+ * Emit code to store the sysreg to its final destination; frees the
+ * TCG temp 'value' it is passed. do_access is true to do the store,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value,
+                               bool do_access);
+/*
+ * Emit code to load the value to be copied to the sysreg; returns
+ * a new TCG temporary. do_access is true to do the load,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque,
+                                  bool do_access);
+
+/* Common decode/access checks for fp sysreg read/write */
+typedef enum FPSysRegCheckResult {
+    FPSysRegCheckFailed, /* caller should return false */
+    FPSysRegCheckDone, /* caller should return true */
+    FPSysRegCheckContinue, /* caller should continue generating code */
+} FPSysRegCheckResult;
+
+static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno)
+{
+    if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+        return FPSysRegCheckFailed;
+    }
+
+    switch (regno) {
+    case ARM_VFP_FPSCR:
+    case QEMU_VFP_FPSCR_NZCV:
+        break;
+    case ARM_VFP_FPSCR_NZCVQC:
+        if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+            return FPSysRegCheckFailed;
+        }
+        break;
+    case ARM_VFP_FPCXT_S:
+    case ARM_VFP_FPCXT_NS:
+        if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+            return FPSysRegCheckFailed;
+        }
+        if (!s->v8m_secure) {
+            return FPSysRegCheckFailed;
+        }
+        break;
+    case ARM_VFP_VPR:
+    case ARM_VFP_P0:
+        if (!dc_isar_feature(aa32_mve, s)) {
+            return FPSysRegCheckFailed;
+        }
+        break;
+    default:
+        return FPSysRegCheckFailed;
+    }
+
+    /*
+     * FPCXT_NS is a special case: it has specific handling for
+     * "current FP state is inactive", and must do the PreserveFPState()
+     * but not the usual full set of actions done by ExecuteFPCheck().
+     * So we don't call vfp_access_check() and the callers must handle this.
+     */
+    if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) {
+        return FPSysRegCheckDone;
+    }
+    return FPSysRegCheckContinue;
+}
+
+static void gen_branch_fpInactive(DisasContext *s, TCGCond cond,
+                                  TCGLabel *label)
+{
+    /*
+     * FPCXT_NS is a special case: it has specific handling for
+     * "current FP state is inactive", and must do the PreserveFPState()
+     * but not the usual full set of actions done by ExecuteFPCheck().
+     * We don't have a TB flag that matches the fpInactive check, so we
+     * do it at runtime as we don't expect FPCXT_NS accesses to be frequent.
+ * + * Emit code that checks fpInactive and does a conditional + * branch to label based on it: + * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) + * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) + */ + assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); + + /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ + TCGv_i32 aspen, fpca; + aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); + fpca = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); + tcg_gen_or_i32(fpca, fpca, aspen); + tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); + tcg_temp_free_i32(aspen); + tcg_temp_free_i32(fpca); +} + +static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, + fp_sysreg_loadfn *loadfn, + void *opaque) +{ + /* Do a write to an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = loadfn(s, opaque, true); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPSCR_NZCVQC: + { + TCGv_i32 fpscr; + tmp = loadfn(s, opaque, true); + if (dc_isar_feature(aa32_mve, s)) { + /* QC is only present for MVE; otherwise RES0 */ + TCGv_i32 qc = tcg_temp_new_i32(); + tcg_gen_andi_i32(qc, tmp, FPCR_QC); + /* + * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; + * here writing the same value into all elements is simplest. + */ + tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), + 16, 16, qc); + } + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_or_i32(fpscr, fpscr, tmp); + store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); + tcg_temp_free_i32(tmp); + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGLabel *lab_active = gen_new_label(); + + lab_end = gen_new_label(); + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* + * fpInactive case: write is a NOP, so only do side effects + * like register writeback before we branch to end + */ + loadfn(s, opaque, false); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS writes + * behave the same as FPCXT_S writes. + */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn_el()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + } + /* fall through */ + case ARM_VFP_FPCXT_S: + { + TCGv_i32 sfpa, control; + /* + * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes + * bits [27:0] from value and zeroes bits [31:28]. 
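+         * CONTROL.SFPA is set from bit [31] of the value.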
+ */ + tmp = loadfn(s, opaque, true); + sfpa = tcg_temp_new_i32(); + tcg_gen_shri_i32(sfpa, tmp, 31); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_deposit_i32(control, control, sfpa, + R_V7M_CONTROL_SFPA_SHIFT, 1); + store_cpu_field(control, v7m.control[M_REG_S]); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(sfpa); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + loadfn(s, opaque, false); + break; + } + tmp = loadfn(s, opaque, true); + store_cpu_field(tmp, v7m.vpr); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + break; + case ARM_VFP_P0: + { + TCGv_i32 vpr; + tmp = loadfn(s, opaque, true); + vpr = load_cpu_field(v7m.vpr); + tcg_gen_deposit_i32(vpr, vpr, tmp, + R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + store_cpu_field(vpr, v7m.vpr); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + tcg_temp_free_i32(tmp); + break; + } + default: + g_assert_not_reached(); + } + if (lab_end) { + gen_set_label(lab_end); + } + return true; +} + +static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, + fp_sysreg_storefn *storefn, + void *opaque) +{ + /* Do a read from an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + bool lookup_tb = false; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { + /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ + regno = QEMU_VFP_FPSCR_NZCV; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPSCR_NZCVQC: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + storefn(s, opaque, tmp, true); + break; + case QEMU_VFP_FPSCR_NZCV: + /* + * Read just NZCV; this is a special case to avoid the + * helper call for the "VMRS to CPSR.NZCV" insn. + */ + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPCXT_S: + { + TCGv_i32 control, sfpa, fpscr; + /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(sfpa); + /* + * Store result before updating FPSCR etc, in case + * it is a memory write which causes an exception. + */ + storefn(s, opaque, tmp, true); + /* + * Now we must reset FPSCR from FPDSCR_NS, and clear + * CONTROL.SFPA; so we'll end the TB here. 
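+         * (lookup_tb is set below; the gen_lookup_tb() call at the
+         * end of this function is what actually ends the TB)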
+ */ + tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); + store_cpu_field(control, v7m.control[M_REG_S]); + fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + lookup_tb = true; + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGv_i32 control, sfpa, fpscr, fpdscr; + TCGLabel *lab_active = gen_new_label(); + + lookup_tb = true; + + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* fpInactive case: reads as FPDSCR_NS */ + TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); + storefn(s, opaque, tmp, true); + lab_end = gen_new_label(); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS + * reads the same as FPCXT_S. + */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn_el()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + fpscr = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(fpscr, cpu_env); + tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(control); + /* Store result before updating FPSCR, in case it faults */ + storefn(s, opaque, tmp, true); + /* If SFPA is zero then set FPSCR from FPDSCR_NS */ + fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, tcg_constant_i32(0), + fpdscr, fpscr); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(sfpa); + tcg_temp_free_i32(fpdscr); + tcg_temp_free_i32(fpscr); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + storefn(s, opaque, NULL, false); + break; + } + tmp = load_cpu_field(v7m.vpr); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_P0: + tmp = load_cpu_field(v7m.vpr); + tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + storefn(s, opaque, tmp, true); + break; + default: + g_assert_not_reached(); + } + + if (lab_end) { + gen_set_label(lab_end); + } + if (lookup_tb) { + gen_lookup_tb(s); + } + return true; +} + +static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return; + } + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR */ + gen_set_nzcv(value); + tcg_temp_free_i32(value); + } else { + store_reg(s, a->rt, value); + } +} + +static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return NULL; + } + return load_reg(s, a->rt); +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + /* + * Accesses to R15 are UNPREDICTABLE; we choose to undef. + * FPSCR -> r15 is a special case which writes to the PSR flags; + * set a->reg to a special value to tell gen_M_fp_sysreg_read() + * we only care about the top 4 bits of FPSCR there. 
+ */ + if (a->rt == 15) { + if (a->l && a->reg == ARM_VFP_FPSCR) { + a->reg = QEMU_VFP_FPSCR_NZCV; + } else { + return false; + } + } + + if (a->l) { + /* VMRS, move FP system register to gp register */ + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); + } else { + /* VMSR, move gp register to FP system register */ + return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); + } +} + +static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + gen_aa32_st_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + tcg_temp_free_i32(value); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + TCGv_i32 value = NULL; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return NULL; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + value = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + return value; +} + +static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); +} + +static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); +} + +static bool trans_NOCP(DisasContext *s, arg_nocp *a) +{ + /* + * Handle M-profile early check for disabled coprocessor: + * all we need to do here is emit the NOCP exception if + * the coprocessor is disabled. Otherwise we return false + * and the real VFP/etc decode will handle the insn. 
+ */ + assert(arm_dc_feature(s, ARM_FEATURE_M)); + + if (a->cp == 11) { + a->cp = 10; + } + if (arm_dc_feature(s, ARM_FEATURE_V8_1M) && + (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) { + /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */ + a->cp = 10; + } + + if (a->cp != 10) { + gen_exception_insn(s, 0, EXCP_NOCP, syn_uncategorized()); + return true; + } + + if (s->fp_excp_el != 0) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + + return false; +} + +static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a) +{ + /* This range needs a coprocessor check for v8.1M and later only */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + return trans_NOCP(s, a); +} diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c new file mode 100644 index 0000000..db7ea3f --- /dev/null +++ b/target/arm/tcg/translate-mve.c @@ -0,0 +1,2310 @@ +/* + * ARM translation: M-profile MVE instructions + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +static inline int vidup_imm(DisasContext *s, int x) +{ + return 1 << x; +} + +/* Include the generated decoder */ +#include "decode-mve.c.inc" + +typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLdStSGFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLdStIlFn(TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenTwoOpShiftFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLongDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); +typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64); +typedef void MVEGenVIDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void MVEGenVIWDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); +typedef void MVEGenCmpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenScalarCmpFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenVABAVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenDualAccOpFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenVCVTRmodeFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); + +/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ +static inline long mve_qreg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg].d[0]); +} + +static TCGv_ptr mve_qreg_ptr(unsigned reg) +{ + TCGv_ptr ret = 
tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, mve_qreg_offset(reg)); + return ret; +} + +static bool mve_no_predication(DisasContext *s) +{ + /* + * Return true if we are executing the entire MVE instruction + * with no predication or partial-execution, and so we can safely + * use an inline TCG vector implementation. + */ + return s->eci == 0 && s->mve_no_pred; +} + +static bool mve_check_qreg_bank(DisasContext *s, int qmask) +{ + /* + * Check whether Qregs are in range. For v8.1M only Q0..Q7 + * are supported, see VFPSmallRegisterBank(). + */ + return qmask < 8; +} + +bool mve_eci_check(DisasContext *s) +{ + /* + * This is a beatwise insn: check that ECI is valid (not a + * reserved value) and note that we are handling it. + * Return true if OK, false if we generated an exception. + */ + s->eci_handled = true; + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + /* Reserved value: INVSTATE UsageFault */ + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + return false; + } +} + +void mve_update_eci(DisasContext *s) +{ + /* + * The helper function will always update the CPUState field, + * so we only need to update the DisasContext field. + */ + if (s->eci) { + s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE; + } +} + +void mve_update_and_store_eci(DisasContext *s) +{ + /* + * For insns which don't call a helper function that will call + * mve_advance_vpt(), this version updates s->eci and also stores + * it out to the CPUState field. + */ + if (s->eci) { + mve_update_eci(s); + store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits); + } +} + +static bool mve_skip_first_beat(DisasContext *s) +{ + /* Return true if PSR.ECI says we must skip the first beat of this insn */ + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + g_assert_not_reached(); + } +} + +static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn, + unsigned msize) +{ + TCGv_i32 addr; + uint32_t offset; + TCGv_ptr qreg; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + if (a->rn == 15 || (a->rn == 13 && a->w)) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << msize; + if (!a->a) { + offset = -offset; + } + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + qreg = mve_qreg_ptr(a->qd); + fn(cpu_env, qreg, addr); + tcg_temp_free_ptr(qreg); + + /* + * Writeback always happens after the last beat of the insn, + * regardless of predication + */ + if (a->w) { + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + mve_update_eci(s); + return true; +} + +static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) +{ + static MVEGenLdStFn * const ldstfns[4][2] = { + { gen_helper_mve_vstrb, gen_helper_mve_vldrb }, + { gen_helper_mve_vstrh, gen_helper_mve_vldrh }, + { gen_helper_mve_vstrw, gen_helper_mve_vldrw }, + { NULL, NULL } + }; + return do_ldst(s, a, ldstfns[a->size][a->l], a->size); +} + +#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST, MSIZE) \ + static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \ + { \ + static MVEGenLdStFn * const ldstfns[2][2] = { \ + { gen_helper_mve_##ST, gen_helper_mve_##SLD }, 
\ + { NULL, gen_helper_mve_##ULD }, \ + }; \ + return do_ldst(s, a, ldstfns[a->u][a->l], MSIZE); \ + } + +DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h, MO_8) +DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w, MO_8) +DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w, MO_16) + +static bool do_ldst_sg(DisasContext *s, arg_vldst_sg *a, MVEGenLdStSGFn fn) +{ + TCGv_i32 addr; + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn || a->rn == 15) { + /* Rn case is UNPREDICTABLE */ + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + addr = load_reg(s, a->rn); + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, addr); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + tcg_temp_free_i32(addr); + mve_update_eci(s); + return true; +} + +/* + * The naming scheme here is "vldrb_sg_sh == in-memory byte loads + * signextended to halfword elements in register". _os_ indicates that + * the offsets in Qm should be scaled by the element size. + */ +/* This macro is just to make the arrays more compact in these functions */ +#define F(N) gen_helper_mve_##N + +/* VLDRB/VSTRB (ie msize 1) with OS=1 is UNPREDICTABLE; we UNDEF */ +static bool trans_VLDR_S_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { NULL, F(vldrb_sg_sh), F(vldrb_sg_sw), NULL }, + { NULL, NULL, F(vldrh_sg_sw), NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, F(vldrh_sg_os_sw), NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } + } + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +static bool trans_VLDR_U_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { F(vldrb_sg_ub), F(vldrb_sg_uh), F(vldrb_sg_uw), NULL }, + { NULL, F(vldrh_sg_uh), F(vldrh_sg_uw), NULL }, + { NULL, NULL, F(vldrw_sg_uw), NULL }, + { NULL, NULL, NULL, F(vldrd_sg_ud) } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, F(vldrh_sg_os_uh), F(vldrh_sg_os_uw), NULL }, + { NULL, NULL, F(vldrw_sg_os_uw), NULL }, + { NULL, NULL, NULL, F(vldrd_sg_os_ud) } + } + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +static bool trans_VSTR_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { F(vstrb_sg_ub), F(vstrb_sg_uh), F(vstrb_sg_uw), NULL }, + { NULL, F(vstrh_sg_uh), F(vstrh_sg_uw), NULL }, + { NULL, NULL, F(vstrw_sg_uw), NULL }, + { NULL, NULL, NULL, F(vstrd_sg_ud) } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, F(vstrh_sg_os_uh), F(vstrh_sg_os_uw), NULL }, + { NULL, NULL, F(vstrw_sg_os_uw), NULL }, + { NULL, NULL, NULL, F(vstrd_sg_os_ud) } + } + }; + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +#undef F + +static bool do_ldst_sg_imm(DisasContext *s, arg_vldst_sg_imm *a, + MVEGenLdStSGFn *fn, unsigned msize) +{ + uint32_t offset; + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << msize; + if (!a->a) { + offset = -offset; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(offset)); + tcg_temp_free_ptr(qd); + 
tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +static bool trans_VLDRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vldrw_sg_uw, + gen_helper_mve_vldrw_sg_wb_uw, + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg_imm(s, a, fns[a->w], MO_32); +} + +static bool trans_VLDRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vldrd_sg_ud, + gen_helper_mve_vldrd_sg_wb_ud, + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg_imm(s, a, fns[a->w], MO_64); +} + +static bool trans_VSTRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vstrw_sg_uw, + gen_helper_mve_vstrw_sg_wb_uw, + }; + return do_ldst_sg_imm(s, a, fns[a->w], MO_32); +} + +static bool trans_VSTRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vstrd_sg_ud, + gen_helper_mve_vstrd_sg_wb_ud, + }; + return do_ldst_sg_imm(s, a, fns[a->w], MO_64); +} + +static bool do_vldst_il(DisasContext *s, arg_vldst_il *a, MVEGenLdStIlFn *fn, + int addrinc) +{ + TCGv_i32 rn; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn || (a->rn == 13 && a->w) || a->rn == 15) { + /* Variously UNPREDICTABLE or UNDEF or related-encoding */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + rn = load_reg(s, a->rn); + /* + * We pass the index of Qd, not a pointer, because the helper must + * access multiple Q registers starting at Qd and working up. + */ + fn(cpu_env, tcg_constant_i32(a->qd), rn); + + if (a->w) { + tcg_gen_addi_i32(rn, rn, addrinc); + store_reg(s, a->rn, rn); + } else { + tcg_temp_free_i32(rn); + } + mve_update_and_store_eci(s); + return true; +} + +/* This macro is just to make the arrays more compact in these functions */ +#define F(N) gen_helper_mve_##N + +static bool trans_VLD2(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vld20b), F(vld20h), F(vld20w), NULL, }, + { F(vld21b), F(vld21h), F(vld21w), NULL, }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }; + if (a->qd > 6) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 32); +} + +static bool trans_VLD4(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vld40b), F(vld40h), F(vld40w), NULL, }, + { F(vld41b), F(vld41h), F(vld41w), NULL, }, + { F(vld42b), F(vld42h), F(vld42w), NULL, }, + { F(vld43b), F(vld43h), F(vld43w), NULL, }, + }; + if (a->qd > 4) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 64); +} + +static bool trans_VST2(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vst20b), F(vst20h), F(vst20w), NULL, }, + { F(vst21b), F(vst21h), F(vst21w), NULL, }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }; + if (a->qd > 6) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 32); +} + +static bool trans_VST4(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vst40b), F(vst40h), F(vst40w), NULL, }, + { F(vst41b), F(vst41h), F(vst41w), NULL, }, + { F(vst42b), F(vst42h), F(vst42w), NULL, }, + { F(vst43b), F(vst43h), F(vst43w), NULL, }, + }; + if (a->qd > 4) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 64); +} + +#undef F + 
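A note on the ECI machinery that every trans function above relies on: mve_eci_check(), mve_update_eci() and mve_skip_first_beat() implement the v8.1M rule that a 128-bit MVE insn executes as four 32-bit beats (A0, A1, A2, B0) and may be interrupted between them, with PSR.ECI recording which beats have already completed. The standalone C sketch below is illustrative only and is not part of this patch: beats_done() is an invented helper, and the enum values follow the architectural ECI encodings (value 3 is reserved, which is why mve_eci_check() raises an INVSTATE UsageFault for such values).

#include <stdint.h>
#include <stdio.h>

/* Architectural ECI encodings: which beats have already completed. */
enum { ECI_NONE = 0, ECI_A0 = 1, ECI_A0A1 = 2,
       ECI_A0A1A2 = 4, ECI_A0A1A2B0 = 5 };

/* Invented helper: how many of the four beats a given ECI has done. */
static int beats_done(int eci)
{
    switch (eci) {
    case ECI_NONE:     return 0;
    case ECI_A0:       return 1;
    case ECI_A0A1:     return 2;
    case ECI_A0A1A2:   return 3;
    case ECI_A0A1A2B0: return 4;
    default:           return -1; /* reserved: INVSTATE UsageFault */
    }
}

int main(void)
{
    uint32_t q[4] = { 1, 2, 3, 4 }, r[4] = { 0, 0, 0, 0 };
    int eci = ECI_A0A1;              /* beats A0 and A1 ran before an interrupt */

    for (int beat = 0; beat < 4; beat++) {
        if (beat < beats_done(eci)) {
            continue;                /* don't redo completed beats */
        }
        r[beat] = q[beat] + q[beat]; /* stand-in for the per-beat work */
    }
    printf("r = { %u, %u, %u, %u }\n", r[0], r[1], r[2], r[3]); /* 0 0 6 8 */
    return 0;
}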
+static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ + TCGv_ptr qd; + TCGv_i32 rt; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->rt == 13 || a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + rt = load_reg(s, a->rt); + if (mve_no_predication(s)) { + tcg_gen_gvec_dup_i32(a->size, mve_qreg_offset(a->qd), 16, 16, rt); + } else { + qd = mve_qreg_ptr(a->qd); + tcg_gen_dup_i32(a->size, rt, rt); + gen_helper_mve_vdup(cpu_env, qd, rt); + tcg_temp_free_ptr(qd); + } + tcg_temp_free_i32(rt); + mve_update_eci(s); + return true; +} + +static bool do_1op_vec(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn, + GVecGen2Fn vecfn) +{ + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) +{ + return do_1op_vec(s, a, fn, NULL); +} + +#define DO_1OP_VEC(INSN, FN, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_1op_vec(s, a, fns[a->size], VECFN); \ + } + +#define DO_1OP(INSN, FN) DO_1OP_VEC(INSN, FN, NULL) + +DO_1OP(VCLZ, vclz) +DO_1OP(VCLS, vcls) +DO_1OP_VEC(VABS, vabs, tcg_gen_gvec_abs) +DO_1OP_VEC(VNEG, vneg, tcg_gen_gvec_neg) +DO_1OP(VQABS, vqabs) +DO_1OP(VQNEG, vqneg) +DO_1OP(VMAXA, vmaxa) +DO_1OP(VMINA, vmina) + +/* + * For simple float/int conversions we use the fixed-point + * conversion helpers with a zero shift count + */ +#define DO_VCVT(INSN, HFN, SFN) \ + static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_##HFN(env, qd, qm, tcg_constant_i32(0)); \ + } \ + static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_##SFN(env, qd, qm, tcg_constant_i32(0)); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + NULL, \ + gen_##INSN##h, \ + gen_##INSN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VCVT(VCVT_SF, vcvt_sh, vcvt_sf) +DO_VCVT(VCVT_UF, vcvt_uh, vcvt_uf) +DO_VCVT(VCVT_FS, vcvt_hs, vcvt_fs) +DO_VCVT(VCVT_FU, vcvt_hu, vcvt_fu) + +static bool do_vcvt_rmode(DisasContext *s, arg_1op *a, + enum arm_fprounding rmode, bool u) +{ + /* + * Handle VCVT fp to int with specified rounding mode. + * This is a 1op fn but we must pass the rounding mode as + * an immediate to the helper. 
+ */ + TCGv_ptr qd, qm; + static MVEGenVCVTRmodeFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vcvt_rm_sh, gen_helper_mve_vcvt_rm_uh }, + { gen_helper_mve_vcvt_rm_ss, gen_helper_mve_vcvt_rm_us }, + { NULL, NULL }, + }; + MVEGenVCVTRmodeFn *fn = fns[a->size][u]; + + if (!dc_isar_feature(aa32_mve_fp, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(arm_rmode_to_sf(rmode))); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_VCVT_RMODE(INSN, RMODE, U) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + return do_vcvt_rmode(s, a, RMODE, U); \ + } \ + +DO_VCVT_RMODE(VCVTAS, FPROUNDING_TIEAWAY, false) +DO_VCVT_RMODE(VCVTAU, FPROUNDING_TIEAWAY, true) +DO_VCVT_RMODE(VCVTNS, FPROUNDING_TIEEVEN, false) +DO_VCVT_RMODE(VCVTNU, FPROUNDING_TIEEVEN, true) +DO_VCVT_RMODE(VCVTPS, FPROUNDING_POSINF, false) +DO_VCVT_RMODE(VCVTPU, FPROUNDING_POSINF, true) +DO_VCVT_RMODE(VCVTMS, FPROUNDING_NEGINF, false) +DO_VCVT_RMODE(VCVTMU, FPROUNDING_NEGINF, true) + +#define DO_VCVT_SH(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, gen_helper_mve_##FN); \ + } \ + +DO_VCVT_SH(VCVTB_SH, vcvtb_sh) +DO_VCVT_SH(VCVTT_SH, vcvtt_sh) +DO_VCVT_SH(VCVTB_HS, vcvtb_hs) +DO_VCVT_SH(VCVTT_HS, vcvtt_hs) + +#define DO_VRINT(INSN, RMODE) \ + static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_vrint_rm_h(env, qd, qm, \ + tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ + } \ + static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_vrint_rm_s(env, qd, qm, \ + tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + NULL, \ + gen_##INSN##h, \ + gen_##INSN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VRINT(VRINTN, FPROUNDING_TIEEVEN) +DO_VRINT(VRINTA, FPROUNDING_TIEAWAY) +DO_VRINT(VRINTZ, FPROUNDING_ZERO) +DO_VRINT(VRINTM, FPROUNDING_NEGINF) +DO_VRINT(VRINTP, FPROUNDING_POSINF) + +static bool trans_VRINTX(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vrintx_h, + gen_helper_mve_vrintx_s, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +/* Narrowing moves: only size 0 and 1 are valid */ +#define DO_VMOVN(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + NULL, \ + NULL, \ + }; \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VMOVN(VMOVNB, vmovnb) +DO_VMOVN(VMOVNT, vmovnt) +DO_VMOVN(VQMOVUNB, vqmovunb) +DO_VMOVN(VQMOVUNT, vqmovunt) +DO_VMOVN(VQMOVN_BS, vqmovnbs) +DO_VMOVN(VQMOVN_TS, vqmovnts) +DO_VMOVN(VQMOVN_BU, vqmovnbu) +DO_VMOVN(VQMOVN_TU, vqmovntu) + +static bool trans_VREV16(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev16b, + NULL, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV32(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { 
+ gen_helper_mve_vrev32b, + gen_helper_mve_vrev32h, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV64(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev64b, + gen_helper_mve_vrev64h, + gen_helper_mve_vrev64w, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VMVN(DisasContext *s, arg_1op *a) +{ + return do_1op_vec(s, a, gen_helper_mve_vmvn, tcg_gen_gvec_not); +} + +static bool trans_VABS_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfabsh, + gen_helper_mve_vfabss, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfnegh, + gen_helper_mve_vfnegs, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool do_2op_vec(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn, + GVecGen3Fn *vecfn) +{ + TCGv_ptr qd, qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qn), + mve_qreg_offset(a->qm), 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qn, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn *fn) +{ + return do_2op_vec(s, a, fn, NULL); +} + +#define DO_LOGIC(INSN, HELPER, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + return do_2op_vec(s, a, HELPER, VECFN); \ + } + +DO_LOGIC(VAND, gen_helper_mve_vand, tcg_gen_gvec_and) +DO_LOGIC(VBIC, gen_helper_mve_vbic, tcg_gen_gvec_andc) +DO_LOGIC(VORR, gen_helper_mve_vorr, tcg_gen_gvec_or) +DO_LOGIC(VORN, gen_helper_mve_vorn, tcg_gen_gvec_orc) +DO_LOGIC(VEOR, gen_helper_mve_veor, tcg_gen_gvec_xor) + +static bool trans_VPSEL(DisasContext *s, arg_2op *a) +{ + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + return do_2op(s, a, gen_helper_mve_vpsel); +} + +#define DO_2OP_VEC(INSN, FN, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_vec(s, a, fns[a->size], VECFN); \ + } + +#define DO_2OP(INSN, FN) DO_2OP_VEC(INSN, FN, NULL) + +DO_2OP_VEC(VADD, vadd, tcg_gen_gvec_add) +DO_2OP_VEC(VSUB, vsub, tcg_gen_gvec_sub) +DO_2OP_VEC(VMUL, vmul, tcg_gen_gvec_mul) +DO_2OP(VMULH_S, vmulhs) +DO_2OP(VMULH_U, vmulhu) +DO_2OP(VRMULH_S, vrmulhs) +DO_2OP(VRMULH_U, vrmulhu) +DO_2OP_VEC(VMAX_S, vmaxs, tcg_gen_gvec_smax) +DO_2OP_VEC(VMAX_U, vmaxu, tcg_gen_gvec_umax) +DO_2OP_VEC(VMIN_S, vmins, tcg_gen_gvec_smin) +DO_2OP_VEC(VMIN_U, vminu, tcg_gen_gvec_umin) +DO_2OP(VABD_S, vabds) +DO_2OP(VABD_U, vabdu) +DO_2OP(VHADD_S, vhadds) +DO_2OP(VHADD_U, vhaddu) +DO_2OP(VHSUB_S, vhsubs) +DO_2OP(VHSUB_U, vhsubu) +DO_2OP(VMULL_BS, vmullbs) +DO_2OP(VMULL_BU, vmullbu) +DO_2OP(VMULL_TS, vmullts) +DO_2OP(VMULL_TU, vmulltu) +DO_2OP(VQDMULH, vqdmulh) +DO_2OP(VQRDMULH, vqrdmulh) +DO_2OP(VQADD_S, 
vqadds) +DO_2OP(VQADD_U, vqaddu) +DO_2OP(VQSUB_S, vqsubs) +DO_2OP(VQSUB_U, vqsubu) +DO_2OP(VSHL_S, vshls) +DO_2OP(VSHL_U, vshlu) +DO_2OP(VRSHL_S, vrshls) +DO_2OP(VRSHL_U, vrshlu) +DO_2OP(VQSHL_S, vqshls) +DO_2OP(VQSHL_U, vqshlu) +DO_2OP(VQRSHL_S, vqrshls) +DO_2OP(VQRSHL_U, vqrshlu) +DO_2OP(VQDMLADH, vqdmladh) +DO_2OP(VQDMLADHX, vqdmladhx) +DO_2OP(VQRDMLADH, vqrdmladh) +DO_2OP(VQRDMLADHX, vqrdmladhx) +DO_2OP(VQDMLSDH, vqdmlsdh) +DO_2OP(VQDMLSDHX, vqdmlsdhx) +DO_2OP(VQRDMLSDH, vqrdmlsdh) +DO_2OP(VQRDMLSDHX, vqrdmlsdhx) +DO_2OP(VRHADD_S, vrhadds) +DO_2OP(VRHADD_U, vrhaddu) +/* + * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose + * so we can reuse the DO_2OP macro. (Our implementation calculates the + * "expected" results in this case.) Similarly for VHCADD. + */ +DO_2OP(VCADD90, vcadd90) +DO_2OP(VCADD270, vcadd270) +DO_2OP(VHCADD90, vhcadd90) +DO_2OP(VHCADD270, vhcadd270) + +static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullbh, + gen_helper_mve_vqdmullbw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullth, + gen_helper_mve_vqdmulltw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VMULLP_B(DisasContext *s, arg_2op *a) +{ + /* + * Note that a->size indicates the output size, ie VMULL.P8 + * is the 8x8->16 operation and a->size is MO_16; VMULL.P16 + * is the 16x16->32 operation and a->size is MO_32. + */ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vmullpbh, + gen_helper_mve_vmullpbw, + NULL, + }; + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VMULLP_T(DisasContext *s, arg_2op *a) +{ + /* a->size is as for trans_VMULLP_B */ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vmullpth, + gen_helper_mve_vmullptw, + NULL, + }; + return do_2op(s, a, fns[a->size]); +} + +/* + * VADC and VSBC: these perform an add-with-carry or subtract-with-carry + * of the 32-bit elements in each lane of the input vectors, where the + * carry-out of each add is the carry-in of the next. The initial carry + * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C + * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C. + * These insns are subject to beat-wise execution. Partial execution + * of an I=1 (initial carry input fixed) insn which does not + * execute the first beat must start with the current FPSCR.NZCV + * value, not the fixed constant input. 
+ */ +static bool trans_VADC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vadc); +} + +static bool trans_VADCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VADC(s, a); + } + return do_2op(s, a, gen_helper_mve_vadci); +} + +static bool trans_VSBC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vsbc); +} + +static bool trans_VSBCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VSBC(s, a); + } + return do_2op(s, a, gen_helper_mve_vsbci); +} + +#define DO_2OP_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2op(s, a, fns[a->size]); \ + } + +DO_2OP_FP(VADD_fp, vfadd) +DO_2OP_FP(VSUB_fp, vfsub) +DO_2OP_FP(VMUL_fp, vfmul) +DO_2OP_FP(VABD_fp, vfabd) +DO_2OP_FP(VMAXNM, vmaxnm) +DO_2OP_FP(VMINNM, vminnm) +DO_2OP_FP(VCADD90_fp, vfcadd90) +DO_2OP_FP(VCADD270_fp, vfcadd270) +DO_2OP_FP(VFMA, vfma) +DO_2OP_FP(VFMS, vfms) +DO_2OP_FP(VCMUL0, vcmul0) +DO_2OP_FP(VCMUL90, vcmul90) +DO_2OP_FP(VCMUL180, vcmul180) +DO_2OP_FP(VCMUL270, vcmul270) +DO_2OP_FP(VCMLA0, vcmla0) +DO_2OP_FP(VCMLA90, vcmla90) +DO_2OP_FP(VCMLA180, vcmla180) +DO_2OP_FP(VCMLA270, vcmla270) +DO_2OP_FP(VMAXNMA, vmaxnma) +DO_2OP_FP(VMINNMA, vminnma) + +static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, + MVEGenTwoOpScalarFn fn) +{ + TCGv_ptr qd, qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn) || + !fn) { + return false; + } + if (a->rm == 13 || a->rm == 15) { + /* UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + rm = load_reg(s, a->rm); + fn(cpu_env, qd, qn, rm); + tcg_temp_free_i32(rm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_2OP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_SCALAR(VADD_scalar, vadd_scalar) +DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) +DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) +DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) +DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) +DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) +DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) +DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) +DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) +DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) +DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) +DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) +DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) +DO_2OP_SCALAR(VBRSR, vbrsr) +DO_2OP_SCALAR(VMLA, vmla) +DO_2OP_SCALAR(VMLAS, vmlas) +DO_2OP_SCALAR(VQDMLAH, vqdmlah) +DO_2OP_SCALAR(VQRDMLAH, vqrdmlah) +DO_2OP_SCALAR(VQDMLASH, vqdmlash) +DO_2OP_SCALAR(VQRDMLASH, vqrdmlash) + +static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullb_scalarh, + gen_helper_mve_vqdmullb_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} 
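The carry chaining that the VADC/VSBC comment above describes can be modelled in a few lines of host C. The sketch below is illustrative only, not the helper implementation: vadc_model() is an invented name, and predication and beat-wise partial execution are deliberately ignored.

#include <stdint.h>
#include <stdio.h>

/* Model of VADC across the four 32-bit lanes; returns the new FPSCR.C. */
static int vadc_model(uint32_t d[4], const uint32_t n[4],
                      const uint32_t m[4], int carry_in)
{
    for (int lane = 0; lane < 4; lane++) {
        uint64_t sum = (uint64_t)n[lane] + m[lane] + carry_in;
        d[lane] = (uint32_t)sum;
        carry_in = (int)(sum >> 32); /* carry-out feeds the next lane */
    }
    return carry_in;
}

int main(void)
{
    uint32_t n[4] = { 0xffffffffu, 0, 0, 0 };
    uint32_t m[4] = { 1, 0, 0, 0 };
    uint32_t d[4];
    int c = vadc_model(d, n, m, 0);  /* VADCI: initial carry fixed at 0 */
    printf("d = { %u, %u, %u, %u }, FPSCR.C = %d\n",
           d[0], d[1], d[2], d[3], c); /* d = { 0, 1, 0, 0 }, C = 0 */
    return 0;
}

VSBC follows the same pattern with the second operand inverted (n + ~m + carry), which is why the fixed initial carry for VSBCI is 1 where VADCI's is 0.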
+ +static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullt_scalarh, + gen_helper_mve_vqdmullt_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} + + +#define DO_2OP_FP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_FP_SCALAR(VADD_fp_scalar, vfadd_scalar) +DO_2OP_FP_SCALAR(VSUB_fp_scalar, vfsub_scalar) +DO_2OP_FP_SCALAR(VMUL_fp_scalar, vfmul_scalar) +DO_2OP_FP_SCALAR(VFMA_scalar, vfma_scalar) +DO_2OP_FP_SCALAR(VFMAS_scalar, vfmas_scalar) + +static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, + MVEGenLongDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn | a->qm) || + !fn) { + return false; + } + /* + * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related + * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. + */ + if (a->rdahi == 13 || a->rdahi == 15) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. + */ + if (a->a || mve_skip_first_beat(s)) { + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + rda = tcg_const_i64(0); + } + + fn(rda, cpu_env, qn, qm, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh }, + { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavuh, NULL }, + { gen_helper_mve_vmlaldavuw, NULL }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh }, + { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw, + }; + 
return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhuw, NULL, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool do_dual_acc(DisasContext *s, arg_vmladav *a, MVEGenDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. + */ + if (a->a || mve_skip_first_beat(s)) { + rda = load_reg(s, a->rda); + } else { + rda = tcg_const_i32(0); + } + + fn(rda, cpu_env, qn, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + mve_update_eci(s); + return true; +} + +#define DO_DUAL_ACC(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmladav *a) \ + { \ + static MVEGenDualAccOpFn * const fns[4][2] = { \ + { gen_helper_mve_##FN##b, gen_helper_mve_##FN##xb }, \ + { gen_helper_mve_##FN##h, gen_helper_mve_##FN##xh }, \ + { gen_helper_mve_##FN##w, gen_helper_mve_##FN##xw }, \ + { NULL, NULL }, \ + }; \ + return do_dual_acc(s, a, fns[a->size][a->x]); \ + } + +DO_DUAL_ACC(VMLADAV_S, vmladavs) +DO_DUAL_ACC(VMLSDAV, vmlsdav) + +static bool trans_VMLADAV_U(DisasContext *s, arg_vmladav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { gen_helper_mve_vmladavub, NULL }, + { gen_helper_mve_vmladavuh, NULL }, + { gen_helper_mve_vmladavuw, NULL }, + { NULL, NULL }, + }; + return do_dual_acc(s, a, fns[a->size][a->x]); +} + +static void gen_vpst(DisasContext *s, uint32_t mask) +{ + /* + * Set the VPR mask fields. We take advantage of MASK01 and MASK23 + * being adjacent fields in the register. + * + * Updating the masks is not predicated, but it is subject to beat-wise + * execution, and the mask is updated on the odd-numbered beats. + * So if PSR.ECI says we should skip beat 1, we mustn't update the + * 01 mask field. + */ + TCGv_i32 vpr = load_cpu_field(v7m.vpr); + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + /* Update both 01 and 23 fields */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(mask | (mask << 4)), + R_V7M_VPR_MASK01_SHIFT, + R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH); + break; + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + /* Update only the 23 mask field */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(mask), + R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH); + break; + default: + g_assert_not_reached(); + } + store_cpu_field(vpr, v7m.vpr); +} + +static bool trans_VPST(DisasContext *s, arg_VPST *a) +{ + /* mask == 0 is a "related encoding" */ + if (!dc_isar_feature(aa32_mve, s) || !a->mask) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + gen_vpst(s, a->mask); + mve_update_and_store_eci(s); + return true; +} + +static bool trans_VPNOT(DisasContext *s, arg_VPNOT *a) +{ + /* + * Invert the predicate in VPR.P0. 
We have to call out to + * a helper because this insn itself is beatwise and can + * be predicated. + */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + gen_helper_mve_vpnot(cpu_env); + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +static bool trans_VADDV(DisasContext *s, arg_VADDV *a) +{ + /* VADDV: vector add across vector */ + static MVEGenVADDVFn * const fns[4][2] = { + { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub }, + { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh }, + { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw }, + { NULL, NULL } + }; + TCGv_ptr qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + a->size == 3) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current value of Rda, not zero. + */ + if (a->a || mve_skip_first_beat(s)) { + /* Accumulate input from Rda */ + rda = load_reg(s, a->rda); + } else { + /* Accumulate starting at zero */ + rda = tcg_const_i32(0); + } + + qm = mve_qreg_ptr(a->qm); + fns[a->size][a->u](rda, cpu_env, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + + mve_update_eci(s); + return true; +} + +static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a) +{ + /* + * Vector Add Long Across Vector: accumulate the 32-bit + * elements of the vector into a 64-bit result stored in + * a pair of general-purpose registers. + * No need to check Qm's bank: it is only 3 bits in decode. + */ + TCGv_ptr qm; + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + /* + * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related + * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. + */ + if (a->rdahi == 13 || a->rdahi == 15) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current value of RdaHi:RdaLo, not zero. 
+ */ + if (a->a || mve_skip_first_beat(s)) { + /* Accumulate input from RdaHi:RdaLo */ + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + /* Accumulate starting at zero */ + rda = tcg_const_i64(0); + } + + qm = mve_qreg_ptr(a->qm); + if (a->u) { + gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda); + } else { + gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda); + } + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn, + GVecGen2iFn *vecfn) +{ + TCGv_ptr qd; + uint64_t imm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + imm = asimd_imm_const(a->imm, a->cmode, a->op); + + if (vecfn && mve_no_predication(s)) { + vecfn(MO_64, mve_qreg_offset(a->qd), mve_qreg_offset(a->qd), + imm, 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + fn(cpu_env, qd, tcg_constant_i64(imm)); + tcg_temp_free_ptr(qd); + } + mve_update_eci(s); + return true; +} + +static void gen_gvec_vmovi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, c); +} + +static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a) +{ + /* Handle decode of cmode/op here between VORR/VBIC/VMOV */ + MVEGenOneOpImmFn *fn; + GVecGen2iFn *vecfn; + + if ((a->cmode & 1) && a->cmode < 12) { + if (a->op) { + /* + * For op=1, the immediate will be inverted by asimd_imm_const(), + * so the VBIC becomes a logical AND operation. + */ + fn = gen_helper_mve_vandi; + vecfn = tcg_gen_gvec_andi; + } else { + fn = gen_helper_mve_vorri; + vecfn = tcg_gen_gvec_ori; + } + } else { + /* There is one unallocated cmode/op combination in this space */ + if (a->cmode == 15 && a->op == 1) { + return false; + } + /* asimd_imm_const() sorts out VMVNI vs VMOVI for us */ + fn = gen_helper_mve_vmovi; + vecfn = gen_gvec_vmovi; + } + return do_1imm(s, a, fn, vecfn); +} + +static bool do_2shift_vec(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, + bool negateshift, GVecGen2iFn vecfn) +{ + TCGv_ptr qd, qm; + int shift = a->shift; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * When we handle a right shift insn using a left-shift helper + * which permits a negative shift count to indicate a right-shift, + * we must negate the shift count. 
+ */ + if (negateshift) { + shift = -shift; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), + shift, 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(shift)); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_2shift(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, + bool negateshift) +{ + return do_2shift_vec(s, a, fn, negateshift, NULL); +} + +#define DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2shift_vec(s, a, fns[a->size], NEGATESHIFT, VECFN); \ + } + +#define DO_2SHIFT(INSN, FN, NEGATESHIFT) \ + DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, NULL) + +static void do_gvec_shri_s(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + /* + * We get here with a negated shift count, and we must handle + * shifts by the element size, which tcg_gen_gvec_sari() does not do. + */ + shift = -shift; + if (shift == (8 << vece)) { + shift--; + } + tcg_gen_gvec_sari(vece, dofs, aofs, shift, oprsz, maxsz); +} + +static void do_gvec_shri_u(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + /* + * We get here with a negated shift count, and we must handle + * shifts by the element size, which tcg_gen_gvec_shri() does not do. + */ + shift = -shift; + if (shift == (8 << vece)) { + tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, 0); + } else { + tcg_gen_gvec_shri(vece, dofs, aofs, shift, oprsz, maxsz); + } +} + +DO_2SHIFT_VEC(VSHLI, vshli_u, false, tcg_gen_gvec_shli) +DO_2SHIFT(VQSHLI_S, vqshli_s, false) +DO_2SHIFT(VQSHLI_U, vqshli_u, false) +DO_2SHIFT(VQSHLUI, vqshlui_s, false) +/* These right shifts use a left-shift helper with negated shift count */ +DO_2SHIFT_VEC(VSHRI_S, vshli_s, true, do_gvec_shri_s) +DO_2SHIFT_VEC(VSHRI_U, vshli_u, true, do_gvec_shri_u) +DO_2SHIFT(VRSHRI_S, vrshli_s, true) +DO_2SHIFT(VRSHRI_U, vrshli_u, true) + +DO_2SHIFT_VEC(VSRI, vsri, false, gen_gvec_sri) +DO_2SHIFT_VEC(VSLI, vsli, false, gen_gvec_sli) + +#define DO_2SHIFT_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2shift(s, a, gen_helper_mve_##FN, false); \ + } + +DO_2SHIFT_FP(VCVT_SH_fixed, vcvt_sh) +DO_2SHIFT_FP(VCVT_UH_fixed, vcvt_uh) +DO_2SHIFT_FP(VCVT_HS_fixed, vcvt_hs) +DO_2SHIFT_FP(VCVT_HU_fixed, vcvt_hu) +DO_2SHIFT_FP(VCVT_SF_fixed, vcvt_sf) +DO_2SHIFT_FP(VCVT_UF_fixed, vcvt_uf) +DO_2SHIFT_FP(VCVT_FS_fixed, vcvt_fs) +DO_2SHIFT_FP(VCVT_FU_fixed, vcvt_fu) + +static bool do_2shift_scalar(DisasContext *s, arg_shl_scalar *a, + MVEGenTwoOpShiftFn *fn) +{ + TCGv_ptr qda; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qda) || + a->rm == 13 || a->rm == 15 || !fn) { + /* Rm cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qda = mve_qreg_ptr(a->qda); + rm = load_reg(s, a->rm); + fn(cpu_env, qda, qda, rm); + tcg_temp_free_ptr(qda); + tcg_temp_free_i32(rm); + mve_update_eci(s); + return true; +} + +#define DO_2SHIFT_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_shl_scalar *a) \ + { \ + 
static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2shift_scalar(s, a, fns[a->size]); \ + } + +DO_2SHIFT_SCALAR(VSHL_S_scalar, vshli_s) +DO_2SHIFT_SCALAR(VSHL_U_scalar, vshli_u) +DO_2SHIFT_SCALAR(VRSHL_S_scalar, vrshli_s) +DO_2SHIFT_SCALAR(VRSHL_U_scalar, vrshli_u) +DO_2SHIFT_SCALAR(VQSHL_S_scalar, vqshli_s) +DO_2SHIFT_SCALAR(VQSHL_U_scalar, vqshli_u) +DO_2SHIFT_SCALAR(VQRSHL_S_scalar, vqrshli_s) +DO_2SHIFT_SCALAR(VQRSHL_U_scalar, vqrshli_u) + +#define DO_VSHLL(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + }; \ + return do_2shift_vec(s, a, fns[a->size], false, do_gvec_##FN); \ + } + +/* + * For the VSHLL vector helpers, the vece is the size of the input + * (ie MO_8 or MO_16); the helpers want to work in the output size. + * The shift count can be 0..<input size>, inclusive. (0 is VMOVL.) + */ +static void do_gvec_vshllbs(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + tcg_gen_gvec_shli(ovece, dofs, aofs, ibits, oprsz, maxsz); + tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); +} + +static void do_gvec_vshllbu(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 0xff : 0xffff, oprsz, maxsz); + tcg_gen_gvec_shli(ovece, dofs, dofs, shift, oprsz, maxsz); +} + +static void do_gvec_vshllts(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + if (shift == 0) { + tcg_gen_gvec_sari(ovece, dofs, aofs, ibits, oprsz, maxsz); + } else { + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 0xff00 : 0xffff0000, oprsz, maxsz); + tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); + } +} + +static void do_gvec_vshlltu(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + if (shift == 0) { + tcg_gen_gvec_shri(ovece, dofs, aofs, ibits, oprsz, maxsz); + } else { + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 
0xff00 : 0xffff0000, oprsz, maxsz); + tcg_gen_gvec_shri(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); + } +} + +DO_VSHLL(VSHLL_BS, vshllbs) +DO_VSHLL(VSHLL_BU, vshllbu) +DO_VSHLL(VSHLL_TS, vshllts) +DO_VSHLL(VSHLL_TU, vshlltu) + +#define DO_2SHIFT_N(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + }; \ + return do_2shift(s, a, fns[a->size], false); \ + } + +DO_2SHIFT_N(VSHRNB, vshrnb) +DO_2SHIFT_N(VSHRNT, vshrnt) +DO_2SHIFT_N(VRSHRNB, vrshrnb) +DO_2SHIFT_N(VRSHRNT, vrshrnt) +DO_2SHIFT_N(VQSHRNB_S, vqshrnb_s) +DO_2SHIFT_N(VQSHRNT_S, vqshrnt_s) +DO_2SHIFT_N(VQSHRNB_U, vqshrnb_u) +DO_2SHIFT_N(VQSHRNT_U, vqshrnt_u) +DO_2SHIFT_N(VQSHRUNB, vqshrunb) +DO_2SHIFT_N(VQSHRUNT, vqshrunt) +DO_2SHIFT_N(VQRSHRNB_S, vqrshrnb_s) +DO_2SHIFT_N(VQRSHRNT_S, vqrshrnt_s) +DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u) +DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u) +DO_2SHIFT_N(VQRSHRUNB, vqrshrunb) +DO_2SHIFT_N(VQRSHRUNT, vqrshrunt) + +static bool trans_VSHLC(DisasContext *s, arg_VSHLC *a) +{ + /* + * Whole Vector Left Shift with Carry. The carry is taken + * from a general purpose register and written back there. + * An imm of 0 means "shift by 32". + */ + TCGv_ptr qd; + TCGv_i32 rdm; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->rdm == 13 || a->rdm == 15) { + /* CONSTRAINED UNPREDICTABLE: we UNDEF */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rdm = load_reg(s, a->rdm); + gen_helper_mve_vshlc(rdm, cpu_env, qd, rdm, tcg_constant_i32(a->imm)); + store_reg(s, a->rdm, rdm); + tcg_temp_free_ptr(qd); + mve_update_eci(s); + return true; +} + +static bool do_vidup(DisasContext *s, arg_vidup *a, MVEGenVIDUPFn *fn) +{ + TCGv_ptr qd; + TCGv_i32 rn; + + /* + * Vector increment/decrement with wrap and duplicate (VIDUP, VDDUP). + * This fills the vector with elements of successively increasing + * or decreasing values, starting from Rn. + */ + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->size == MO_64) { + /* size 0b11 is another encoding */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rn = load_reg(s, a->rn); + fn(rn, cpu_env, qd, rn, tcg_constant_i32(a->imm)); + store_reg(s, a->rn, rn); + tcg_temp_free_ptr(qd); + mve_update_eci(s); + return true; +} + +static bool do_viwdup(DisasContext *s, arg_viwdup *a, MVEGenVIWDUPFn *fn) +{ + TCGv_ptr qd; + TCGv_i32 rn, rm; + + /* + * Vector increment/decrement with wrap and duplicate (VIWDUP, VDWDUP). + * This fills the vector with elements of successively increasing + * or decreasing values, starting from Rn. Rm specifies a point where + * the count wraps back around to 0. The updated offset is written back + * to Rn. + */ + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (!fn || a->rm == 13 || a->rm == 15) { + /* + * size 0b11 is another encoding; Rm == 13 and Rm == 15 are + * UNPREDICTABLE; we choose to UNDEF. 
+ */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rn = load_reg(s, a->rn); + rm = load_reg(s, a->rm); + fn(rn, cpu_env, qd, rn, rm, tcg_constant_i32(a->imm)); + store_reg(s, a->rn, rn); + tcg_temp_free_ptr(qd); + tcg_temp_free_i32(rm); + mve_update_eci(s); + return true; +} + +static bool trans_VIDUP(DisasContext *s, arg_vidup *a) +{ + static MVEGenVIDUPFn * const fns[] = { + gen_helper_mve_vidupb, + gen_helper_mve_viduph, + gen_helper_mve_vidupw, + NULL, + }; + return do_vidup(s, a, fns[a->size]); +} + +static bool trans_VDDUP(DisasContext *s, arg_vidup *a) +{ + static MVEGenVIDUPFn * const fns[] = { + gen_helper_mve_vidupb, + gen_helper_mve_viduph, + gen_helper_mve_vidupw, + NULL, + }; + /* VDDUP is just like VIDUP but with a negative immediate */ + a->imm = -a->imm; + return do_vidup(s, a, fns[a->size]); +} + +static bool trans_VIWDUP(DisasContext *s, arg_viwdup *a) +{ + static MVEGenVIWDUPFn * const fns[] = { + gen_helper_mve_viwdupb, + gen_helper_mve_viwduph, + gen_helper_mve_viwdupw, + NULL, + }; + return do_viwdup(s, a, fns[a->size]); +} + +static bool trans_VDWDUP(DisasContext *s, arg_viwdup *a) +{ + static MVEGenVIWDUPFn * const fns[] = { + gen_helper_mve_vdwdupb, + gen_helper_mve_vdwduph, + gen_helper_mve_vdwdupw, + NULL, + }; + return do_viwdup(s, a, fns[a->size]); +} + +static bool do_vcmp(DisasContext *s, arg_vcmp *a, MVEGenCmpFn *fn) +{ + TCGv_ptr qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qn, qm); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + if (a->mask) { + /* VPT */ + gen_vpst(s, a->mask); + } + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +static bool do_vcmp_scalar(DisasContext *s, arg_vcmp_scalar *a, + MVEGenScalarCmpFn *fn) +{ + TCGv_ptr qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || !fn || a->rm == 13) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + if (a->rm == 15) { + /* Encoding Rm=0b1111 means "constant zero" */ + rm = tcg_constant_i32(0); + } else { + rm = load_reg(s, a->rm); + } + fn(cpu_env, qn, rm); + tcg_temp_free_ptr(qn); + tcg_temp_free_i32(rm); + if (a->mask) { + /* VPT */ + gen_vpst(s, a->mask); + } + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +#define DO_VCMP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ + { \ + static MVEGenCmpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vcmp(s, a, fns[a->size]); \ + } \ + static bool trans_##INSN##_scalar(DisasContext *s, \ + arg_vcmp_scalar *a) \ + { \ + static MVEGenScalarCmpFn * const fns[] = { \ + gen_helper_mve_##FN##_scalarb, \ + gen_helper_mve_##FN##_scalarh, \ + gen_helper_mve_##FN##_scalarw, \ + NULL, \ + }; \ + return do_vcmp_scalar(s, a, fns[a->size]); \ + } + +DO_VCMP(VCMPEQ, vcmpeq) +DO_VCMP(VCMPNE, vcmpne) +DO_VCMP(VCMPCS, vcmpcs) +DO_VCMP(VCMPHI, vcmphi) +DO_VCMP(VCMPGE, vcmpge) +DO_VCMP(VCMPLT, vcmplt) +DO_VCMP(VCMPGT, vcmpgt) +DO_VCMP(VCMPLE, vcmple) + +#define DO_VCMP_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ + { \ + static 
MVEGenCmpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vcmp(s, a, fns[a->size]); \ + } \ + static bool trans_##INSN##_scalar(DisasContext *s, \ + arg_vcmp_scalar *a) \ + { \ + static MVEGenScalarCmpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##_scalarh, \ + gen_helper_mve_##FN##_scalars, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vcmp_scalar(s, a, fns[a->size]); \ + } + +DO_VCMP_FP(VCMPEQ_fp, vfcmpeq) +DO_VCMP_FP(VCMPNE_fp, vfcmpne) +DO_VCMP_FP(VCMPGE_fp, vfcmpge) +DO_VCMP_FP(VCMPLT_fp, vfcmplt) +DO_VCMP_FP(VCMPGT_fp, vfcmpgt) +DO_VCMP_FP(VCMPLE_fp, vfcmple) + +static bool do_vmaxv(DisasContext *s, arg_vmaxv *a, MVEGenVADDVFn fn) +{ + /* + * MIN/MAX operations across a vector: compute the min or + * max of the initial value in a general purpose register + * and all the elements in the vector, and store it back + * into the general purpose register. + */ + TCGv_ptr qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || + !fn || a->rda == 13 || a->rda == 15) { + /* Rda cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qm = mve_qreg_ptr(a->qm); + rda = load_reg(s, a->rda); + fn(rda, cpu_env, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_VMAXV(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ + { \ + static MVEGenVADDVFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vmaxv(s, a, fns[a->size]); \ + } + +DO_VMAXV(VMAXV_S, vmaxvs) +DO_VMAXV(VMAXV_U, vmaxvu) +DO_VMAXV(VMAXAV, vmaxav) +DO_VMAXV(VMINV_S, vminvs) +DO_VMAXV(VMINV_U, vminvu) +DO_VMAXV(VMINAV, vminav) + +#define DO_VMAXV_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ + { \ + static MVEGenVADDVFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vmaxv(s, a, fns[a->size]); \ + } + +DO_VMAXV_FP(VMAXNMV, vmaxnmv) +DO_VMAXV_FP(VMINNMV, vminnmv) +DO_VMAXV_FP(VMAXNMAV, vmaxnmav) +DO_VMAXV_FP(VMINNMAV, vminnmav) + +static bool do_vabav(DisasContext *s, arg_vabav *a, MVEGenVABAVFn *fn) +{ + /* Absolute difference accumulated across vector */ + TCGv_ptr qn, qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qm | a->qn) || + !fn || a->rda == 13 || a->rda == 15) { + /* Rda cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qm = mve_qreg_ptr(a->qm); + qn = mve_qreg_ptr(a->qn); + rda = load_reg(s, a->rda); + fn(rda, cpu_env, qn, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_VABAV(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vabav *a) \ + { \ + static MVEGenVABAVFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vabav(s, a, fns[a->size]); \ + } + +DO_VABAV(VABAV_S, vabavs) +DO_VABAV(VABAV_U, vabavu) + +static bool trans_VMOV_to_2gp(DisasContext *s, arg_VMOV_to_2gp *a) +{ + /* + * VMOV two 32-bit vector lanes 
to two general-purpose registers. + * This insn is not predicated but it is subject to beat-wise + * execution if it is not in an IT block. For us this means + * only that if PSR.ECI says we should not be executing the beat + * corresponding to the lane of the vector register being accessed + * then we should skip performing the move, and that we need to do + * the usual check for bad ECI state and advance of ECI state. + * (If PSR.ECI is non-zero then we cannot be in an IT block.) + */ + TCGv_i32 tmp; + int vd; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) || + a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15 || + a->rt == a->rt2) { + /* Rt/Rt2 cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* Convert Qreg index to Dreg for read_neon_element32() etc */ + vd = a->qd * 2; + + if (!mve_skip_vmov(s, vd, a->idx, MO_32)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, vd, a->idx, MO_32); + store_reg(s, a->rt, tmp); + } + if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, vd + 1, a->idx, MO_32); + store_reg(s, a->rt2, tmp); + } + + mve_update_and_store_eci(s); + return true; +} + +static bool trans_VMOV_from_2gp(DisasContext *s, arg_VMOV_to_2gp *a) +{ + /* + * VMOV two general-purpose registers to two 32-bit vector lanes. + * This insn is not predicated but it is subject to beat-wise + * execution if it is not in an IT block. For us this means + * only that if PSR.ECI says we should not be executing the beat + * corresponding to the lane of the vector register being accessed + * then we should skip performing the move, and that we need to do + * the usual check for bad ECI state and advance of ECI state. + * (If PSR.ECI is non-zero then we cannot be in an IT block.) + */ + TCGv_i32 tmp; + int vd; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) || + a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15) { + /* Rt/Rt2 cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* Convert Qreg idx to Dreg for read_neon_element32() etc */ + vd = a->qd * 2; + + if (!mve_skip_vmov(s, vd, a->idx, MO_32)) { + tmp = load_reg(s, a->rt); + write_neon_element32(tmp, vd, a->idx, MO_32); + tcg_temp_free_i32(tmp); + } + if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) { + tmp = load_reg(s, a->rt2); + write_neon_element32(tmp, vd + 1, a->idx, MO_32); + tcg_temp_free_i32(tmp); + } + + mve_update_and_store_eci(s); + return true; +} diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c new file mode 100644 index 0000000..4016339 --- /dev/null +++ b/target/arm/tcg/translate-neon.c @@ -0,0 +1,4064 @@ +/* + * ARM translation: AArch32 Neon instructions + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2005-2007 CodeSourcery + * Copyright (c) 2007 OpenedHand, Ltd. + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +/* Include the generated Neon decoder */ +#include "decode-neon-dp.c.inc" +#include "decode-neon-ls.c.inc" +#include "decode-neon-shared.c.inc" + +static TCGv_ptr vfp_reg_ptr(bool dp, int reg) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg)); + return ret; +} + +static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i32(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i32(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i64(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i64(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld32u_i64(var, cpu_env, offset); + break; + case MO_UQ: + tcg_gen_ld_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i32(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i32(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i64(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i64(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st32_i64(var, cpu_env, offset); + break; + case MO_64: + tcg_gen_st_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm, + int data, gen_helper_gvec_4 *fn_gvec) +{ + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { + return false; + } + + /* + * UNDEF accesses to odd registers for each bit of Q. + * Q will be 0b111 for all Q-reg instructions, otherwise + * when we have mixed Q- and D-reg inputs. + */ + if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + int opr_sz = q ? 16 : 8; + tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd), + vfp_reg_offset(1, vn), + vfp_reg_offset(1, vm), + vfp_reg_offset(1, vd), + opr_sz, opr_sz, data, fn_gvec); + return true; +} + +static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm, + int data, ARMFPStatusFlavour fp_flavour, + gen_helper_gvec_4_ptr *fn_gvec_ptr) +{ + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { + return false; + } + + /* + * UNDEF accesses to odd registers for each bit of Q. 
+ * Q will be 0b111 for all Q-reg instructions, otherwise + * when we have mixed Q- and D-reg inputs. + */ + if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + int opr_sz = q ? 16 : 8; + TCGv_ptr fpst = fpstatus_ptr(fp_flavour); + + tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd), + vfp_reg_offset(1, vn), + vfp_reg_offset(1, vm), + vfp_reg_offset(1, vd), + fpst, opr_sz, opr_sz, data, fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a) +{ + if (!dc_isar_feature(aa32_vcma, s)) { + return false; + } + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, + FPST_STD_F16, gen_helper_gvec_fcmlah); + } + return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, + FPST_STD, gen_helper_gvec_fcmlas); +} + +static bool trans_VCADD(DisasContext *s, arg_VCADD *a) +{ + int opr_sz; + TCGv_ptr fpst; + gen_helper_gvec_3_ptr *fn_gvec_ptr; + + if (!dc_isar_feature(aa32_vcma, s) + || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); + fn_gvec_ptr = (a->size == MO_16) ? + gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(1, a->vn), + vfp_reg_offset(1, a->vm), + fpst, opr_sz, opr_sz, a->rot, + fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_sdot_b); +} + +static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_udot_b); +} + +static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_usdot_b); +} + +static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfdot); +} + +static bool trans_VFML(DisasContext *s, arg_VFML *a) +{ + int opr_sz; + + if (!dc_isar_feature(aa32_fhm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + (a->vd & 0x10)) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(a->q, a->vn), + vfp_reg_offset(a->q, a->vm), + cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */ + gen_helper_gvec_fmlal_a32); + return true; +} + +static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a) +{ + int data = (a->index << 2) | a->rot; + + if (!dc_isar_feature(aa32_vcma, s)) { + return false; + } + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, + FPST_STD_F16, gen_helper_gvec_fcmlah_idx); + } + return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, + FPST_STD, gen_helper_gvec_fcmlas_idx); +} + +static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_sdot_idx_b); +} + +static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_udot_idx_b); +} + +static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_usdot_idx_b); +} + +static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_sudot_idx_b); +} + +static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_bfdot_idx); +} + +static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a) +{ + int opr_sz; + + if (!dc_isar_feature(aa32_fhm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(a->q, a->vn), + vfp_reg_offset(a->q, a->rm), + cpu_env, opr_sz, opr_sz, + (a->index << 2) | a->s, /* is_2 == 0 */ + gen_helper_gvec_fmlal_idx_a32); + return true; +} + +static struct { + int nregs; + int interleave; + int spacing; +} const neon_ls_element_type[11] = { + {1, 4, 1}, + {1, 4, 2}, + {4, 1, 1}, + {2, 2, 2}, + {1, 3, 1}, + {1, 3, 2}, + {3, 1, 1}, + {1, 1, 1}, + {1, 2, 1}, + {1, 2, 2}, + {2, 1, 1} +}; + +static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn, + int stride) +{ + if (rm != 15) { + TCGv_i32 base; + + base = load_reg(s, rn); + if (rm == 13) { + tcg_gen_addi_i32(base, base, stride); + } else { + TCGv_i32 index; + index = load_reg(s, rm); + tcg_gen_add_i32(base, base, index); + tcg_temp_free_i32(index); + } + store_reg(s, rn, base); + } +} + +static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) +{ + /* Neon load/store multiple structures */ + int nregs, interleave, spacing, reg, n; + MemOp mop, align, endian; + int mmu_idx = get_mem_index(s); + int size = a->size; + TCGv_i64 tmp64; + TCGv_i32 addr; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + if (a->itype > 10) { + return false; + } + /* Catch UNDEF cases for bad values of align field */ + switch (a->itype & 0xc) { + case 4: + if (a->align >= 2) { + return false; + } + break; + case 8: + if (a->align == 3) { + return false; + } + break; + default: + break; + } + nregs = neon_ls_element_type[a->itype].nregs; + interleave = neon_ls_element_type[a->itype].interleave; + spacing = neon_ls_element_type[a->itype].spacing; + if (size == 3 && (interleave | spacing) != 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* For our purposes, bytes are always little-endian. */ + endian = s->be_data; + if (size == 0) { + endian = MO_LE; + } + + /* Enforce alignment requested by the instruction */ + if (a->align) { + align = pow2_align(a->align + 2); /* 4 ** a->align */ + } else { + align = s->align_mem ? MO_ALIGN : 0; + } + + /* + * Consecutive little-endian elements from a single register + * can be promoted to a larger little-endian operation. + */ + if (interleave == 1 && endian == MO_LE) { + /* Retain any natural alignment. 
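+ *
+ * For illustration, take an assumed little-endian VLD1.16 {d0}, [r1]
+ * (interleave == 1, size == 1): the four 16-bit elements of D0 are
+ * fetched by one 64-bit access, so size is forced to 3 below and the
+ * element loop then runs 8 >> 3 == 1 time per register, while
+ * pow2_align(size) keeps the original 2-byte natural alignment
+ * instead of letting MO_ALIGN demand 8-byte alignment.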
*/ + if (align == MO_ALIGN) { + align = pow2_align(size); + } + size = 3; + } + + tmp64 = tcg_temp_new_i64(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + + mop = endian | size | align; + for (reg = 0; reg < nregs; reg++) { + for (n = 0; n < 8 >> size; n++) { + int xs; + for (xs = 0; xs < interleave; xs++) { + int tt = a->vd + reg + spacing * xs; + + if (a->l) { + gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop); + neon_store_element64(tt, n, size, tmp64); + } else { + neon_load_element64(tmp64, tt, n, size); + gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop); + } + tcg_gen_addi_i32(addr, addr, 1 << size); + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; + } + } + } + tcg_temp_free_i32(addr); + tcg_temp_free_i64(tmp64); + + gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8); + return true; +} + +static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a) +{ + /* Neon load single structure to all lanes */ + int reg, stride, vec_size; + int vd = a->vd; + int size = a->size; + int nregs = a->n + 1; + TCGv_i32 addr, tmp; + MemOp mop, align; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + align = 0; + if (size == 3) { + if (nregs != 4 || a->a == 0) { + return false; + } + /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */ + size = MO_32; + align = MO_ALIGN_16; + } else if (a->a) { + switch (nregs) { + case 1: + if (size == 0) { + return false; + } + align = MO_ALIGN; + break; + case 2: + align = pow2_align(size + 1); + break; + case 3: + return false; + case 4: + if (size == 2) { + align = pow2_align(3); + } else { + align = pow2_align(size + 2); + } + break; + default: + g_assert_not_reached(); + } + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * VLD1 to all lanes: T bit indicates how many Dregs to write. + * VLD2/3/4 to all lanes: T bit indicates register stride. + */ + stride = a->t ? 2 : 1; + vec_size = nregs == 1 ? stride * 8 : 8; + mop = size | align; + tmp = tcg_temp_new_i32(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + for (reg = 0; reg < nregs; reg++) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop); + if ((vd & 1) && vec_size == 16) { + /* + * We cannot write 16 bytes at once because the + * destination is unaligned. + */ + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), + 8, 8, tmp); + tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1), + neon_full_reg_offset(vd), 8, 8); + } else { + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), + vec_size, vec_size, tmp); + } + tcg_gen_addi_i32(addr, addr, 1 << size); + vd += stride; + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs); + + return true; +} + +static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a) +{ + /* Neon load/store single structure to one lane */ + int reg; + int nregs = a->n + 1; + int vd = a->vd; + TCGv_i32 addr, tmp; + MemOp mop; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + /* Catch the UNDEF cases. This is unavoidably a bit messy. 
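+ *
+ * One illustrative reserved case (assumed encoding): a single-lane
+ * VLD1 (nregs == 1) of a 32-bit element (size == 2) with align == 1
+ * or align == 2 requests a sub-word alignment hint the architecture
+ * does not define, so the first case below returns false for it.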
     */
+    switch (nregs) {
+    case 1:
+        if (a->stride != 1) {
+            return false;
+        }
+        if (((a->align & (1 << a->size)) != 0) ||
+            (a->size == 2 && (a->align == 1 || a->align == 2))) {
+            return false;
+        }
+        break;
+    case 2:
+        if (a->size == 2 && (a->align & 2) != 0) {
+            return false;
+        }
+        break;
+    case 3:
+        if (a->align != 0) {
+            return false;
+        }
+        break;
+    case 4:
+        if (a->size == 2 && a->align == 3) {
+            return false;
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if ((vd + a->stride * (nregs - 1)) > 31) {
+        /*
+         * Attempts to write off the end of the register file are
+         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
+         * access off the end of the array that holds the register data.
+         */
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /* Pick up SCTLR settings */
+    mop = finalize_memop(s, a->size);
+
+    if (a->align) {
+        MemOp align_op;
+
+        switch (nregs) {
+        case 1:
+            /* For VLD1, use natural alignment. */
+            align_op = MO_ALIGN;
+            break;
+        case 2:
+            /* For VLD2, use double alignment. */
+            align_op = pow2_align(a->size + 1);
+            break;
+        case 4:
+            if (a->size == MO_32) {
+                /*
+                 * For VLD4.32, align = 1 is double alignment, align = 2 is
+                 * quad alignment; align = 3 is rejected above.
+                 */
+                align_op = pow2_align(a->size + a->align);
+            } else {
+                /* For VLD4.8 and VLD4.16, we want quad alignment. */
+                align_op = pow2_align(a->size + 2);
+            }
+            break;
+        default:
+            /* For VLD3, the alignment field is zero and rejected above. */
+            g_assert_not_reached();
+        }
+
+        mop = (mop & ~MO_AMASK) | align_op;
+    }
+
+    tmp = tcg_temp_new_i32();
+    addr = tcg_temp_new_i32();
+    load_reg_var(s, addr, a->rn);
+
+    for (reg = 0; reg < nregs; reg++) {
+        if (a->l) {
+            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
+            neon_store_element(vd, a->reg_idx, a->size, tmp);
+        } else { /* Store */
+            neon_load_element(tmp, vd, a->reg_idx, a->size);
+            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
+        }
+        vd += a->stride;
+        tcg_gen_addi_i32(addr, addr, 1 << a->size);
+
+        /* Subsequent memory operations inherit alignment */
+        mop &= ~MO_AMASK;
+    }
+    tcg_temp_free_i32(addr);
+    tcg_temp_free_i32(tmp);
+
+    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
+
+    return true;
+}
+
+static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
+{
+    int vec_size = a->q ? 16 : 8;
+    int rd_ofs = neon_full_reg_offset(a->vd);
+    int rn_ofs = neon_full_reg_offset(a->vn);
+    int rm_ofs = neon_full_reg_offset(a->vm);
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist.
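+     *
+     * The Q-bit test a few lines below is similar: a Q register is an
+     * even pair of D registers, so for a Q-reg op (a->q == 1) every
+     * operand must be even. As a sketch with an assumed vn = 5,
+     *     (a->vn | a->vm | a->vd) & a->q
+     * has bit 0 set and the insn UNDEFs.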
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size); + return true; +} + +#define DO_3SAME(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + return do_3same(s, a, FUNC); \ + } + +DO_3SAME(VADD, tcg_gen_gvec_add) +DO_3SAME(VSUB, tcg_gen_gvec_sub) +DO_3SAME(VAND, tcg_gen_gvec_and) +DO_3SAME(VBIC, tcg_gen_gvec_andc) +DO_3SAME(VORR, tcg_gen_gvec_or) +DO_3SAME(VORN, tcg_gen_gvec_orc) +DO_3SAME(VEOR, tcg_gen_gvec_xor) +DO_3SAME(VSHL_S, gen_gvec_sshl) +DO_3SAME(VSHL_U, gen_gvec_ushl) +DO_3SAME(VQADD_S, gen_gvec_sqadd_qc) +DO_3SAME(VQADD_U, gen_gvec_uqadd_qc) +DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc) +DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc) + +/* These insns are all gvec_bitsel but with the inputs in various orders. */ +#define DO_3SAME_BITSEL(INSN, O1, O2, O3) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \ + } \ + DO_3SAME(INSN, gen_##INSN##_3s) + +DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs) +DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs) +DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs) + +#define DO_3SAME_NO_SZ_3(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == 3) { \ + return false; \ + } \ + return do_3same(s, a, FUNC); \ + } + +DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax) +DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax) +DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin) +DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin) +DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul) +DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla) +DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls) +DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst) +DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd) +DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba) +DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd) +DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba) + +#define DO_3SAME_CMP(INSN, COND) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \ + } \ + DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s) + +DO_3SAME_CMP(VCGT_S, TCG_COND_GT) +DO_3SAME_CMP(VCGT_U, TCG_COND_GTU) +DO_3SAME_CMP(VCGE_S, TCG_COND_GE) +DO_3SAME_CMP(VCGE_U, TCG_COND_GEU) +DO_3SAME_CMP(VCEQ, TCG_COND_EQ) + +#define WRAP_OOL_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \ + } + +WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b) + +static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a) +{ + if (a->size != 0) { + return false; + } + return do_3same(s, a, gen_VMUL_p_3s); +} + +#define DO_VQRDMLAH(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (!dc_isar_feature(aa32_rdm, s)) { \ + return false; \ + } \ + if (a->size != 1 && a->size != 2) { \ + return false; \ + } \ + return do_3same(s, a, FUNC); \ + } + +DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc) +DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc) + +#define DO_SHA1(NAME, FUNC) \ + WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ + static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ + { 
\ + if (!dc_isar_feature(aa32_sha1, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##NAME##_3s); \ + } + +DO_SHA1(SHA1C, gen_helper_crypto_sha1c) +DO_SHA1(SHA1P, gen_helper_crypto_sha1p) +DO_SHA1(SHA1M, gen_helper_crypto_sha1m) +DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0) + +#define DO_SHA2(NAME, FUNC) \ + WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ + static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (!dc_isar_feature(aa32_sha2, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##NAME##_3s); \ + } + +DO_SHA2(SHA256H, gen_helper_crypto_sha256h) +DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2) +DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1) + +#define DO_3SAME_64(INSN, FUNC) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 op = { .fni8 = FUNC }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \ + } \ + DO_3SAME(INSN, gen_##INSN##_3s) + +#define DO_3SAME_64_ENV(INSN, FUNC) \ + static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \ + { \ + FUNC(d, cpu_env, n, m); \ + } \ + DO_3SAME_64(INSN, gen_##INSN##_elt) + +DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64) +DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64) +DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64) +DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64) +DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64) +DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64) + +#define DO_3SAME_32(INSN, FUNC) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[4] = { \ + { .fni4 = gen_helper_neon_##FUNC##8 }, \ + { .fni4 = gen_helper_neon_##FUNC##16 }, \ + { .fni4 = gen_helper_neon_##FUNC##32 }, \ + { 0 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +/* + * Some helper functions need to be passed the cpu_env. In order + * to use those with the gvec APIs like tcg_gen_gvec_3() we need + * to create wrapper functions whose prototype is a NeonGenTwoOpFn() + * and which call a NeonGenTwoOpEnvFn(). 
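+ *
+ * As a sketch, the use WRAP_ENV_FN(gen_VQSHL_S_tramp8,
+ * gen_helper_neon_qshl_s8) generated by DO_3SAME_32_ENV below
+ * expands to roughly:
+ *
+ *     static void gen_VQSHL_S_tramp8(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)
+ *     {
+ *         gen_helper_neon_qshl_s8(d, cpu_env, n, m);
+ *     }
+ *
+ * which has the NeonGenTwoOpFn shape that GVecGen3.fni4 expects.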
+ */ +#define WRAP_ENV_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \ + { \ + FUNC(d, cpu_env, n, m); \ + } + +#define DO_3SAME_32_ENV(INSN, FUNC) \ + WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \ + WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \ + WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[4] = { \ + { .fni4 = gen_##INSN##_tramp8 }, \ + { .fni4 = gen_##INSN##_tramp16 }, \ + { .fni4 = gen_##INSN##_tramp32 }, \ + { 0 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +DO_3SAME_32(VHADD_S, hadd_s) +DO_3SAME_32(VHADD_U, hadd_u) +DO_3SAME_32(VHSUB_S, hsub_s) +DO_3SAME_32(VHSUB_U, hsub_u) +DO_3SAME_32(VRHADD_S, rhadd_s) +DO_3SAME_32(VRHADD_U, rhadd_u) +DO_3SAME_32(VRSHL_S, rshl_s) +DO_3SAME_32(VRSHL_U, rshl_u) + +DO_3SAME_32_ENV(VQSHL_S, qshl_s) +DO_3SAME_32_ENV(VQSHL_U, qshl_u) +DO_3SAME_32_ENV(VQRSHL_S, qrshl_s) +DO_3SAME_32_ENV(VQRSHL_U, qrshl_u) + +static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn) +{ + /* Operations handled pairwise 32 bits at a time */ + TCGv_i32 tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + assert(a->q == 0); /* enforced by decode patterns */ + + /* + * Note that we have to be careful not to clobber the source operands + * in the "vm == vd" case by storing the result of the first pass too + * early. Since Q is 0 there are always just two passes, so instead + * of a complicated loop over each pass we just unroll. + */ + tmp = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + tmp3 = tcg_temp_new_i32(); + + read_neon_element32(tmp, a->vn, 0, MO_32); + read_neon_element32(tmp2, a->vn, 1, MO_32); + fn(tmp, tmp, tmp2); + + read_neon_element32(tmp3, a->vm, 0, MO_32); + read_neon_element32(tmp2, a->vm, 1, MO_32); + fn(tmp3, tmp3, tmp2); + + write_neon_element32(tmp, a->vd, 0, MO_32); + write_neon_element32(tmp3, a->vd, 1, MO_32); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp3); + return true; +} + +#define DO_3SAME_PAIR(INSN, func) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + static NeonGenTwoOpFn * const fns[] = { \ + gen_helper_neon_##func##8, \ + gen_helper_neon_##func##16, \ + gen_helper_neon_##func##32, \ + }; \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same_pair(s, a, fns[a->size]); \ + } + +/* 32-bit pairwise ops end up the same as the elementwise versions. 
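+ *
+ * That is because do_3same_pair() hands each helper one adjacent pair
+ * of 32-bit elements at a time, and a D register holds exactly one
+ * such pair, so e.g. a pairwise maximum of {a, b} is just max(a, b);
+ * the defines below can therefore reuse the plain scalar TCG ops.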
*/ +#define gen_helper_neon_pmax_s32 tcg_gen_smax_i32 +#define gen_helper_neon_pmax_u32 tcg_gen_umax_i32 +#define gen_helper_neon_pmin_s32 tcg_gen_smin_i32 +#define gen_helper_neon_pmin_u32 tcg_gen_umin_i32 +#define gen_helper_neon_padd_u32 tcg_gen_add_i32 + +DO_3SAME_PAIR(VPMAX_S, pmax_s) +DO_3SAME_PAIR(VPMIN_S, pmin_s) +DO_3SAME_PAIR(VPMAX_U, pmax_u) +DO_3SAME_PAIR(VPMIN_U, pmin_u) +DO_3SAME_PAIR(VPADD, padd_u) + +#define DO_3SAME_VQDMULH(INSN, FUNC) \ + WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \ + WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[2] = { \ + { .fni4 = gen_##INSN##_tramp16 }, \ + { .fni4 = gen_##INSN##_tramp32 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size != 1 && a->size != 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +DO_3SAME_VQDMULH(VQDMULH, qdmulh) +DO_3SAME_VQDMULH(VQRDMULH, qrdmulh) + +#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + TCGv_ptr fpst = fpstatus_ptr(FPST); \ + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \ + oprsz, maxsz, 0, FUNC); \ + tcg_temp_free_ptr(fpst); \ + } + +#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \ + WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \ + WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \ + static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_fp16_3s); \ + } \ + return do_3same(s, a, gen_##INSN##_fp32_3s); \ + } + + +DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h) +DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h) +DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h) +DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h) +DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h) +DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h) +DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h) +DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h) +DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h) +DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h) +DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h) +DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h) +DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h) +DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h) +DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h) +DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h) +DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h) + +WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s) +WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h) +WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s) +WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h) + +static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a) +{ + if 
(!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_3same(s, a, gen_VMAXNM_fp16_3s); + } + return do_3same(s, a, gen_VMAXNM_fp32_3s); +} + +static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_3same(s, a, gen_VMINNM_fp16_3s); + } + return do_3same(s, a, gen_VMINNM_fp32_3s); +} + +static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, + gen_helper_gvec_3_ptr *fn) +{ + /* FP pairwise operations */ + TCGv_ptr fpstatus; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + assert(a->q == 0); /* enforced by decode patterns */ + + + fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(1, a->vn), + vfp_reg_offset(1, a->vm), + fpstatus, 8, 8, 0, fn); + tcg_temp_free_ptr(fpstatus); + + return true; +} + +/* + * For all the functions using this macro, size == 1 means fp16, + * which is an architecture extension we don't implement yet. + */ +#define DO_3S_FP_PAIR(INSN,FUNC) \ + static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_3same_fp_pair(s, a, FUNC##h); \ + } \ + return do_3same_fp_pair(s, a, FUNC##s); \ + } + +DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd) +DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax) +DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin) + +static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn) +{ + /* Handle a 2-reg-shift insn which can be vectorized. */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
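+ *
+ * Only two registers take part in a 2-reg-shift op, so the bit-4
+ * test below covers just vd and vm. With assumed operands vd = 4
+ * (D4) and vm = 0x11 (D17), (4 | 0x11) & 0x10 is non-zero and the
+ * insn UNDEFs on a CPU without the 32-register SIMD file.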
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size); + return true; +} + +#define DO_2SH(INSN, FUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_vector_2sh(s, a, FUNC); \ + } \ + +DO_2SH(VSHL, tcg_gen_gvec_shli) +DO_2SH(VSLI, gen_gvec_sli) +DO_2SH(VSRI, gen_gvec_sri) +DO_2SH(VSRA_S, gen_gvec_ssra) +DO_2SH(VSRA_U, gen_gvec_usra) +DO_2SH(VRSHR_S, gen_gvec_srshr) +DO_2SH(VRSHR_U, gen_gvec_urshr) +DO_2SH(VRSRA_S, gen_gvec_srsra) +DO_2SH(VRSRA_U, gen_gvec_ursra) + +static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a) +{ + /* Signed shift out of range results in all-sign-bits */ + a->shift = MIN(a->shift, (8 << a->size) - 1); + return do_vector_2sh(s, a, tcg_gen_gvec_sari); +} + +static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0); +} + +static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a) +{ + /* Shift out of range is architecturally valid and results in zero. */ + if (a->shift >= (8 << a->size)) { + return do_vector_2sh(s, a, gen_zero_rd_2sh); + } else { + return do_vector_2sh(s, a, tcg_gen_gvec_shri); + } +} + +static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a, + NeonGenTwo64OpEnvFn *fn) +{ + /* + * 2-reg-and-shift operations, size == 3 case, where the + * function needs to be passed cpu_env. + */ + TCGv_i64 constimm; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * To avoid excessive duplication of ops we implement shift + * by immediate using the variable shift operations. + */ + constimm = tcg_constant_i64(dup_const(a->size, a->shift)); + + for (pass = 0; pass < a->q + 1; pass++) { + TCGv_i64 tmp = tcg_temp_new_i64(); + + read_neon_element64(tmp, a->vm, pass, MO_64); + fn(tmp, cpu_env, tmp, constimm); + write_neon_element64(tmp, a->vd, pass, MO_64); + tcg_temp_free_i64(tmp); + } + return true; +} + +static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a, + NeonGenTwoOpEnvFn *fn) +{ + /* + * 2-reg-and-shift operations, size < 3 case, where the + * helper needs to be passed cpu_env. + */ + TCGv_i32 constimm, tmp; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * To avoid excessive duplication of ops we implement shift + * by immediate using the variable shift operations. + */ + constimm = tcg_constant_i32(dup_const(a->size, a->shift)); + tmp = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + fn(tmp, cpu_env, tmp, constimm); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + return true; +} + +#define DO_2SHIFT_ENV(INSN, FUNC) \ + static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \ + } \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + static NeonGenTwoOpEnvFn * const fns[] = { \ + gen_helper_neon_##FUNC##8, \ + gen_helper_neon_##FUNC##16, \ + gen_helper_neon_##FUNC##32, \ + }; \ + assert(a->size < ARRAY_SIZE(fns)); \ + return do_2shift_env_32(s, a, fns[a->size]); \ + } + +DO_2SHIFT_ENV(VQSHLU, qshlu_s) +DO_2SHIFT_ENV(VQSHL_U, qshl_u) +DO_2SHIFT_ENV(VQSHL_S, qshl_s) + +static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a, + NeonGenTwo64OpFn *shiftfn, + NeonGenNarrowEnvFn *narrowfn) +{ + /* 2-reg-and-shift narrowing-shift operations, size == 3 case */ + TCGv_i64 constimm, rm1, rm2; + TCGv_i32 rd; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is always a right shift, and the shiftfn is always a + * left-shift helper, which thus needs the negated shift count. + */ + constimm = tcg_constant_i64(-a->shift); + rm1 = tcg_temp_new_i64(); + rm2 = tcg_temp_new_i64(); + rd = tcg_temp_new_i32(); + + /* Load both inputs first to avoid potential overwrite if rm == rd */ + read_neon_element64(rm1, a->vm, 0, MO_64); + read_neon_element64(rm2, a->vm, 1, MO_64); + + shiftfn(rm1, rm1, constimm); + narrowfn(rd, cpu_env, rm1); + write_neon_element32(rd, a->vd, 0, MO_32); + + shiftfn(rm2, rm2, constimm); + narrowfn(rd, cpu_env, rm2); + write_neon_element32(rd, a->vd, 1, MO_32); + + tcg_temp_free_i32(rd); + tcg_temp_free_i64(rm1); + tcg_temp_free_i64(rm2); + + return true; +} + +static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a, + NeonGenTwoOpFn *shiftfn, + NeonGenNarrowEnvFn *narrowfn) +{ + /* 2-reg-and-shift narrowing-shift operations, size < 3 case */ + TCGv_i32 constimm, rm1, rm2, rm3, rm4; + TCGv_i64 rtmp; + uint32_t imm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is always a right shift, and the shiftfn is always a + * left-shift helper, which thus needs the negated shift count + * duplicated into each lane of the immediate value. 
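+ *
+ * A worked sketch (assumed encoding): for 16-bit elements
+ * (a->size == 1) with a->shift == 3, the negated count is
+ * (uint16_t)-3 == 0xfffd, and duplicating it below yields
+ * imm == 0xfffdfffd, i.e. both 16-bit lanes of the 32-bit constant
+ * carry the left-shift count -3.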
+ */ + if (a->size == 1) { + imm = (uint16_t)(-a->shift); + imm |= imm << 16; + } else { + /* size == 2 */ + imm = -a->shift; + } + constimm = tcg_constant_i32(imm); + + /* Load all inputs first to avoid potential overwrite */ + rm1 = tcg_temp_new_i32(); + rm2 = tcg_temp_new_i32(); + rm3 = tcg_temp_new_i32(); + rm4 = tcg_temp_new_i32(); + read_neon_element32(rm1, a->vm, 0, MO_32); + read_neon_element32(rm2, a->vm, 1, MO_32); + read_neon_element32(rm3, a->vm, 2, MO_32); + read_neon_element32(rm4, a->vm, 3, MO_32); + rtmp = tcg_temp_new_i64(); + + shiftfn(rm1, rm1, constimm); + shiftfn(rm2, rm2, constimm); + + tcg_gen_concat_i32_i64(rtmp, rm1, rm2); + tcg_temp_free_i32(rm2); + + narrowfn(rm1, cpu_env, rtmp); + write_neon_element32(rm1, a->vd, 0, MO_32); + tcg_temp_free_i32(rm1); + + shiftfn(rm3, rm3, constimm); + shiftfn(rm4, rm4, constimm); + + tcg_gen_concat_i32_i64(rtmp, rm3, rm4); + tcg_temp_free_i32(rm4); + + narrowfn(rm3, cpu_env, rtmp); + tcg_temp_free_i64(rtmp); + write_neon_element32(rm3, a->vd, 1, MO_32); + tcg_temp_free_i32(rm3); + return true; +} + +#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \ + } +#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \ + } + +static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + tcg_gen_extrl_i64_i32(dest, src); +} + +static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + gen_helper_neon_narrow_u16(dest, src); +} + +static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + gen_helper_neon_narrow_u8(dest, src); +} + +DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32) +DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16) +DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8) + +DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32) +DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16) +DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8) + +DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32) +DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16) +DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8) + +DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32) +DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16) +DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8) +DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32) +DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16) +DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8) + +DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32) +DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16) +DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8) + +DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32) +DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16) +DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8) + +DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32) +DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16) +DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, 
gen_helper_neon_narrow_sat_u8) + +static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a, + NeonGenWidenFn *widenfn, bool u) +{ + TCGv_i64 tmp; + TCGv_i32 rm0, rm1; + uint64_t widen_mask = 0; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is a widen-and-shift operation. The shift is always less + * than the width of the source type, so after widening the input + * vector we can simply shift the whole 64-bit widened register, + * and then clear the potential overflow bits resulting from left + * bits of the narrow input appearing as right bits of the left + * neighbour narrow input. Calculate a mask of bits to clear. + */ + if ((a->shift != 0) && (a->size < 2 || u)) { + int esize = 8 << a->size; + widen_mask = MAKE_64BIT_MASK(0, esize); + widen_mask >>= esize - a->shift; + widen_mask = dup_const(a->size + 1, widen_mask); + } + + rm0 = tcg_temp_new_i32(); + rm1 = tcg_temp_new_i32(); + read_neon_element32(rm0, a->vm, 0, MO_32); + read_neon_element32(rm1, a->vm, 1, MO_32); + tmp = tcg_temp_new_i64(); + + widenfn(tmp, rm0); + tcg_temp_free_i32(rm0); + if (a->shift != 0) { + tcg_gen_shli_i64(tmp, tmp, a->shift); + tcg_gen_andi_i64(tmp, tmp, ~widen_mask); + } + write_neon_element64(tmp, a->vd, 0, MO_64); + + widenfn(tmp, rm1); + tcg_temp_free_i32(rm1); + if (a->shift != 0) { + tcg_gen_shli_i64(tmp, tmp, a->shift); + tcg_gen_andi_i64(tmp, tmp, ~widen_mask); + } + write_neon_element64(tmp, a->vd, 1, MO_64); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + }; + return do_vshll_2sh(s, a, widenfn[a->size], false); +} + +static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + }; + return do_vshll_2sh(s, a, widenfn[a->size], true); +} + +static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a, + gen_helper_gvec_2_ptr *fn) +{ + /* FP operations in 2-reg-and-shift group */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + TCGv_ptr fpst; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(a->size == MO_16 ? 
FPST_STD_F16 : FPST_STD);
+    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
+    tcg_temp_free_ptr(fpst);
+    return true;
+}
+
+#define DO_FP_2SH(INSN, FUNC)                                           \
+    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
+    {                                                                   \
+        return do_fp_2sh(s, a, FUNC);                                   \
+    }
+
+DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
+DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
+DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
+DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
+
+DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
+DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
+DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
+DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
+
+static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
+                        GVecGen2iFn *fn)
+{
+    uint64_t imm;
+    int reg_ofs, vec_size;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+        return false;
+    }
+
+    if (a->vd & a->q) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    reg_ofs = neon_full_reg_offset(a->vd);
+    vec_size = a->q ? 16 : 8;
+    imm = asimd_imm_const(a->imm, a->cmode, a->op);
+
+    fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
+    return true;
+}
+
+static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        int64_t c, uint32_t oprsz, uint32_t maxsz)
+{
+    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
+}
+
+static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
+{
+    /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
+    GVecGen2iFn *fn;
+
+    if ((a->cmode & 1) && a->cmode < 12) {
+        /* for op=1, the imm will be inverted, so BIC becomes AND. */
+        fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
+    } else {
+        /* There is one unallocated cmode/op combination in this space */
+        if (a->cmode == 15 && a->op == 1) {
+            return false;
+        }
+        fn = gen_VMOV_1r;
+    }
+    return do_1reg_imm(s, a, fn);
+}
+
+static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
+                           NeonGenWidenFn *widenfn,
+                           NeonGenTwo64OpFn *opfn,
+                           int src1_mop, int src2_mop)
+{
+    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
+    TCGv_i64 rn0_64, rn1_64, rm_64;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if (!opfn) {
+        /* size == 3 case, which is an entirely different insn group */
+        return false;
+    }
+
+    if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    rn0_64 = tcg_temp_new_i64();
+    rn1_64 = tcg_temp_new_i64();
+    rm_64 = tcg_temp_new_i64();
+
+    if (src1_mop >= 0) {
+        read_neon_element64(rn0_64, a->vn, 0, src1_mop);
+    } else {
+        TCGv_i32 tmp = tcg_temp_new_i32();
+        read_neon_element32(tmp, a->vn, 0, MO_32);
+        widenfn(rn0_64, tmp);
+        tcg_temp_free_i32(tmp);
+    }
+    if (src2_mop >= 0) {
+        read_neon_element64(rm_64, a->vm, 0, src2_mop);
+    } else {
+        TCGv_i32 tmp = tcg_temp_new_i32();
+        read_neon_element32(tmp, a->vm, 0, MO_32);
+        widenfn(rm_64, tmp);
+        tcg_temp_free_i32(tmp);
+    }
+
+    opfn(rn0_64, rn0_64, rm_64);
+
+    /*
+     * Load second pass inputs before storing the first pass result, to
+     * avoid incorrect results if a narrow input overlaps with the result.
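+     *
+     * As a concrete (assumed) example, VADDL.S16 q0, d0, d2 has
+     * vd == vn: the second pass widens the high half of D0, so storing
+     * the first pass result into D0 before that read would corrupt the
+     * input.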
+ */ + if (src1_mop >= 0) { + read_neon_element64(rn1_64, a->vn, 1, src1_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, 1, MO_32); + widenfn(rn1_64, tmp); + tcg_temp_free_i32(tmp); + } + if (src2_mop >= 0) { + read_neon_element64(rm_64, a->vm, 1, src2_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 1, MO_32); + widenfn(rm_64, tmp); + tcg_temp_free_i32(tmp); + } + + write_neon_element64(rn0_64, a->vd, 0, MO_64); + + opfn(rn1_64, rn1_64, rm_64); + write_neon_element64(rn1_64, a->vd, 1, MO_64); + + tcg_temp_free_i64(rn0_64); + tcg_temp_free_i64(rn1_64); + tcg_temp_free_i64(rm_64); + + return true; +} + +#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenWidenFn * const widenfn[] = { \ + gen_helper_neon_widen_##S##8, \ + gen_helper_neon_widen_##S##16, \ + NULL, NULL, \ + }; \ + static NeonGenTwo64OpFn * const addfn[] = { \ + gen_helper_neon_##OP##l_u16, \ + gen_helper_neon_##OP##l_u32, \ + tcg_gen_##OP##_i64, \ + NULL, \ + }; \ + int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \ + return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \ + SRC1WIDE ? MO_UQ : narrow_mop, \ + narrow_mop); \ + } + +DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN) +DO_PREWIDEN(VADDL_U, u, add, false, 0) +DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN) +DO_PREWIDEN(VSUBL_U, u, sub, false, 0) +DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN) +DO_PREWIDEN(VADDW_U, u, add, true, 0) +DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN) +DO_PREWIDEN(VSUBW_U, u, sub, true, 0) + +static bool do_narrow_3d(DisasContext *s, arg_3diff *a, + NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn) +{ + /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */ + TCGv_i64 rn_64, rm_64; + TCGv_i32 rd0, rd1; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn || !narrowfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if ((a->vn | a->vm) & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rn_64 = tcg_temp_new_i64(); + rm_64 = tcg_temp_new_i64(); + rd0 = tcg_temp_new_i32(); + rd1 = tcg_temp_new_i32(); + + read_neon_element64(rn_64, a->vn, 0, MO_64); + read_neon_element64(rm_64, a->vm, 0, MO_64); + + opfn(rn_64, rn_64, rm_64); + + narrowfn(rd0, rn_64); + + read_neon_element64(rn_64, a->vn, 1, MO_64); + read_neon_element64(rm_64, a->vm, 1, MO_64); + + opfn(rn_64, rn_64, rm_64); + + narrowfn(rd1, rn_64); + + write_neon_element32(rd0, a->vd, 0, MO_32); + write_neon_element32(rd1, a->vd, 1, MO_32); + + tcg_temp_free_i32(rd0); + tcg_temp_free_i32(rd1); + tcg_temp_free_i64(rn_64); + tcg_temp_free_i64(rm_64); + + return true; +} + +#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenTwo64OpFn * const addfn[] = { \ + gen_helper_neon_##OP##l_u16, \ + gen_helper_neon_##OP##l_u32, \ + tcg_gen_##OP##_i64, \ + NULL, \ + }; \ + static NeonGenNarrowFn * const narrowfn[] = { \ + gen_helper_neon_##NARROWTYPE##_high_u8, \ + gen_helper_neon_##NARROWTYPE##_high_u16, \ + EXTOP, \ + NULL, \ + }; \ + return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \ + } + +static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn) +{ + tcg_gen_addi_i64(rn, rn, 1u << 31); + tcg_gen_extrh_i64_i32(rd, rn); +} + +DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32) +DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32) +DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32) +DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32) + +static bool do_long_3d(DisasContext *s, arg_3diff *a, + NeonGenTwoOpWidenFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * 3-regs different lengths, long operations. + * These perform an operation on two inputs that returns a double-width + * result, and then possibly perform an accumulation operation of + * that result into the double-width destination. + */ + TCGv_i64 rd0, rd1, tmp; + TCGv_i32 rn, rm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rd0 = tcg_temp_new_i64(); + rd1 = tcg_temp_new_i64(); + + rn = tcg_temp_new_i32(); + rm = tcg_temp_new_i32(); + read_neon_element32(rn, a->vn, 0, MO_32); + read_neon_element32(rm, a->vm, 0, MO_32); + opfn(rd0, rn, rm); + + read_neon_element32(rn, a->vn, 1, MO_32); + read_neon_element32(rm, a->vm, 1, MO_32); + opfn(rd1, rn, rm); + tcg_temp_free_i32(rn); + tcg_temp_free_i32(rm); + + /* Don't store results until after all loads: they might overlap */ + if (accfn) { + tmp = tcg_temp_new_i64(); + read_neon_element64(tmp, a->vd, 0, MO_64); + accfn(rd0, tmp, rd0); + read_neon_element64(tmp, a->vd, 1, MO_64); + accfn(rd1, tmp, rd1); + tcg_temp_free_i64(tmp); + } + + write_neon_element64(rd0, a->vd, 0, MO_64); + write_neon_element64(rd1, a->vd, 1, MO_64); + tcg_temp_free_i64(rd0); + tcg_temp_free_i64(rd1); + + return true; +} + +static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_s16, + gen_helper_neon_abdl_s32, + gen_helper_neon_abdl_s64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_u16, + gen_helper_neon_abdl_u32, + gen_helper_neon_abdl_u64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_s16, + gen_helper_neon_abdl_s32, + gen_helper_neon_abdl_s64, + NULL, + }; + static NeonGenTwo64OpFn * const addfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], addfn[a->size]); +} + +static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_u16, + gen_helper_neon_abdl_u32, + gen_helper_neon_abdl_u64, + NULL, + }; + static NeonGenTwo64OpFn * const addfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], addfn[a->size]); +} + +static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + TCGv_i32 lo = tcg_temp_new_i32(); + TCGv_i32 hi = tcg_temp_new_i32(); + + tcg_gen_muls2_i32(lo, hi, rn, rm); + tcg_gen_concat_i32_i64(rd, lo, hi); + + tcg_temp_free_i32(lo); + tcg_temp_free_i32(hi); +} + +static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + TCGv_i32 lo = tcg_temp_new_i32(); + TCGv_i32 hi = tcg_temp_new_i32(); + + tcg_gen_mulu2_i32(lo, hi, rn, rm); + tcg_gen_concat_i32_i64(rd, lo, hi); + + tcg_temp_free_i32(lo); + tcg_temp_free_i32(hi); +} + +static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_mull_s8, + gen_helper_neon_mull_s16, + gen_mull_s32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_mull_u8, + gen_helper_neon_mull_u16, + gen_mull_u32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +#define 
DO_VMLAL(INSN,MULL,ACC) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenTwoOpWidenFn * const opfn[] = { \ + gen_helper_neon_##MULL##8, \ + gen_helper_neon_##MULL##16, \ + gen_##MULL##32, \ + NULL, \ + }; \ + static NeonGenTwo64OpFn * const accfn[] = { \ + gen_helper_neon_##ACC##l_u16, \ + gen_helper_neon_##ACC##l_u32, \ + tcg_gen_##ACC##_i64, \ + NULL, \ + }; \ + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \ + } + +DO_VMLAL(VMLAL_S,mull_s,add) +DO_VMLAL(VMLAL_U,mull_u,add) +DO_VMLAL(VMLSL_S,mull_s,sub) +DO_VMLAL(VMLSL_U,mull_u,sub) + +static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + gen_helper_neon_mull_s16(rd, rn, rm); + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd); +} + +static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + gen_mull_s32(rd, rn, rm); + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd); +} + +static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); +} + +static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); +} + +static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLAL_acc_16, + gen_VQDMLAL_acc_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); +} + +static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_negl_u32(rm, rm); + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); +} + +static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_neg_i64(rm, rm); + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); +} + +static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLSL_acc_16, + gen_VQDMLSL_acc_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a) +{ + gen_helper_gvec_3 *fn_gvec; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + switch (a->size) { + case 0: + fn_gvec = gen_helper_neon_pmull_h; + break; + case 2: + if (!dc_isar_feature(aa32_pmull, s)) { + return false; + } + fn_gvec = gen_helper_gvec_pmull_q; + break; + default: + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd), + neon_full_reg_offset(a->vn), + neon_full_reg_offset(a->vm), + 16, 16, 0, fn_gvec); + return true; +} + +static void gen_neon_dup_low16(TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_ext16u_i32(var, var); + tcg_gen_shli_i32(tmp, var, 16); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); +} + +static void gen_neon_dup_high16(TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_andi_i32(var, var, 0xffff0000); + tcg_gen_shri_i32(tmp, var, 16); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); +} + +static inline TCGv_i32 neon_get_scalar(int size, int reg) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + if (size == MO_16) { + read_neon_element32(tmp, reg & 7, reg >> 4, MO_32); + if (reg & 8) { + gen_neon_dup_high16(tmp); + } else { + gen_neon_dup_low16(tmp); + } + } else { + read_neon_element32(tmp, reg & 15, reg >> 4, MO_32); + } + return tmp; +} + +static bool do_2scalar(DisasContext *s, arg_2scalar *a, + NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn) +{ + /* + * Two registers and a scalar: perform an operation between + * the input elements and the scalar, and then possibly + * perform an accumulation operation of that result into the + * destination. + */ + TCGv_i32 scalar, tmp; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + tmp = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vn, pass, MO_32); + opfn(tmp, tmp, scalar); + if (accfn) { + TCGv_i32 rd = tcg_temp_new_i32(); + read_neon_element32(rd, a->vd, pass, MO_32); + accfn(tmp, rd, tmp); + tcg_temp_free_i32(rd); + } + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(scalar); + return true; +} + +static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + static NeonGenTwoOpFn * const accfn[] = { + NULL, + gen_helper_neon_add_u16, + tcg_gen_add_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + static NeonGenTwoOpFn * const accfn[] = { + NULL, + gen_helper_neon_sub_u16, + tcg_gen_sub_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], accfn[a->size]); +} + +static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a, + gen_helper_gvec_3_ptr *fn) +{ + /* Two registers and a scalar, using gvec */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rn_ofs = neon_full_reg_offset(a->vn); + int rm_ofs; + int idx; + TCGv_ptr fpstatus; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!fn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* a->vm is M:Vm, which encodes both register and index */ + idx = extract32(a->vm, a->size + 2, 2); + a->vm = extract32(a->vm, 0, a->size + 2); + rm_ofs = neon_full_reg_offset(a->vm); + + fpstatus = fpstatus_ptr(a->size == 1 ? 
FPST_STD_F16 : FPST_STD); + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus, + vec_size, vec_size, idx, fn); + tcg_temp_free_ptr(fpstatus); + return true; +} + +#define DO_VMUL_F_2sc(NAME, FUNC) \ + static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \ + { \ + static gen_helper_gvec_3_ptr * const opfn[] = { \ + NULL, \ + gen_helper_##FUNC##_h, \ + gen_helper_##FUNC##_s, \ + NULL, \ + }; \ + if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_2scalar_fp_vec(s, a, opfn[a->size]); \ + } + +DO_VMUL_F_2sc(VMUL, gvec_fmul_idx) +DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx) +DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx) + +WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16) +WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32) +WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16) +WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32) + +static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_VQDMULH_16, + gen_VQDMULH_32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_VQRDMULH_16, + gen_VQRDMULH_32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a, + NeonGenThreeOpEnvFn *opfn) +{ + /* + * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn + * performs a kind of fused op-then-accumulate using a helper + * function that takes all of rd, rn and the scalar at once. + */ + TCGv_i32 scalar, rn, rd; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + if (!dc_isar_feature(aa32_rdm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + rn = tcg_temp_new_i32(); + rd = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 4 : 2); pass++) { + read_neon_element32(rn, a->vn, pass, MO_32); + read_neon_element32(rd, a->vd, pass, MO_32); + opfn(rd, cpu_env, rn, scalar, rd); + write_neon_element32(rd, a->vd, pass, MO_32); + } + tcg_temp_free_i32(rn); + tcg_temp_free_i32(rd); + tcg_temp_free_i32(scalar); + + return true; +} + +static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenThreeOpEnvFn *opfn[] = { + NULL, + gen_helper_neon_qrdmlah_s16, + gen_helper_neon_qrdmlah_s32, + NULL, + }; + return do_vqrdmlah_2sc(s, a, opfn[a->size]); +} + +static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenThreeOpEnvFn *opfn[] = { + NULL, + gen_helper_neon_qrdmlsh_s16, + gen_helper_neon_qrdmlsh_s32, + NULL, + }; + return do_vqrdmlah_2sc(s, a, opfn[a->size]); +} + +static bool do_2scalar_long(DisasContext *s, arg_2scalar *a, + NeonGenTwoOpWidenFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * Two registers and a scalar, long operations: perform an + * operation on the input elements and the scalar which produces + * a double-width result, and then possibly perform an accumulation + * operation of that result into the destination. 
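+ * For example, VMLAL.S16 Qd, Dn, Dm[x] multiplies each 16-bit element
+ * of Dn by the 16-bit scalar Dm[x] to form 32-bit products, then adds
+ * each product into the corresponding 32-bit element of Qd.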
+ */ + TCGv_i32 scalar, rn; + TCGv_i64 rn0_64, rn1_64; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + + /* Load all inputs before writing any outputs, in case of overlap */ + rn = tcg_temp_new_i32(); + read_neon_element32(rn, a->vn, 0, MO_32); + rn0_64 = tcg_temp_new_i64(); + opfn(rn0_64, rn, scalar); + + read_neon_element32(rn, a->vn, 1, MO_32); + rn1_64 = tcg_temp_new_i64(); + opfn(rn1_64, rn, scalar); + tcg_temp_free_i32(rn); + tcg_temp_free_i32(scalar); + + if (accfn) { + TCGv_i64 t64 = tcg_temp_new_i64(); + read_neon_element64(t64, a->vd, 0, MO_64); + accfn(rn0_64, t64, rn0_64); + read_neon_element64(t64, a->vd, 1, MO_64); + accfn(rn1_64, t64, rn1_64); + tcg_temp_free_i64(t64); + } + + write_neon_element64(rn0_64, a->vd, 0, MO_64); + write_neon_element64(rn1_64, a->vd, 1, MO_64); + tcg_temp_free_i64(rn0_64); + tcg_temp_free_i64(rn1_64); + return true; +} + +static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_helper_neon_mull_s16, + gen_mull_s32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_helper_neon_mull_u16, + gen_mull_u32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +#define DO_VMLAL_2SC(INSN, MULL, ACC) \ + static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \ + { \ + static NeonGenTwoOpWidenFn * const opfn[] = { \ + NULL, \ + gen_helper_neon_##MULL##16, \ + gen_##MULL##32, \ + NULL, \ + }; \ + static NeonGenTwo64OpFn * const accfn[] = { \ + NULL, \ + gen_helper_neon_##ACC##l_u32, \ + tcg_gen_##ACC##_i64, \ + NULL, \ + }; \ + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \ + } + +DO_VMLAL_2SC(VMLAL_S, mull_s, add) +DO_VMLAL_2SC(VMLAL_U, mull_u, add) +DO_VMLAL_2SC(VMLSL_S, mull_s, sub) +DO_VMLAL_2SC(VMLSL_U, mull_u, sub) + +static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLAL_acc_16, + gen_VQDMLAL_acc_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLSL_acc_16, + gen_VQDMLSL_acc_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VEXT(DisasContext *s, arg_VEXT *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
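+ * Registers D16-D31 set bit 4 of their vd/vn/vm field, hence the
+ * 0x10 test; aa32_simd_r32 is the ID-register check for the full
+ * set of 32 SIMD registers.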
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (a->imm > 7 && !a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!a->q) { + /* Extract 64 bits from <Vm:Vn> */ + TCGv_i64 left, right, dest; + + left = tcg_temp_new_i64(); + right = tcg_temp_new_i64(); + dest = tcg_temp_new_i64(); + + read_neon_element64(right, a->vn, 0, MO_64); + read_neon_element64(left, a->vm, 0, MO_64); + tcg_gen_extract2_i64(dest, right, left, a->imm * 8); + write_neon_element64(dest, a->vd, 0, MO_64); + + tcg_temp_free_i64(left); + tcg_temp_free_i64(right); + tcg_temp_free_i64(dest); + } else { + /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */ + TCGv_i64 left, middle, right, destleft, destright; + + left = tcg_temp_new_i64(); + middle = tcg_temp_new_i64(); + right = tcg_temp_new_i64(); + destleft = tcg_temp_new_i64(); + destright = tcg_temp_new_i64(); + + if (a->imm < 8) { + read_neon_element64(right, a->vn, 0, MO_64); + read_neon_element64(middle, a->vn, 1, MO_64); + tcg_gen_extract2_i64(destright, right, middle, a->imm * 8); + read_neon_element64(left, a->vm, 0, MO_64); + tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8); + } else { + read_neon_element64(right, a->vn, 1, MO_64); + read_neon_element64(middle, a->vm, 0, MO_64); + tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8); + read_neon_element64(left, a->vm, 1, MO_64); + tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8); + } + + write_neon_element64(destright, a->vd, 0, MO_64); + write_neon_element64(destleft, a->vd, 1, MO_64); + + tcg_temp_free_i64(destright); + tcg_temp_free_i64(destleft); + tcg_temp_free_i64(right); + tcg_temp_free_i64(middle); + tcg_temp_free_i64(left); + } + return true; +} + +static bool trans_VTBL(DisasContext *s, arg_VTBL *a) +{ + TCGv_i64 val, def; + TCGv_i32 desc; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn + a->len + 1) > 32) { + /* + * This is UNPREDICTABLE; we choose to UNDEF to avoid the + * helper function running off the end of the register file. + */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + desc = tcg_constant_i32((a->vn << 2) | a->len); + def = tcg_temp_new_i64(); + if (a->op) { + read_neon_element64(def, a->vd, 0, MO_64); + } else { + tcg_gen_movi_i64(def, 0); + } + val = tcg_temp_new_i64(); + read_neon_element64(val, a->vm, 0, MO_64); + + gen_helper_neon_tbl(val, cpu_env, desc, val, def); + write_neon_element64(val, a->vd, 0, MO_64); + + tcg_temp_free_i64(def); + tcg_temp_free_i64(val); + return true; +} + +static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd), + neon_element_offset(a->vm, a->index, a->size), + a->q ? 16 : 8, a->q ? 
16 : 8); + return true; +} + +static bool trans_VREV64(DisasContext *s, arg_VREV64 *a) +{ + int pass, half; + TCGv_i32 tmp[2]; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp[0] = tcg_temp_new_i32(); + tmp[1] = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 2 : 1); pass++) { + for (half = 0; half < 2; half++) { + read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32); + switch (a->size) { + case 0: + tcg_gen_bswap32_i32(tmp[half], tmp[half]); + break; + case 1: + gen_swap_half(tmp[half], tmp[half]); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + } + write_neon_element32(tmp[1], a->vd, pass * 2, MO_32); + write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32); + } + + tcg_temp_free_i32(tmp[0]); + tcg_temp_free_i32(tmp[1]); + return true; +} + +static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a, + NeonGenWidenFn *widenfn, + NeonGenTwo64OpFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * Pairwise long operations: widen both halves of the pair, + * combine the pairs with the opfn, and then possibly accumulate + * into the destination with the accfn. + */ + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + for (pass = 0; pass < a->q + 1; pass++) { + TCGv_i32 tmp; + TCGv_i64 rm0_64, rm1_64, rd_64; + + rm0_64 = tcg_temp_new_i64(); + rm1_64 = tcg_temp_new_i64(); + rd_64 = tcg_temp_new_i64(); + + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, pass * 2, MO_32); + widenfn(rm0_64, tmp); + read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32); + widenfn(rm1_64, tmp); + tcg_temp_free_i32(tmp); + + opfn(rd_64, rm0_64, rm1_64); + tcg_temp_free_i64(rm0_64); + tcg_temp_free_i64(rm1_64); + + if (accfn) { + TCGv_i64 tmp64 = tcg_temp_new_i64(); + read_neon_element64(tmp64, a->vd, pass, MO_64); + accfn(rd_64, tmp64, rd_64); + tcg_temp_free_i64(tmp64); + } + write_neon_element64(rd_64, a->vd, pass, MO_64); + tcg_temp_free_i64(rd_64); + } + return true; +} + +static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + 
gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +typedef void ZipFn(TCGv_ptr, TCGv_ptr); + +static bool do_zip_uzp(DisasContext *s, arg_2misc *a, + ZipFn *fn) +{ + TCGv_ptr pd, pm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!fn) { + /* Bad size or size/q combination */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + pd = vfp_reg_ptr(true, a->vd); + pm = vfp_reg_ptr(true, a->vm); + fn(pd, pm); + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(pm); + return true; +} + +static bool trans_VUZP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_unzip8, + gen_helper_neon_unzip16, + NULL, + NULL, + }, { + gen_helper_neon_qunzip8, + gen_helper_neon_qunzip16, + gen_helper_neon_qunzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool trans_VZIP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_zip8, + gen_helper_neon_zip16, + NULL, + NULL, + }, { + gen_helper_neon_qzip8, + gen_helper_neon_qzip16, + gen_helper_neon_qzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool do_vmovn(DisasContext *s, arg_2misc *a, + NeonGenNarrowEnvFn *narrowfn) +{ + TCGv_i64 rm; + TCGv_i32 rd0, rd1; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!narrowfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd0 = tcg_temp_new_i32(); + rd1 = tcg_temp_new_i32(); + + read_neon_element64(rm, a->vm, 0, MO_64); + narrowfn(rd0, cpu_env, rm); + read_neon_element64(rm, a->vm, 1, MO_64); + narrowfn(rd1, cpu_env, rm); + write_neon_element32(rd0, a->vd, 0, MO_32); + write_neon_element32(rd1, a->vd, 1, MO_32); + tcg_temp_free_i32(rd0); + tcg_temp_free_i32(rd1); + tcg_temp_free_i64(rm); + return true; +} + +#define DO_VMOVN(INSN, FUNC) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + static NeonGenNarrowEnvFn * const narrowfn[] = { \ + FUNC##8, \ + FUNC##16, \ + FUNC##32, \ + NULL, \ + }; \ + return do_vmovn(s, a, narrowfn[a->size]); \ + } + +DO_VMOVN(VMOVN, gen_neon_narrow_u) +DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat) +DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s) +DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u) + +static bool trans_VSHLL(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 rm0, rm1; + TCGv_i64 rd; + static NeonGenWidenFn * const widenfns[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + NeonGenWidenFn *widenfn = widenfns[a->size]; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rd = tcg_temp_new_i64(); + rm0 = tcg_temp_new_i32(); + rm1 = tcg_temp_new_i32(); + + read_neon_element32(rm0, a->vm, 0, MO_32); + read_neon_element32(rm1, a->vm, 1, MO_32); + + widenfn(rd, rm0); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + write_neon_element64(rd, a->vd, 0, MO_64); + widenfn(rd, rm1); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + write_neon_element64(rd, a->vd, 1, MO_64); + + tcg_temp_free_i64(rd); + tcg_temp_free_i32(rm0); + tcg_temp_free_i32(rm1); + return true; +} + +static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + TCGv_i32 dst0, dst1; + + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + tmp = tcg_temp_new_i64(); + dst0 = tcg_temp_new_i32(); + dst1 = tcg_temp_new_i32(); + + read_neon_element64(tmp, a->vm, 0, MO_64); + gen_helper_bfcvt_pair(dst0, tmp, fpst); + + read_neon_element64(tmp, a->vm, 1, MO_64); + gen_helper_bfcvt_pair(dst1, tmp, fpst); + + write_neon_element32(dst0, a->vd, 0, MO_32); + write_neon_element32(dst1, a->vd, 1, MO_32); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(dst0); + tcg_temp_free_i32(dst1); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + ahp = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 0, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp2 = tcg_temp_new_i32(); + read_neon_element32(tmp2, a->vm, 1, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp); + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp2, tmp2, tmp); + read_neon_element32(tmp, a->vm, 2, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp3 = tcg_temp_new_i32(); + read_neon_element32(tmp3, a->vm, 3, MO_32); + write_neon_element32(tmp2, a->vd, 0, MO_32); + tcg_temp_free_i32(tmp2); + gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp); + tcg_gen_shli_i32(tmp3, tmp3, 16); + tcg_gen_or_i32(tmp3, tmp3, tmp); + write_neon_element32(tmp3, a->vd, 1, MO_32); + tcg_temp_free_i32(tmp3); + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + ahp = get_ahp_flag(); + tmp3 = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 0, MO_32); + read_neon_element32(tmp2, a->vm, 1, MO_32); + tcg_gen_ext16u_i32(tmp3, tmp); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + write_neon_element32(tmp3, a->vd, 0, MO_32); + tcg_gen_shri_i32(tmp, tmp, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp); + write_neon_element32(tmp, a->vd, 1, MO_32); + tcg_temp_free_i32(tmp); + tcg_gen_ext16u_i32(tmp3, tmp2); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + write_neon_element32(tmp3, a->vd, 2, MO_32); + tcg_temp_free_i32(tmp3); + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp); + write_neon_element32(tmp2, a->vd, 3, MO_32); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn) +{ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size == 3) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size); + + return true; +} + +#define DO_2MISC_VEC(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_2misc_vec(s, a, FN); \ + } + +DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg) +DO_2MISC_VEC(VABS, tcg_gen_gvec_abs) +DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0) +DO_2MISC_VEC(VCGT0, gen_gvec_cgt0) +DO_2MISC_VEC(VCLE0, gen_gvec_cle0) +DO_2MISC_VEC(VCGE0, gen_gvec_cge0) +DO_2MISC_VEC(VCLT0, gen_gvec_clt0) + +static bool trans_VMVN(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc_vec(s, a, tcg_gen_gvec_not); +} + +#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \ + DATA, FUNC); \ + } + +#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \ + } + +WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0) +WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1) +WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0) +WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1) +WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0) +WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0) +WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0) + +#define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_2M_CRYPTO(AESE, aa32_aes, 0) +DO_2M_CRYPTO(AESD, aa32_aes, 0) +DO_2M_CRYPTO(AESMC, aa32_aes, 0) +DO_2M_CRYPTO(AESIMC, aa32_aes, 0) +DO_2M_CRYPTO(SHA1H, aa32_sha1, 2) +DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2) +DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2) + +static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn) +{ + TCGv_i32 tmp; + int pass; + + /* Handle a 2-reg-misc operation by iterating 32 bits at a time */ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!fn) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + fn(tmp, tmp); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + + return true; +} + +static bool trans_VREV32(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + tcg_gen_bswap32_i32, + gen_swap_half, + NULL, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VREV16(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_rev16); +} + +static bool trans_VCLS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_cls_s8, + gen_helper_neon_cls_s16, + gen_helper_neon_cls_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm) +{ + tcg_gen_clzi_i32(rd, rm, 32); +} + +static bool trans_VCLZ(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_clz_u8, + gen_helper_neon_clz_u16, + do_VCLZ_32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VCNT(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_helper_neon_cnt_u8); +} + +static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs, + vece == MO_16 ? 0x7fff : 0x7fffffff, + oprsz, maxsz); +} + +static bool trans_VABS_F(DisasContext *s, arg_2misc *a) +{ + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } else if (a->size != MO_32) { + return false; + } + return do_2misc_vec(s, a, gen_VABS_F); +} + +static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs, + vece == MO_16 ? 
0x8000 : 0x80000000, + oprsz, maxsz); +} + +static bool trans_VNEG_F(DisasContext *s, arg_2misc *a) +{ + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } else if (a->size != MO_32) { + return false; + } + return do_2misc_vec(s, a, gen_VNEG_F); +} + +static bool trans_VRECPE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_recpe_u32); +} + +static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_rsqrte_u32); +} + +#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \ + { \ + FUNC(d, cpu_env, m); \ + } + +WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8) +WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16) +WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32) +WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8) +WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16) +WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32) + +static bool trans_VQABS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQABS_s8, + gen_VQABS_s16, + gen_VQABS_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VQNEG(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQNEG_s8, + gen_VQNEG_s16, + gen_VQNEG_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \ + static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static gen_helper_gvec_2_ptr * const fns[4] = { \ + NULL, HFUNC, SFUNC, NULL, \ + }; \ + TCGv_ptr fpst; \ + fpst = fpstatus_ptr(vece == MO_16 ? 
FPST_STD_F16 : FPST_STD); \ + tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \ + fns[vece]); \ + tcg_temp_free_ptr(fpst); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + } else if (a->size != MO_32) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s) +DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s) +DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s) +DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s) +DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s) +DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s) +DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s) +DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos) +DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos) +DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs) +DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs) + +DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s) + +static bool trans_VRINTX(DisasContext *s, arg_2misc *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + return trans_VRINTX_impl(s, a); +} + +#define DO_VEC_RMODE(INSN, RMODE, OP) \ + static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static gen_helper_gvec_2_ptr * const fns[4] = { \ + NULL, \ + gen_helper_gvec_##OP##h, \ + gen_helper_gvec_##OP##s, \ + NULL, \ + }; \ + TCGv_ptr fpst; \ + fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \ + tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \ + arm_rmode_to_sf(RMODE), fns[vece]); \ + tcg_temp_free_ptr(fpst); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \ + return false; \ + } \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + } else if (a->size != MO_32) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u) +DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s) +DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u) +DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s) +DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u) +DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s) +DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u) +DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s) + +DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_) +DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_) +DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_) +DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_) +DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_) + +static bool trans_VSWP(DisasContext *s, arg_2misc *a) +{ + TCGv_i64 rm, rd; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 0) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd = tcg_temp_new_i64(); + for (pass = 0; pass < (a->q ? 2 : 1); pass++) { + read_neon_element64(rm, a->vm, pass, MO_64); + read_neon_element64(rd, a->vd, pass, MO_64); + write_neon_element64(rm, a->vd, pass, MO_64); + write_neon_element64(rd, a->vm, pass, MO_64); + } + tcg_temp_free_i64(rm); + tcg_temp_free_i64(rd); + + return true; +} +static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 8); + tcg_gen_andi_i32(rd, rd, 0xff00ff00); + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_shri_i32(t1, t1, 8); + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 16); + tcg_gen_andi_i32(tmp, t1, 0xffff); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shri_i32(t1, t1, 16); + tcg_gen_andi_i32(tmp, t0, 0xffff0000); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static bool trans_VTRN(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 tmp, tmp2; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + if (a->size == MO_32) { + for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) { + read_neon_element32(tmp, a->vm, pass, MO_32); + read_neon_element32(tmp2, a->vd, pass + 1, MO_32); + write_neon_element32(tmp2, a->vm, pass, MO_32); + write_neon_element32(tmp, a->vd, pass + 1, MO_32); + } + } else { + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + read_neon_element32(tmp2, a->vd, pass, MO_32); + if (a->size == MO_8) { + gen_neon_trn_u8(tmp, tmp2); + } else { + gen_neon_trn_u16(tmp, tmp2); + } + write_neon_element32(tmp2, a->vm, pass, MO_32); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + return true; +} + +static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_smmla_b); +} + +static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_ummla_b); +} + +static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_usmmla_b); +} + +static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfmmla); +} + +static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD, + gen_helper_gvec_bfmlal); +} + +static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm, + (a->index << 1) | a->q, FPST_STD, + gen_helper_gvec_bfmlal_idx); +} diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c new file mode 100644 index 0000000..7b87a9d --- /dev/null +++ b/target/arm/tcg/translate-sme.c @@ -0,0 +1,373 @@ +/* + * AArch64 SME translation + * + * Copyright (c) 2022 Linaro, Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "tcg/tcg-gvec-desc.h" +#include "translate.h" +#include "exec/helper-gen.h" +#include "translate-a64.h" +#include "fpu/softfloat.h" + + +/* + * Include the generated decoder. + */ + +#include "decode-sme.c.inc" + + +/* + * Resolve tile.size[index] to a host pointer, where tile and index + * are always decoded together, dependent on the element size. + */ +static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, + int tile_index, bool vertical) +{ + int tile = tile_index >> (4 - esz); + int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz); + int pos, len, offset; + TCGv_i32 tmp; + TCGv_ptr addr; + + /* Compute the final index, which is Rs+imm. 
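+ * The architected slice selector is (Rs + imm) modulo the number of
+ * elements in a slice; the modulo is applied below as a power-of-two
+ * bit extraction.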
*/ + tmp = tcg_temp_new_i32(); + tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs)); + tcg_gen_addi_i32(tmp, tmp, index); + + /* Prepare a power-of-two modulo via extraction of @len bits. */ + len = ctz32(streaming_vec_reg_size(s)) - esz; + + if (vertical) { + /* + * Compute the byte offset of the index within the tile: + * (index % (svl / size)) * size + * = (index % (svl >> esz)) << esz + * Perform the power-of-two modulo via extraction of the low @len bits. + * Perform the multiply by shifting left by @pos bits. + * Perform these operations simultaneously via deposit into zero. + */ + pos = esz; + tcg_gen_deposit_z_i32(tmp, tmp, pos, len); + + /* + * For big-endian, adjust the indexed column byte offset within + * the uint64_t host words that make up env->zarray[]. + */ + if (HOST_BIG_ENDIAN && esz < MO_64) { + tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz)); + } + } else { + /* + * Compute the byte offset of the index within the tile: + * (index % (svl / size)) * (size * sizeof(row)) + * = (index % (svl >> esz)) << (esz + log2(sizeof(row))) + */ + pos = esz + ctz32(sizeof(ARMVectorReg)); + tcg_gen_deposit_z_i32(tmp, tmp, pos, len); + + /* Row slices are always aligned and need no endian adjustment. */ + } + + /* The tile byte offset within env->zarray is the row. */ + offset = tile * sizeof(ARMVectorReg); + + /* Include the byte offset of zarray to make this relative to env. */ + offset += offsetof(CPUARMState, zarray); + tcg_gen_addi_i32(tmp, tmp, offset); + + /* Add the byte offset to env to produce the final pointer. */ + addr = tcg_temp_new_ptr(); + tcg_gen_ext_i32_ptr(addr, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_add_ptr(addr, addr, cpu_env); + + return addr; +} + +static bool trans_ZERO(DisasContext *s, arg_ZERO *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_za_enabled_check(s)) { + gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm), + tcg_constant_i32(streaming_vec_reg_size(s))); + } + return true; +} + +static bool trans_MOVA(DisasContext *s, arg_MOVA *a) +{ + static gen_helper_gvec_4 * const h_fns[5] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d, + gen_helper_sve_sel_zpzz_q + }; + static gen_helper_gvec_3 * const cz_fns[5] = { + gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h, + gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d, + gen_helper_sme_mova_cz_q, + }; + static gen_helper_gvec_3 * const zc_fns[5] = { + gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h, + gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d, + gen_helper_sme_mova_zc_q, + }; + + TCGv_ptr t_za, t_zr, t_pg; + TCGv_i32 t_desc; + int svl; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sme_smza_enabled_check(s)) { + return true; + } + + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_zr = vec_full_reg_ptr(s, a->zr); + t_pg = pred_full_reg_ptr(s, a->pg); + + svl = streaming_vec_reg_size(s); + t_desc = tcg_constant_i32(simd_desc(svl, svl, 0)); + + if (a->v) { + /* Vertical slice -- use sme mova helpers. */ + if (a->to_vec) { + zc_fns[a->esz](t_zr, t_za, t_pg, t_desc); + } else { + cz_fns[a->esz](t_za, t_zr, t_pg, t_desc); + } + } else { + /* Horizontal slice -- reuse sve sel helpers. 
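+ * sel computes dest = pg ? n : m, so passing the destination as the
+ * m operand leaves its inactive elements unchanged.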
*/ + if (a->to_vec) { + h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc); + } else { + h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc); + } + } + + tcg_temp_free_ptr(t_za); + tcg_temp_free_ptr(t_zr); + tcg_temp_free_ptr(t_pg); + + return true; +} + +static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) +{ + typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32); + + /* + * Indexed by [esz][be][v][mte][st], which is (except for load/store) + * also the order in which the elements appear in the function names, + * and so how we must concatenate the pieces. + */ + +#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F } +#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) } +#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) } +#define FN_END(L, B) { FN_HV(L), FN_HV(B) } + + static GenLdSt1 * const fns[5][2][2][2][2] = { + FN_END(b, b), + FN_END(h_le, h_be), + FN_END(s_le, s_be), + FN_END(d_le, d_be), + FN_END(q_le, q_be), + }; + +#undef FN_LS +#undef FN_MTE +#undef FN_HV +#undef FN_END + + TCGv_ptr t_za, t_pg; + TCGv_i64 addr; + int svl, desc = 0; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sme_smza_enabled_check(s)) { + return true; + } + + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_pg = pred_full_reg_ptr(s, a->pg); + addr = tcg_temp_new_i64(); + + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + + if (mte) { + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } else { + addr = clean_data_tbi(s, addr); + } + svl = streaming_vec_reg_size(s); + desc = simd_desc(svl, svl, desc); + + fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr, + tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_za); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i64(addr); + return true; +} + +typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int); + +static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn) +{ + int svl = streaming_vec_reg_size(s); + int imm = a->imm; + TCGv_ptr base; + + if (!sme_za_enabled_check(s)) { + return true; + } + + /* ZA[n] equates to ZA0H.B[n]. */ + base = get_tile_rowcol(s, MO_8, a->rv, imm, false); + + fn(s, base, 0, svl, a->rn, imm * svl); + + tcg_temp_free_ptr(base); + return true; +} + +TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr) +TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str) + +static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz, + gen_helper_gvec_4 *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, 0); + TCGv_ptr za, zn, pn, pm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. 
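+ * get_tile_rowcol() expects a register index plus immediate; rs = 31
+ * names XZR, which always reads as zero, so the sum reduces to zad.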
*/ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + + fn(za, zn, pn, pm, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + return true; +} + +TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s) +TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s) +TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d) +TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d) + +static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz, + gen_helper_gvec_5 *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->sub); + TCGv_ptr za, zn, zm, pn, pm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. */ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + zm = vec_full_reg_ptr(s, a->zm); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + + fn(za, zn, zm, pn, pm, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + return true; +} + +static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, + gen_helper_gvec_5_ptr *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->sub); + TCGv_ptr za, zn, zm, pn, pm, fpst; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. */ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + zm = vec_full_reg_ptr(s, a->zm); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + fpst = fpstatus_ptr(FPST_FPCR); + + fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + tcg_temp_free_ptr(fpst); + return true; +} + +TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h) +TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s) +TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d) + +/* TODO: FEAT_EBF16 */ +TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa) + +TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) +TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) +TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s) +TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s) + +TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d) +TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d) +TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d) +TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c new file mode 100644 index 0000000..621a2ab --- /dev/null +++ b/target/arm/tcg/translate-sve.c @@ -0,0 +1,7583 @@ +/* + * AArch64 SVE translation + * + * Copyright (c) 2018 Linaro, Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "tcg/tcg-gvec-desc.h" +#include "qemu/log.h" +#include "arm_ldst.h" +#include "translate.h" +#include "internals.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "translate-a64.h" +#include "fpu/softfloat.h" + + +typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, + TCGv_i64, uint32_t, uint32_t); + +typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + +typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i64, TCGv_i32); + +/* + * Helpers for extracting complex instruction fields. + */ + +/* See e.g. ASR (immediate, predicated). + * Returns -1 for unallocated encoding; diagnose later. + */ +static int tszimm_esz(DisasContext *s, int x) +{ + x >>= 3; /* discard imm3 */ + return 31 - clz32(x); +} + +static int tszimm_shr(DisasContext *s, int x) +{ + return (16 << tszimm_esz(s, x)) - x; +} + +/* See e.g. LSL (immediate, predicated). */ +static int tszimm_shl(DisasContext *s, int x) +{ + return x - (8 << tszimm_esz(s, x)); +} + +/* The SH bit is in bit 8. Extract the low 8 and shift. */ +static inline int expand_imm_sh8s(DisasContext *s, int x) +{ + return (int8_t)x << (x & 0x100 ? 8 : 0); +} + +static inline int expand_imm_sh8u(DisasContext *s, int x) +{ + return (uint8_t)x << (x & 0x100 ? 8 : 0); +} + +/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype) + * with unsigned data. C.f. SVE Memory Contiguous Load Group. + */ +static inline int msz_dtype(DisasContext *s, int msz) +{ + static const uint8_t dtype[4] = { 0, 5, 10, 15 }; + return dtype[msz]; +} + +/* + * Include the generated decoder. + */ + +#include "decode-sve.c.inc" + +/* + * Implement all of the translator functions referenced by the decoder. + */ + +/* Invoke an out-of-line helper on 2 Zregs. 
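+ * "Out of line" means a call to a C helper at execution time rather
+ * than inline TCG ops; @data reaches the helper packed into the
+ * simd_desc descriptor.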
*/ +static bool gen_gvec_ool_zz(DisasContext *s, gen_helper_gvec_2 *fn, + int rd, int rn, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_fpst_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, + int rd, int rn, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, + arg_rr_esz *a, int data) +{ + return gen_gvec_fpst_zz(s, fn, a->rd, a->rn, data, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke an out-of-line helper on 3 Zregs. */ +static bool gen_gvec_ool_zzz(DisasContext *s, gen_helper_gvec_3 *fn, + int rd, int rn, int rm, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zzz(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rrr_esz *a, int data) +{ + return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, data); +} + +/* Invoke an out-of-line helper on 3 Zregs, plus float_status. */ +static bool gen_gvec_fpst_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + int rd, int rn, int rm, + int data, ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + status, vsz, vsz, data, fn); + + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + arg_rrr_esz *a, int data) +{ + return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke an out-of-line helper on 4 Zregs. */ +static bool gen_gvec_ool_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, + int rd, int rn, int rm, int ra, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rrrr_esz *a, int data) +{ + return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data); +} + +static bool gen_gvec_ool_arg_zzxz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rrxr_esz *a) +{ + return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index); +} + +/* Invoke an out-of-line helper on 4 Zregs, plus a pointer. 
*/ +static bool gen_gvec_ptr_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int ra, + int data, TCGv_ptr ptr) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + ptr, vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_fpst_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int ra, + int data, ARMFPStatusFlavour flavour) +{ + TCGv_ptr status = fpstatus_ptr(flavour); + bool ret = gen_gvec_ptr_zzzz(s, fn, rd, rn, rm, ra, data, status); + tcg_temp_free_ptr(status); + return ret; +} + +/* Invoke an out-of-line helper on 4 Zregs, 1 Preg, plus fpst. */ +static bool gen_gvec_fpst_zzzzp(DisasContext *s, gen_helper_gvec_5_ptr *fn, + int rd, int rn, int rm, int ra, int pg, + int data, ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_5_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + + tcg_temp_free_ptr(status); + } + return true; +} + +/* Invoke an out-of-line helper on 2 Zregs and a predicate. */ +static bool gen_gvec_ool_zzp(DisasContext *s, gen_helper_gvec_3 *fn, + int rd, int rn, int pg, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zpz(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rpr_esz *a, int data) +{ + return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, data); +} + +static bool gen_gvec_ool_arg_zpzi(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rpri_esz *a) +{ + return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, a->imm); +} + +static bool gen_gvec_fpst_zzp(DisasContext *s, gen_helper_gvec_3_ptr *fn, + int rd, int rn, int pg, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zpz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + arg_rpr_esz *a, int data, + ARMFPStatusFlavour flavour) +{ + return gen_gvec_fpst_zzp(s, fn, a->rd, a->rn, a->pg, data, flavour); +} + +/* Invoke an out-of-line helper on 3 Zregs and a predicate. 
*/ +static bool gen_gvec_ool_zzzp(DisasContext *s, gen_helper_gvec_4 *fn, + int rd, int rn, int rm, int pg, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zpzz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rprr_esz *a, int data) +{ + return gen_gvec_ool_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, data); +} + +/* Invoke an out-of-line helper on 3 Zregs and a predicate. */ +static bool gen_gvec_fpst_zzzp(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int pg, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zpzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + arg_rprr_esz *a) +{ + return gen_gvec_fpst_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, 0, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke a vector expander on two Zregs and an immediate. */ +static bool gen_gvec_fn_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, + int esz, int rd, int rn, uint64_t imm) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(esz, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), imm, vsz, vsz); + } + return true; +} + +static bool gen_gvec_fn_arg_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, + arg_rri_esz *a) +{ + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + return gen_gvec_fn_zzi(s, gvec_fn, a->esz, a->rd, a->rn, a->imm); +} + +/* Invoke a vector expander on three Zregs. */ +static bool gen_gvec_fn_zzz(DisasContext *s, GVecGen3Fn *gvec_fn, + int esz, int rd, int rn, int rm) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(esz, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), vsz, vsz); + } + return true; +} + +static bool gen_gvec_fn_arg_zzz(DisasContext *s, GVecGen3Fn *fn, + arg_rrr_esz *a) +{ + return gen_gvec_fn_zzz(s, fn, a->esz, a->rd, a->rn, a->rm); +} + +/* Invoke a vector expander on four Zregs. */ +static bool gen_gvec_fn_arg_zzzz(DisasContext *s, GVecGen4Fn *gvec_fn, + arg_rrrr_esz *a) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vec_full_reg_offset(s, a->ra), vsz, vsz); + } + return true; +} + +/* Invoke a vector move on two Zregs. */ +static bool do_mov_z(DisasContext *s, int rd, int rn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_mov(MO_8, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), vsz, vsz); + } + return true; +} + +/* Initialize a Zreg with replications of a 64-bit immediate. 
*/ +static void do_dupi_z(DisasContext *s, int rd, uint64_t word) +{ + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word); +} + +/* Invoke a vector expander on three Pregs. */ +static bool gen_gvec_fn_ppp(DisasContext *s, GVecGen3Fn *gvec_fn, + int rd, int rn, int rm) +{ + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + gvec_fn(MO_64, pred_full_reg_offset(s, rd), + pred_full_reg_offset(s, rn), + pred_full_reg_offset(s, rm), psz, psz); + } + return true; +} + +/* Invoke a vector move on two Pregs. */ +static bool do_mov_p(DisasContext *s, int rd, int rn) +{ + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + tcg_gen_gvec_mov(MO_8, pred_full_reg_offset(s, rd), + pred_full_reg_offset(s, rn), psz, psz); + } + return true; +} + +/* Set the cpu flags as per a return from an SVE helper. */ +static void do_pred_flags(TCGv_i32 t) +{ + tcg_gen_mov_i32(cpu_NF, t); + tcg_gen_andi_i32(cpu_ZF, t, 2); + tcg_gen_andi_i32(cpu_CF, t, 1); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* Subroutines computing the ARM PredTest pseudofunction. */ +static void do_predtest1(TCGv_i64 d, TCGv_i64 g) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + gen_helper_sve_predtest1(t, d, g); + do_pred_flags(t); + tcg_temp_free_i32(t); +} + +static void do_predtest(DisasContext *s, int dofs, int gofs, int words) +{ + TCGv_ptr dptr = tcg_temp_new_ptr(); + TCGv_ptr gptr = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_addi_ptr(dptr, cpu_env, dofs); + tcg_gen_addi_ptr(gptr, cpu_env, gofs); + + gen_helper_sve_predtest(t, dptr, gptr, tcg_constant_i32(words)); + tcg_temp_free_ptr(dptr); + tcg_temp_free_ptr(gptr); + + do_pred_flags(t); + tcg_temp_free_i32(t); +} + +/* For each element size, the bits within a predicate word that are active.
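 *
 * Editor's derivation, not part of this patch: one predicate bit governs
 * one byte of vector, so for elements of 1 << esz bytes only every
 * (1 << esz)-th bit is significant.  The table below can be reproduced
 * by the following self-contained function (pred_mask_ref is our name):
 */

#include <stdint.h>

static uint64_t pred_mask_ref(unsigned esz)   /* esz in [0, 4] */
{
    uint64_t m = 0;

    for (unsigned i = 0; i < 64; i += 1u << esz) {
        m |= 1ull << i;            /* low bit of each element's slot */
    }
    return m;
}

/* The precomputed table: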
*/ +const uint64_t pred_esz_masks[5] = { + 0xffffffffffffffffull, 0x5555555555555555ull, + 0x1111111111111111ull, 0x0101010101010101ull, + 0x0001000100010001ull, +}; + +static bool trans_INVALID(DisasContext *s, arg_INVALID *a) +{ + unallocated_encoding(s); + return true; +} + +/* + *** SVE Logical - Unpredicated Group + */ + +TRANS_FEAT(AND_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_and, a) +TRANS_FEAT(ORR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_or, a) +TRANS_FEAT(EOR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_xor, a) +TRANS_FEAT(BIC_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_andc, a) + +static void gen_xar8_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + uint64_t mask = dup_const(MO_8, 0xff >> sh); + + tcg_gen_xor_i64(t, n, m); + tcg_gen_shri_i64(d, t, sh); + tcg_gen_shli_i64(t, t, 8 - sh); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_andi_i64(t, t, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_xar16_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + uint64_t mask = dup_const(MO_16, 0xffff >> sh); + + tcg_gen_xor_i64(t, n, m); + tcg_gen_shri_i64(d, t, sh); + tcg_gen_shli_i64(t, t, 16 - sh); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_andi_i64(t, t, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_xar_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, int32_t sh) +{ + tcg_gen_xor_i32(d, n, m); + tcg_gen_rotri_i32(d, d, sh); +} + +static void gen_xar_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + tcg_gen_xor_i64(d, n, m); + tcg_gen_rotri_i64(d, d, sh); +} + +static void gen_xar_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, int64_t sh) +{ + tcg_gen_xor_vec(vece, d, n, m); + tcg_gen_rotri_vec(vece, d, d, sh); +} + +void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, int64_t shift, + uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop[] = { INDEX_op_rotli_vec, 0 }; + static const GVecGen3i ops[4] = { + { .fni8 = gen_xar8_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fni8 = gen_xar16_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_xar_i32, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_xar_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_gvec_xar_d, + .opt_opc = vecop, + .vece = MO_64 } + }; + int esize = 8 << vece; + + /* The SVE2 range is 1 .. esize; the AdvSIMD range is 0 .. esize-1. */ + tcg_debug_assert(shift >= 0); + tcg_debug_assert(shift <= esize); + shift &= esize - 1; + + if (shift == 0) { + /* xar with no rotate devolves to xor. 
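 *
 * Editor's note: per lane, XAR computes (n ^ m) rotated right by sh.
 * For a single 64-bit lane, in plain C:
 *
 *     uint64_t t = n ^ m;
 *     d = (t >> sh) | (t << (64 - sh));    // sh in [1, 63]
 *
 * The 8- and 16-bit expansions above build the same per-lane rotate out
 * of two shifts and a mask, since a plain i64 shift would bleed bits
 * across lane boundaries.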
*/ + tcg_gen_gvec_xor(vece, rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_3i(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, + shift, &ops[vece]); + } +} + +static bool trans_XAR(DisasContext *s, arg_rrri_esz *a) +{ + if (a->esz < 0 || !dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gen_gvec_xar(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), a->imm, vsz, vsz); + } + return true; +} + +static void gen_eor3_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_xor_i64(d, n, m); + tcg_gen_xor_i64(d, d, k); +} + +static void gen_eor3_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_xor_vec(vece, d, n, m); + tcg_gen_xor_vec(vece, d, d, k); +} + +static void gen_eor3(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_eor3_i64, + .fniv = gen_eor3_vec, + .fno = gen_helper_sve2_eor3, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(EOR3, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_eor3, a) + +static void gen_bcax_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_andc_i64(d, m, k); + tcg_gen_xor_i64(d, d, n); +} + +static void gen_bcax_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_andc_vec(vece, d, m, k); + tcg_gen_xor_vec(vece, d, d, n); +} + +static void gen_bcax(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bcax_i64, + .fniv = gen_bcax_vec, + .fno = gen_helper_sve2_bcax, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BCAX, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bcax, a) + +static void gen_bsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + /* BSL differs from the generic bitsel in argument ordering. 
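 * tcg_gen_gvec_bitsel() takes the selector first, computing
 * d = (n & sel) | (m & ~sel); for BSL the selector is the 'a' operand,
 * hence the (d, a, n, m) ordering below.  (Editor's note.)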
*/ + tcg_gen_gvec_bitsel(vece, d, a, n, m, oprsz, maxsz); +} + +TRANS_FEAT(BSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl, a) + +static void gen_bsl1n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_andc_i64(n, k, n); + tcg_gen_andc_i64(m, m, k); + tcg_gen_or_i64(d, n, m); +} + +static void gen_bsl1n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + if (TCG_TARGET_HAS_bitsel_vec) { + tcg_gen_not_vec(vece, n, n); + tcg_gen_bitsel_vec(vece, d, k, n, m); + } else { + tcg_gen_andc_vec(vece, n, k, n); + tcg_gen_andc_vec(vece, m, m, k); + tcg_gen_or_vec(vece, d, n, m); + } +} + +static void gen_bsl1n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bsl1n_i64, + .fniv = gen_bsl1n_vec, + .fno = gen_helper_sve2_bsl1n, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BSL1N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl1n, a) + +static void gen_bsl2n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + /* + * Z[dn] = (n & k) | (~m & ~k) + * = (n & k) | ~(m | k) + */ + tcg_gen_and_i64(n, n, k); + if (TCG_TARGET_HAS_orc_i64) { + tcg_gen_or_i64(m, m, k); + tcg_gen_orc_i64(d, n, m); + } else { + tcg_gen_nor_i64(m, m, k); + tcg_gen_or_i64(d, n, m); + } +} + +static void gen_bsl2n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + if (TCG_TARGET_HAS_bitsel_vec) { + tcg_gen_not_vec(vece, m, m); + tcg_gen_bitsel_vec(vece, d, k, n, m); + } else { + tcg_gen_and_vec(vece, n, n, k); + tcg_gen_or_vec(vece, m, m, k); + tcg_gen_orc_vec(vece, d, n, m); + } +} + +static void gen_bsl2n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bsl2n_i64, + .fniv = gen_bsl2n_vec, + .fno = gen_helper_sve2_bsl2n, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BSL2N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl2n, a) + +static void gen_nbsl_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_and_i64(n, n, k); + tcg_gen_andc_i64(m, m, k); + tcg_gen_nor_i64(d, n, m); +} + +static void gen_nbsl_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_bitsel_vec(vece, d, k, n, m); + tcg_gen_not_vec(vece, d, d); +} + +static void gen_nbsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_nbsl_i64, + .fniv = gen_nbsl_vec, + .fno = gen_helper_sve2_nbsl, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(NBSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_nbsl, a) + +/* + *** SVE Integer Arithmetic - Unpredicated Group + */ + +TRANS_FEAT(ADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_add, a) +TRANS_FEAT(SUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sub, a) +TRANS_FEAT(SQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ssadd, a) +TRANS_FEAT(SQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sssub, a) +TRANS_FEAT(UQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_usadd, a) +TRANS_FEAT(UQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ussub, a) + +/* + *** SVE Integer Arithmetic - Binary Predicated Group + */ + +/* Select active elements from Zn and inactive elements from Zm, + * storing the result in Zd.
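 * That is, for each element i: Zd[i] = Pg[i] ? Zn[i] : Zm[i].
 * (Editor's note.)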
+ */ +static bool do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz) +{ + static gen_helper_gvec_4 * const fns[4] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d + }; + return gen_gvec_ool_zzzp(s, fns[esz], rd, rn, rm, pg, 0); +} + +#define DO_ZPZZ(NAME, FEAT, name) \ + static gen_helper_gvec_4 * const name##_zpzz_fns[4] = { \ + gen_helper_##name##_zpzz_b, gen_helper_##name##_zpzz_h, \ + gen_helper_##name##_zpzz_s, gen_helper_##name##_zpzz_d, \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpzz, \ + name##_zpzz_fns[a->esz], a, 0) + +DO_ZPZZ(AND_zpzz, aa64_sve, sve_and) +DO_ZPZZ(EOR_zpzz, aa64_sve, sve_eor) +DO_ZPZZ(ORR_zpzz, aa64_sve, sve_orr) +DO_ZPZZ(BIC_zpzz, aa64_sve, sve_bic) + +DO_ZPZZ(ADD_zpzz, aa64_sve, sve_add) +DO_ZPZZ(SUB_zpzz, aa64_sve, sve_sub) + +DO_ZPZZ(SMAX_zpzz, aa64_sve, sve_smax) +DO_ZPZZ(UMAX_zpzz, aa64_sve, sve_umax) +DO_ZPZZ(SMIN_zpzz, aa64_sve, sve_smin) +DO_ZPZZ(UMIN_zpzz, aa64_sve, sve_umin) +DO_ZPZZ(SABD_zpzz, aa64_sve, sve_sabd) +DO_ZPZZ(UABD_zpzz, aa64_sve, sve_uabd) + +DO_ZPZZ(MUL_zpzz, aa64_sve, sve_mul) +DO_ZPZZ(SMULH_zpzz, aa64_sve, sve_smulh) +DO_ZPZZ(UMULH_zpzz, aa64_sve, sve_umulh) + +DO_ZPZZ(ASR_zpzz, aa64_sve, sve_asr) +DO_ZPZZ(LSR_zpzz, aa64_sve, sve_lsr) +DO_ZPZZ(LSL_zpzz, aa64_sve, sve_lsl) + +static gen_helper_gvec_4 * const sdiv_fns[4] = { + NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d +}; +TRANS_FEAT(SDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, sdiv_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const udiv_fns[4] = { + NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d +}; +TRANS_FEAT(UDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, udiv_fns[a->esz], a, 0) + +TRANS_FEAT(SEL_zpzz, aa64_sve, do_sel_z, a->rd, a->rn, a->rm, a->pg, a->esz) + +/* + *** SVE Integer Arithmetic - Unary Predicated Group + */ + +#define DO_ZPZ(NAME, FEAT, name) \ + static gen_helper_gvec_3 * const name##_fns[4] = { \ + gen_helper_##name##_b, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d, \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpz, name##_fns[a->esz], a, 0) + +DO_ZPZ(CLS, aa64_sve, sve_cls) +DO_ZPZ(CLZ, aa64_sve, sve_clz) +DO_ZPZ(CNT_zpz, aa64_sve, sve_cnt_zpz) +DO_ZPZ(CNOT, aa64_sve, sve_cnot) +DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz) +DO_ZPZ(ABS, aa64_sve, sve_abs) +DO_ZPZ(NEG, aa64_sve, sve_neg) +DO_ZPZ(RBIT, aa64_sve, sve_rbit) + +static gen_helper_gvec_3 * const fabs_fns[4] = { + NULL, gen_helper_sve_fabs_h, + gen_helper_sve_fabs_s, gen_helper_sve_fabs_d, +}; +TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, fabs_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const fneg_fns[4] = { + NULL, gen_helper_sve_fneg_h, + gen_helper_sve_fneg_s, gen_helper_sve_fneg_d, +}; +TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, fneg_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sxtb_fns[4] = { + NULL, gen_helper_sve_sxtb_h, + gen_helper_sve_sxtb_s, gen_helper_sve_sxtb_d, +}; +TRANS_FEAT(SXTB, aa64_sve, gen_gvec_ool_arg_zpz, sxtb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const uxtb_fns[4] = { + NULL, gen_helper_sve_uxtb_h, + gen_helper_sve_uxtb_s, gen_helper_sve_uxtb_d, +}; +TRANS_FEAT(UXTB, aa64_sve, gen_gvec_ool_arg_zpz, uxtb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sxth_fns[4] = { + NULL, NULL, gen_helper_sve_sxth_s, gen_helper_sve_sxth_d +}; +TRANS_FEAT(SXTH, aa64_sve, gen_gvec_ool_arg_zpz, sxth_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const uxth_fns[4] = { + NULL, NULL, 
gen_helper_sve_uxth_s, gen_helper_sve_uxth_d +}; +TRANS_FEAT(UXTH, aa64_sve, gen_gvec_ool_arg_zpz, uxth_fns[a->esz], a, 0) + +TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? gen_helper_sve_sxtw_d : NULL, a, 0) +TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0) + +/* + *** SVE Integer Reduction Group + */ + +typedef void gen_helper_gvec_reduc(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_i32); +static bool do_vpz_ool(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_reduc *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zn, t_pg; + TCGv_i32 desc; + TCGv_i64 temp; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fn(temp, t_zn, t_pg, desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); + return true; +} + +#define DO_VPZ(NAME, name) \ + static gen_helper_gvec_reduc * const name##_fns[4] = { \ + gen_helper_sve_##name##_b, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_vpz_ool, a, name##_fns[a->esz]) + +DO_VPZ(ORV, orv) +DO_VPZ(ANDV, andv) +DO_VPZ(EORV, eorv) + +DO_VPZ(UADDV, uaddv) +DO_VPZ(SMAXV, smaxv) +DO_VPZ(UMAXV, umaxv) +DO_VPZ(SMINV, sminv) +DO_VPZ(UMINV, uminv) + +static gen_helper_gvec_reduc * const saddv_fns[4] = { + gen_helper_sve_saddv_b, gen_helper_sve_saddv_h, + gen_helper_sve_saddv_s, NULL +}; +TRANS_FEAT(SADDV, aa64_sve, do_vpz_ool, a, saddv_fns[a->esz]) + +#undef DO_VPZ + +/* + *** SVE Shift by Immediate - Predicated Group + */ + +/* + * Copy Zn into Zd, storing zeros into inactive elements. + * If invert, store zeros into the active elements. + */ +static bool do_movz_zpz(DisasContext *s, int rd, int rn, int pg, + int esz, bool invert) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_movz_b, gen_helper_sve_movz_h, + gen_helper_sve_movz_s, gen_helper_sve_movz_d, + }; + return gen_gvec_ool_zzp(s, fns[esz], rd, rn, pg, invert); +} + +static bool do_shift_zpzi(DisasContext *s, arg_rpri_esz *a, bool asr, + gen_helper_gvec_3 * const fns[4]) +{ + int max; + + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + + /* + * Shift by element size is architecturally valid. + * For arithmetic right-shift, it's the same as by one less. + * For logical shifts and ASRD, it is a zeroing operation. 
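 *
 * Editor's example: with esz == MO_16 (max == 16), ASR #16 is emitted
 * as ASR #15, turning every bit into a copy of the sign bit, while
 * LSR #16 zeroes the active elements via do_movz_zpz below.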
+ */ + max = 8 << a->esz; + if (a->imm >= max) { + if (asr) { + a->imm = max - 1; + } else { + return do_movz_zpz(s, a->rd, a->rd, a->pg, a->esz, true); + } + } + return gen_gvec_ool_arg_zpzi(s, fns[a->esz], a); +} + +static gen_helper_gvec_3 * const asr_zpzi_fns[4] = { + gen_helper_sve_asr_zpzi_b, gen_helper_sve_asr_zpzi_h, + gen_helper_sve_asr_zpzi_s, gen_helper_sve_asr_zpzi_d, +}; +TRANS_FEAT(ASR_zpzi, aa64_sve, do_shift_zpzi, a, true, asr_zpzi_fns) + +static gen_helper_gvec_3 * const lsr_zpzi_fns[4] = { + gen_helper_sve_lsr_zpzi_b, gen_helper_sve_lsr_zpzi_h, + gen_helper_sve_lsr_zpzi_s, gen_helper_sve_lsr_zpzi_d, +}; +TRANS_FEAT(LSR_zpzi, aa64_sve, do_shift_zpzi, a, false, lsr_zpzi_fns) + +static gen_helper_gvec_3 * const lsl_zpzi_fns[4] = { + gen_helper_sve_lsl_zpzi_b, gen_helper_sve_lsl_zpzi_h, + gen_helper_sve_lsl_zpzi_s, gen_helper_sve_lsl_zpzi_d, +}; +TRANS_FEAT(LSL_zpzi, aa64_sve, do_shift_zpzi, a, false, lsl_zpzi_fns) + +static gen_helper_gvec_3 * const asrd_fns[4] = { + gen_helper_sve_asrd_b, gen_helper_sve_asrd_h, + gen_helper_sve_asrd_s, gen_helper_sve_asrd_d, +}; +TRANS_FEAT(ASRD, aa64_sve, do_shift_zpzi, a, false, asrd_fns) + +static gen_helper_gvec_3 * const sqshl_zpzi_fns[4] = { + gen_helper_sve2_sqshl_zpzi_b, gen_helper_sve2_sqshl_zpzi_h, + gen_helper_sve2_sqshl_zpzi_s, gen_helper_sve2_sqshl_zpzi_d, +}; +TRANS_FEAT(SQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : sqshl_zpzi_fns[a->esz], a) + +static gen_helper_gvec_3 * const uqshl_zpzi_fns[4] = { + gen_helper_sve2_uqshl_zpzi_b, gen_helper_sve2_uqshl_zpzi_h, + gen_helper_sve2_uqshl_zpzi_s, gen_helper_sve2_uqshl_zpzi_d, +}; +TRANS_FEAT(UQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : uqshl_zpzi_fns[a->esz], a) + +static gen_helper_gvec_3 * const srshr_fns[4] = { + gen_helper_sve2_srshr_b, gen_helper_sve2_srshr_h, + gen_helper_sve2_srshr_s, gen_helper_sve2_srshr_d, +}; +TRANS_FEAT(SRSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : srshr_fns[a->esz], a) + +static gen_helper_gvec_3 * const urshr_fns[4] = { + gen_helper_sve2_urshr_b, gen_helper_sve2_urshr_h, + gen_helper_sve2_urshr_s, gen_helper_sve2_urshr_d, +}; +TRANS_FEAT(URSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : urshr_fns[a->esz], a) + +static gen_helper_gvec_3 * const sqshlu_fns[4] = { + gen_helper_sve2_sqshlu_b, gen_helper_sve2_sqshlu_h, + gen_helper_sve2_sqshlu_s, gen_helper_sve2_sqshlu_d, +}; +TRANS_FEAT(SQSHLU, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : sqshlu_fns[a->esz], a) + +/* + *** SVE Bitwise Shift - Predicated Group + */ + +#define DO_ZPZW(NAME, name) \ + static gen_helper_gvec_4 * const name##_zpzw_fns[4] = { \ + gen_helper_sve_##name##_zpzw_b, gen_helper_sve_##name##_zpzw_h, \ + gen_helper_sve_##name##_zpzw_s, NULL \ + }; \ + TRANS_FEAT(NAME##_zpzw, aa64_sve, gen_gvec_ool_arg_zpzz, \ + a->esz < 0 ? NULL : name##_zpzw_fns[a->esz], a, 0) + +DO_ZPZW(ASR, asr) +DO_ZPZW(LSR, lsr) +DO_ZPZW(LSL, lsl) + +#undef DO_ZPZW + +/* + *** SVE Bitwise Shift - Unpredicated Group + */ + +static bool do_shift_imm(DisasContext *s, arg_rri_esz *a, bool asr, + void (*gvec_fn)(unsigned, uint32_t, uint32_t, + int64_t, uint32_t, uint32_t)) +{ + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + /* Shift by element size is architecturally valid. For + arithmetic right-shift, it's the same as by one less. + Otherwise it is a zeroing operation. 
*/ + if (a->imm >= 8 << a->esz) { + if (asr) { + a->imm = (8 << a->esz) - 1; + } else { + do_dupi_z(s, a->rd, 0); + return true; + } + } + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +TRANS_FEAT(ASR_zzi, aa64_sve, do_shift_imm, a, true, tcg_gen_gvec_sari) +TRANS_FEAT(LSR_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shri) +TRANS_FEAT(LSL_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shli) + +#define DO_ZZW(NAME, name) \ + static gen_helper_gvec_3 * const name##_zzw_fns[4] = { \ + gen_helper_sve_##name##_zzw_b, gen_helper_sve_##name##_zzw_h, \ + gen_helper_sve_##name##_zzw_s, NULL \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_arg_zzz, \ + name##_zzw_fns[a->esz], a, 0) + +DO_ZZW(ASR_zzw, asr) +DO_ZZW(LSR_zzw, lsr) +DO_ZZW(LSL_zzw, lsl) + +#undef DO_ZZW + +/* + *** SVE Integer Multiply-Add Group + */ + +static bool do_zpzzz_ool(DisasContext *s, arg_rprrr_esz *a, + gen_helper_gvec_5 *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_5_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->ra), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, 0, fn); + } + return true; +} + +static gen_helper_gvec_5 * const mla_fns[4] = { + gen_helper_sve_mla_b, gen_helper_sve_mla_h, + gen_helper_sve_mla_s, gen_helper_sve_mla_d, +}; +TRANS_FEAT(MLA, aa64_sve, do_zpzzz_ool, a, mla_fns[a->esz]) + +static gen_helper_gvec_5 * const mls_fns[4] = { + gen_helper_sve_mls_b, gen_helper_sve_mls_h, + gen_helper_sve_mls_s, gen_helper_sve_mls_d, +}; +TRANS_FEAT(MLS, aa64_sve, do_zpzzz_ool, a, mls_fns[a->esz]) + +/* + *** SVE Index Generation Group + */ + +static bool do_index(DisasContext *s, int esz, int rd, + TCGv_i64 start, TCGv_i64 incr) +{ + unsigned vsz; + TCGv_i32 desc; + TCGv_ptr t_zd; + + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + t_zd = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); + if (esz == 3) { + gen_helper_sve_index_d(t_zd, start, incr, desc); + } else { + typedef void index_fn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); + static index_fn * const fns[3] = { + gen_helper_sve_index_b, + gen_helper_sve_index_h, + gen_helper_sve_index_s, + }; + TCGv_i32 s32 = tcg_temp_new_i32(); + TCGv_i32 i32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(s32, start); + tcg_gen_extrl_i64_i32(i32, incr); + fns[esz](t_zd, s32, i32, desc); + + tcg_temp_free_i32(s32); + tcg_temp_free_i32(i32); + } + tcg_temp_free_ptr(t_zd); + return true; +} + +TRANS_FEAT(INDEX_ii, aa64_sve, do_index, a->esz, a->rd, + tcg_constant_i64(a->imm1), tcg_constant_i64(a->imm2)) +TRANS_FEAT(INDEX_ir, aa64_sve, do_index, a->esz, a->rd, + tcg_constant_i64(a->imm), cpu_reg(s, a->rm)) +TRANS_FEAT(INDEX_ri, aa64_sve, do_index, a->esz, a->rd, + cpu_reg(s, a->rn), tcg_constant_i64(a->imm)) +TRANS_FEAT(INDEX_rr, aa64_sve, do_index, a->esz, a->rd, + cpu_reg(s, a->rn), cpu_reg(s, a->rm)) + +/* + *** SVE Stack Allocation Group + */ + +static bool trans_ADDVL(DisasContext *s, arg_ADDVL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * vec_full_reg_size(s)); + } + return true; +} + +static bool trans_ADDSVL(DisasContext *s, arg_ADDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return 
false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + +static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * pred_full_reg_size(s)); + } + return true; +} + +static bool trans_ADDSPL(DisasContext *s, arg_ADDSPL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_pred_reg_size(s)); + } + return true; +} + +static bool trans_RDVL(DisasContext *s, arg_RDVL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s)); + } + return true; +} + +static bool trans_RDSVL(DisasContext *s, arg_RDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + tcg_gen_movi_i64(reg, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + +/* + *** SVE Compute Vector Address Group + */ + +static bool do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn) +{ + return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, a->imm); +} + +TRANS_FEAT_NONSTREAMING(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32) +TRANS_FEAT_NONSTREAMING(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64) +TRANS_FEAT_NONSTREAMING(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32) +TRANS_FEAT_NONSTREAMING(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32) + +/* + *** SVE Integer Misc - Unpredicated Group + */ + +static gen_helper_gvec_2 * const fexpa_fns[4] = { + NULL, gen_helper_sve_fexpa_h, + gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d, +}; +TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz, + fexpa_fns[a->esz], a->rd, a->rn, 0) + +static gen_helper_gvec_3 * const ftssel_fns[4] = { + NULL, gen_helper_sve_ftssel_h, + gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d, +}; +TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, + ftssel_fns[a->esz], a, 0) + +/* + *** SVE Predicate Logical Operations Group + */ + +static bool do_pppp_flags(DisasContext *s, arg_rprr_s *a, + const GVecGen4 *gvec_op) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned psz = pred_gvec_reg_size(s); + int dofs = pred_full_reg_offset(s, a->rd); + int nofs = pred_full_reg_offset(s, a->rn); + int mofs = pred_full_reg_offset(s, a->rm); + int gofs = pred_full_reg_offset(s, a->pg); + + if (!a->s) { + tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); + return true; + } + + if (psz == 8) { + /* Do the operation and the flags generation in temps. */ + TCGv_i64 pd = tcg_temp_new_i64(); + TCGv_i64 pn = tcg_temp_new_i64(); + TCGv_i64 pm = tcg_temp_new_i64(); + TCGv_i64 pg = tcg_temp_new_i64(); + + tcg_gen_ld_i64(pn, cpu_env, nofs); + tcg_gen_ld_i64(pm, cpu_env, mofs); + tcg_gen_ld_i64(pg, cpu_env, gofs); + + gvec_op->fni8(pd, pn, pm, pg); + tcg_gen_st_i64(pd, cpu_env, dofs); + + do_predtest1(pd, pg); + + tcg_temp_free_i64(pd); + tcg_temp_free_i64(pn); + tcg_temp_free_i64(pm); + tcg_temp_free_i64(pg); + } else { + /* The operation and flags generation is large. 
The computation + * of the flags depends on the original contents of the guarding + * predicate. If the destination overwrites the guarding predicate, + * then the easiest way to get this right is to save a copy. + */ + int tofs = gofs; + if (a->rd == a->pg) { + tofs = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_gvec_mov(0, tofs, gofs, psz, psz); + } + + tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); + do_predtest(s, dofs, tofs, psz / 8); + } + return true; +} + +static void gen_and_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_and_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_and_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_and_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_AND_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_and_pg_i64, + .fniv = gen_and_pg_vec, + .fno = gen_helper_sve_and_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s) { + if (a->rn == a->rm) { + if (a->pg == a->rn) { + return do_mov_p(s, a->rd, a->rn); + } + return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->pg); + } else if (a->pg == a->rn || a->pg == a->rm) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->rm); + } + } + return do_pppp_flags(s, a, &op); +} + +static void gen_bic_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_andc_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_bic_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_andc_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_BIC_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_bic_pg_i64, + .fniv = gen_bic_pg_vec, + .fno = gen_helper_sve_bic_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s && a->pg == a->rn) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->rn, a->rm); + } + return do_pppp_flags(s, a, &op); +} + +static void gen_eor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_xor_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_eor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_xor_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_EOR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_eor_pg_i64, + .fniv = gen_eor_pg_vec, + .fno = gen_helper_sve_eor_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Alias NOT (predicate) is EOR Pd.B, Pg/Z, Pn.B, Pg.B */ + if (!a->s && a->pg == a->rm) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->pg, a->rn); + } + return do_pppp_flags(s, a, &op); +} + +static bool trans_SEL_pppp(DisasContext *s, arg_rprr_s *a) +{ + if (a->s || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + tcg_gen_gvec_bitsel(MO_8, pred_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->pg), + pred_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->rm), psz, psz); + } + return true; +} + +static void gen_orr_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_or_i64(pd, pn, pm); + 
tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_orr_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_or_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_ORR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_orr_pg_i64, + .fniv = gen_orr_pg_vec, + .fno = gen_helper_sve_orr_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s && a->pg == a->rn && a->rn == a->rm) { + return do_mov_p(s, a->rd, a->rn); + } + return do_pppp_flags(s, a, &op); +} + +static void gen_orn_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_orc_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_orn_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_orc_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_ORN_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_orn_pg_i64, + .fniv = gen_orn_pg_vec, + .fno = gen_helper_sve_orn_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +static void gen_nor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_or_i64(pd, pn, pm); + tcg_gen_andc_i64(pd, pg, pd); +} + +static void gen_nor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_or_vec(vece, pd, pn, pm); + tcg_gen_andc_vec(vece, pd, pg, pd); +} + +static bool trans_NOR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_nor_pg_i64, + .fniv = gen_nor_pg_vec, + .fno = gen_helper_sve_nor_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +static void gen_nand_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_and_i64(pd, pn, pm); + tcg_gen_andc_i64(pd, pg, pd); +} + +static void gen_nand_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_and_vec(vece, pd, pn, pm); + tcg_gen_andc_vec(vece, pd, pg, pd); +} + +static bool trans_NAND_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_nand_pg_i64, + .fniv = gen_nand_pg_vec, + .fno = gen_helper_sve_nand_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +/* + *** SVE Predicate Misc Group + */ + +static bool trans_PTEST(DisasContext *s, arg_PTEST *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int nofs = pred_full_reg_offset(s, a->rn); + int gofs = pred_full_reg_offset(s, a->pg); + int words = DIV_ROUND_UP(pred_full_reg_size(s), 8); + + if (words == 1) { + TCGv_i64 pn = tcg_temp_new_i64(); + TCGv_i64 pg = tcg_temp_new_i64(); + + tcg_gen_ld_i64(pn, cpu_env, nofs); + tcg_gen_ld_i64(pg, cpu_env, gofs); + do_predtest1(pn, pg); + + tcg_temp_free_i64(pn); + tcg_temp_free_i64(pg); + } else { + do_predtest(s, nofs, gofs, words); + } + } + return true; +} + +/* See the ARM pseudocode DecodePredCount. 
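 *
 * Editor's example: with a 256-bit vector and esz == MO_8 there are 32
 * elements, so POW2 -> 32, VL16 -> 16, VL64 -> 0 (bound unsatisfiable),
 * MUL3 -> 30, and ALL -> 32.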
*/ +static unsigned decode_pred_count(unsigned fullsz, int pattern, int esz) +{ + unsigned elements = fullsz >> esz; + unsigned bound; + + switch (pattern) { + case 0x0: /* POW2 */ + return pow2floor(elements); + case 0x1: /* VL1 */ + case 0x2: /* VL2 */ + case 0x3: /* VL3 */ + case 0x4: /* VL4 */ + case 0x5: /* VL5 */ + case 0x6: /* VL6 */ + case 0x7: /* VL7 */ + case 0x8: /* VL8 */ + bound = pattern; + break; + case 0x9: /* VL16 */ + case 0xa: /* VL32 */ + case 0xb: /* VL64 */ + case 0xc: /* VL128 */ + case 0xd: /* VL256 */ + bound = 16 << (pattern - 9); + break; + case 0x1d: /* MUL4 */ + return elements - elements % 4; + case 0x1e: /* MUL3 */ + return elements - elements % 3; + case 0x1f: /* ALL */ + return elements; + default: /* #uimm5 */ + return 0; + } + return elements >= bound ? bound : 0; +} + +/* This handles all of the predicate initialization instructions, + * PTRUE, PFALSE, SETFFR. For PFALSE, we will have set PAT == 32 + * so that decode_pred_count returns 0. For SETFFR, we will have + * set RD == 16 == FFR. + */ +static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned ofs = pred_full_reg_offset(s, rd); + unsigned numelem, setsz, i; + uint64_t word, lastword; + TCGv_i64 t; + + numelem = decode_pred_count(fullsz, pat, esz); + + /* Determine what we must store into each bit, and how many. */ + if (numelem == 0) { + lastword = word = 0; + setsz = fullsz; + } else { + setsz = numelem << esz; + lastword = word = pred_esz_masks[esz]; + if (setsz % 64) { + lastword &= MAKE_64BIT_MASK(0, setsz % 64); + } + } + + t = tcg_temp_new_i64(); + if (fullsz <= 64) { + tcg_gen_movi_i64(t, lastword); + tcg_gen_st_i64(t, cpu_env, ofs); + goto done; + } + + if (word == lastword) { + unsigned maxsz = size_for_gvec(fullsz / 8); + unsigned oprsz = size_for_gvec(setsz / 8); + + if (oprsz * 8 == setsz) { + tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word); + goto done; + } + } + + setsz /= 8; + fullsz /= 8; + + tcg_gen_movi_i64(t, word); + for (i = 0; i < QEMU_ALIGN_DOWN(setsz, 8); i += 8) { + tcg_gen_st_i64(t, cpu_env, ofs + i); + } + if (lastword != word) { + tcg_gen_movi_i64(t, lastword); + tcg_gen_st_i64(t, cpu_env, ofs + i); + i += 8; + } + if (i < fullsz) { + tcg_gen_movi_i64(t, 0); + for (; i < fullsz; i += 8) { + tcg_gen_st_i64(t, cpu_env, ofs + i); + } + } + + done: + tcg_temp_free_i64(t); + + /* PTRUES */ + if (setflag) { + tcg_gen_movi_i32(cpu_NF, -(word != 0)); + tcg_gen_movi_i32(cpu_CF, word == 0); + tcg_gen_movi_i32(cpu_VF, 0); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + } + return true; +} + +TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s) + +/* Note pat == 31 is #all, to set all elements. */ +TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve, + do_predset, 0, FFR_PRED_NUM, 31, false) + +/* Note pat == 32 is #unimp, to set no elements. */ +TRANS_FEAT(PFALSE, aa64_sve, do_predset, 0, a->rd, 32, false) + +static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a) +{ + /* The path through do_pppp_flags is complicated enough to want to avoid + * duplication. Frob the arguments into the form of a predicated AND. 
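 * That is, the predicated read of the FFR is executed below as if it
 * were AND(S) Pd.B, Pg/Z, FFR.B, FFR.B.  (Editor's note.)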
+ */ + arg_rprr_s alt_a = { + .rd = a->rd, .pg = a->pg, .s = a->s, + .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM, + }; + + s->is_nonstreaming = true; + return trans_AND_pppp(s, &alt_a); +} + +TRANS_FEAT_NONSTREAMING(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM) +TRANS_FEAT_NONSTREAMING(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn) + +static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a, + void (*gen_fn)(TCGv_i32, TCGv_ptr, + TCGv_ptr, TCGv_i32)) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGv_ptr t_pd = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_i32 t; + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + tcg_gen_addi_ptr(t_pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->rn)); + t = tcg_temp_new_i32(); + + gen_fn(t, t_pd, t_pg, tcg_constant_i32(desc)); + tcg_temp_free_ptr(t_pd); + tcg_temp_free_ptr(t_pg); + + do_pred_flags(t); + tcg_temp_free_i32(t); + return true; +} + +TRANS_FEAT(PFIRST, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pfirst) +TRANS_FEAT(PNEXT, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pnext) + +/* + *** SVE Element Count Group + */ + +/* Perform an inline saturating addition of a 32-bit value within + * a 64-bit register. The second operand is known to be positive, + * which halves the comparisons we must perform to bound the result. + */ +static void do_sat_addsub_32(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) +{ + int64_t ibound; + + /* Use normal 64-bit arithmetic to detect 32-bit overflow. */ + if (u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); + } + if (d) { + tcg_gen_sub_i64(reg, reg, val); + ibound = (u ? 0 : INT32_MIN); + tcg_gen_smax_i64(reg, reg, tcg_constant_i64(ibound)); + } else { + tcg_gen_add_i64(reg, reg, val); + ibound = (u ? UINT32_MAX : INT32_MAX); + tcg_gen_smin_i64(reg, reg, tcg_constant_i64(ibound)); + } +} + +/* Similarly with 64-bit values. */ +static void do_sat_addsub_64(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t2; + + if (u) { + if (d) { + tcg_gen_sub_i64(t0, reg, val); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, reg, val, t2, t0); + } else { + tcg_gen_add_i64(t0, reg, val); + t2 = tcg_constant_i64(-1); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, t0, reg, t2, t0); + } + } else { + TCGv_i64 t1 = tcg_temp_new_i64(); + if (d) { + /* Detect signed overflow for subtraction. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_sub_i64(t1, reg, val); + tcg_gen_xor_i64(reg, reg, t1); + tcg_gen_and_i64(t0, t0, reg); + + /* Bound the result. */ + tcg_gen_movi_i64(reg, INT64_MIN); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, reg, t1); + } else { + /* Detect signed overflow for addition. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_add_i64(reg, reg, val); + tcg_gen_xor_i64(t1, reg, val); + tcg_gen_andc_i64(t0, t1, t0); + + /* Bound the result. */ + tcg_gen_movi_i64(t1, INT64_MAX); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, t1, reg); + } + tcg_temp_free_i64(t1); + } + tcg_temp_free_i64(t0); +} + +/* Similarly with a vector and a scalar operand.
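 *
 * Editor's sketch, not part of this patch: in plain C, the unsigned
 * (u == true) 32-bit case above amounts to widen, operate, clamp.
 * The function name is ours:
 */

#include <stdint.h>
#include <stdbool.h>

/* val is known non-negative; d selects subtraction rather than addition. */
static uint64_t sat_addsub_u32_ref(uint64_t reg, int64_t val, bool d)
{
    int64_t t = (int64_t)(uint32_t)reg + (d ? -val : val);

    if (t < 0) {
        t = 0;                        /* floor for the subtracting form */
    } else if (t > (int64_t)UINT32_MAX) {
        t = UINT32_MAX;               /* ceiling for the adding form */
    }
    return (uint64_t)t;
}

/* The vector-by-scalar forms dispatch on element size: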
*/ +static void do_sat_addsub_vec(DisasContext *s, int esz, int rd, int rn, + TCGv_i64 val, bool u, bool d) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr dptr, nptr; + TCGv_i32 t32, desc; + TCGv_i64 t64; + + dptr = tcg_temp_new_ptr(); + nptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(dptr, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(nptr, cpu_env, vec_full_reg_offset(s, rn)); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + + switch (esz) { + case MO_8: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); + } + if (u) { + gen_helper_sve_uqaddi_b(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_b(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; + + case MO_16: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); + } + if (u) { + gen_helper_sve_uqaddi_h(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_h(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; + + case MO_32: + t64 = tcg_temp_new_i64(); + if (d) { + tcg_gen_neg_i64(t64, val); + } else { + tcg_gen_mov_i64(t64, val); + } + if (u) { + gen_helper_sve_uqaddi_s(dptr, nptr, t64, desc); + } else { + gen_helper_sve_sqaddi_s(dptr, nptr, t64, desc); + } + tcg_temp_free_i64(t64); + break; + + case MO_64: + if (u) { + if (d) { + gen_helper_sve_uqsubi_d(dptr, nptr, val, desc); + } else { + gen_helper_sve_uqaddi_d(dptr, nptr, val, desc); + } + } else if (d) { + t64 = tcg_temp_new_i64(); + tcg_gen_neg_i64(t64, val); + gen_helper_sve_sqaddi_d(dptr, nptr, t64, desc); + tcg_temp_free_i64(t64); + } else { + gen_helper_sve_sqaddi_d(dptr, nptr, val, desc); + } + break; + + default: + g_assert_not_reached(); + } + + tcg_temp_free_ptr(dptr); + tcg_temp_free_ptr(nptr); +} + +static bool trans_CNT_r(DisasContext *s, arg_CNT_r *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + tcg_gen_movi_i64(cpu_reg(s, a->rd), numelem * a->imm); + } + return true; +} + +static bool trans_INCDEC_r(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm * (a->d ? -1 : 1); + TCGv_i64 reg = cpu_reg(s, a->rd); + + tcg_gen_addi_i64(reg, reg, inc); + } + return true; +} + +static bool trans_SINCDEC_r_32(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + /* Use normal 64-bit arithmetic to detect 32-bit overflow. 
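 * Even when inc == 0 the instruction still sign- or zero-extends the
 * 32-bit result into the Xreg, hence the explicit ext32u/ext32s below.
 * (Editor's note.)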
*/ + if (inc == 0) { + if (a->u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); + } + } else { + do_sat_addsub_32(reg, tcg_constant_i64(inc), a->u, a->d); + } + return true; +} + +static bool trans_SINCDEC_r_64(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + if (inc != 0) { + do_sat_addsub_64(reg, tcg_constant_i64(inc), a->u, a->d); + } + return true; +} + +static bool trans_INCDEC_v(DisasContext *s, arg_incdec2_cnt *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + tcg_gen_gvec_adds(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + tcg_constant_i64(a->d ? -inc : inc), + fullsz, fullsz); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +static bool trans_SINCDEC_v(DisasContext *s, arg_incdec2_cnt *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, + tcg_constant_i64(inc), a->u, a->d); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +/* + *** SVE Bitwise Immediate Group + */ + +static bool do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn) +{ + uint64_t imm; + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } + return gen_gvec_fn_zzi(s, gvec_fn, MO_64, a->rd, a->rn, imm); +} + +TRANS_FEAT(AND_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_andi) +TRANS_FEAT(ORR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_ori) +TRANS_FEAT(EOR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_xori) + +static bool trans_DUPM(DisasContext *s, arg_DUPM *a) +{ + uint64_t imm; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } + if (sve_access_check(s)) { + do_dupi_z(s, a->rd, imm); + } + return true; +} + +/* + *** SVE Integer Wide Immediate - Predicated Group + */ + +/* Implement all merging copies. This is used for CPY (immediate), + * FCPY, CPY (scalar), CPY (SIMD&FP scalar). 
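 *
 * Editor's reference loop, not part of this patch (cpy_m_ref is our
 * name), shown for esz == MO_8:
 */

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* Active elements take val; inactive elements keep their Zn value. */
static void cpy_m_ref(uint8_t *d, const uint8_t *n, const bool *pg,
                      uint8_t val, size_t oprsz)
{
    for (size_t i = 0; i < oprsz; i++) {
        d[i] = pg[i] ? val : n[i];
    }
}

/* All four encodings funnel into one expansion: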
+ */ +static void do_cpy_m(DisasContext *s, int esz, int rd, int rn, int pg, + TCGv_i64 val) +{ + typedef void gen_cpy(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_cpy * const fns[4] = { + gen_helper_sve_cpy_m_b, gen_helper_sve_cpy_m_h, + gen_helper_sve_cpy_m_s, gen_helper_sve_cpy_m_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + fns[esz](t_zd, t_zn, t_pg, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); +} + +static bool trans_FCPY(DisasContext *s, arg_FCPY *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + /* Decode the VFP immediate. */ + uint64_t imm = vfp_expand_imm(a->esz, a->imm); + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(imm)); + } + return true; +} + +static bool trans_CPY_m_i(DisasContext *s, arg_rpri_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(a->imm)); + } + return true; +} + +static bool trans_CPY_z_i(DisasContext *s, arg_CPY_z_i *a) +{ + static gen_helper_gvec_2i * const fns[4] = { + gen_helper_sve_cpy_z_b, gen_helper_sve_cpy_z_h, + gen_helper_sve_cpy_z_s, gen_helper_sve_cpy_z_d, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->pg), + tcg_constant_i64(a->imm), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +/* + *** SVE Permute Extract Group + */ + +static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned n_ofs = imm >= vsz ? 0 : imm; + unsigned n_siz = vsz - n_ofs; + unsigned d = vec_full_reg_offset(s, rd); + unsigned n = vec_full_reg_offset(s, rn); + unsigned m = vec_full_reg_offset(s, rm); + + /* Use host vector move insns if we have appropriate sizes + * and no unfortunate overlap. 
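 *
 * Reference semantics (editor's note): the two moves below copy
 * Zn[n_ofs..vsz) into Zd[0..n_siz) and then Zm[0..n_ofs) into
 * Zd[n_siz..vsz), i.e. the result is Zn from byte imm onward, with
 * Zm supplying the tail.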
+ */ + if (m != d + && n_ofs == size_for_gvec(n_ofs) + && n_siz == size_for_gvec(n_siz) + && (d != n || n_siz <= n_ofs)) { + tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz); + if (n_ofs != 0) { + tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs); + } + } else { + tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext); + } + return true; +} + +TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm) +TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm) + +/* + *** SVE Permute - Unpredicated Group + */ + +static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if ((a->imm & 0x1f) == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned dofs = vec_full_reg_offset(s, a->rd); + unsigned esz, index; + + esz = ctz32(a->imm); + index = a->imm >> (esz + 1); + + if ((index << esz) < vsz) { + unsigned nofs = vec_reg_offset(s, a->rn, index, esz); + tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); + } else { + /* + * While dup_mem handles 128-bit elements, dup_imm does not. + * Thankfully element size doesn't matter for splatting zero. + */ + tcg_gen_gvec_dup_imm(MO_64, dofs, vsz, vsz, 0); + } + } + return true; +} + +static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) +{ + typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_insr * const fns[4] = { + gen_helper_sve_insr_b, gen_helper_sve_insr_h, + gen_helper_sve_insr_s, gen_helper_sve_insr_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + + fns[a->esz](t_zd, t_zn, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); +} + +static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 t = tcg_temp_new_i64(); + tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); + do_insr_i64(s, a, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_insr_i64(s, a, cpu_reg(s, a->rm)); + } + return true; +} + +static gen_helper_gvec_2 * const rev_fns[4] = { + gen_helper_sve_rev_b, gen_helper_sve_rev_h, + gen_helper_sve_rev_s, gen_helper_sve_rev_d +}; +TRANS_FEAT(REV_v, aa64_sve, gen_gvec_ool_zz, rev_fns[a->esz], a->rd, a->rn, 0) + +static gen_helper_gvec_3 * const sve_tbl_fns[4] = { + gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, + gen_helper_sve_tbl_s, gen_helper_sve_tbl_d +}; +TRANS_FEAT(TBL, aa64_sve, gen_gvec_ool_arg_zzz, sve_tbl_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const sve2_tbl_fns[4] = { + gen_helper_sve2_tbl_b, gen_helper_sve2_tbl_h, + gen_helper_sve2_tbl_s, gen_helper_sve2_tbl_d +}; +TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz], + a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0) + +static 
gen_helper_gvec_3 * const tbx_fns[4] = { + gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h, + gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d +}; +TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0) + +static bool trans_UNPK(DisasContext *s, arg_UNPK *a) +{ + static gen_helper_gvec_2 * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, + { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, + { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, + }; + + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + + (a->h ? vsz / 2 : 0), + vsz, vsz, 0, fns[a->esz][a->u]); + } + return true; +} + +/* + *** SVE Permute - Predicates Group + */ + +static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, + gen_helper_gvec_3 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + uint32_t desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); + + fn(t_d, t_n, t_m, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + tcg_temp_free_ptr(t_m); + return true; +} + +static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, + gen_helper_gvec_2 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + uint32_t desc = 0; + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); + + fn(t_d, t_n, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + return true; +} + +TRANS_FEAT(ZIP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_zip_p) +TRANS_FEAT(ZIP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_zip_p) +TRANS_FEAT(UZP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_uzp_p) +TRANS_FEAT(UZP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_uzp_p) +TRANS_FEAT(TRN1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_trn_p) +TRANS_FEAT(TRN2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_trn_p) + +TRANS_FEAT(REV_p, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_rev_p) +TRANS_FEAT(PUNPKLO, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_punpk_p) +TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p) + +/* + *** SVE Permute - Interleaving Group + */ + +static gen_helper_gvec_3 * const zip_fns[4] = { + gen_helper_sve_zip_b, gen_helper_sve_zip_h, + gen_helper_sve_zip_s, gen_helper_sve_zip_d, +}; +TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz, + zip_fns[a->esz], a, 0) +TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz, + zip_fns[a->esz], a, vec_full_reg_size(s) / 2) + +TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_zip_q, a, 0) +TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, 
gen_gvec_ool_arg_zzz, + gen_helper_sve2_zip_q, a, + QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2) + +static gen_helper_gvec_3 * const uzp_fns[4] = { + gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, + gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, +}; + +TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz, + uzp_fns[a->esz], a, 0) +TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz, + uzp_fns[a->esz], a, 1 << a->esz) + +TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_uzp_q, a, 0) +TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_uzp_q, a, 16) + +static gen_helper_gvec_3 * const trn_fns[4] = { + gen_helper_sve_trn_b, gen_helper_sve_trn_h, + gen_helper_sve_trn_s, gen_helper_sve_trn_d, +}; + +TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz, + trn_fns[a->esz], a, 0) +TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz, + trn_fns[a->esz], a, 1 << a->esz) + +TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_trn_q, a, 0) +TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_trn_q, a, 16) + +/* + *** SVE Permute Vector - Predicated Group + */ + +static gen_helper_gvec_3 * const compact_fns[4] = { + NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d +}; +TRANS_FEAT_NONSTREAMING(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz, + compact_fns[a->esz], a, 0) + +/* Call the helper that computes the ARM LastActiveElement pseudocode + * function, scaled by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) +{ + /* Predicate sizes may be smaller and cannot use simd_desc. We cannot + * round up, as we do elsewhere, because we need the exact size. + */ + TCGv_ptr t_p = tcg_temp_new_ptr(); + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); + desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); + + tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); + + gen_helper_sve_last_active_element(ret, t_p, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_p); +} + +/* Increment LAST to the offset of the next element in the vector, + * wrapping around to 0. + */ +static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + tcg_gen_addi_i32(last, last, 1 << esz); + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_constant_i32(vsz); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); + } +} + +/* If LAST < 0, set LAST to the offset of the last element in the vector. */ +static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_constant_i32(vsz - (1 << esz)); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); + } +} + +/* Load an unsigned element of ESZ from BASE+OFS. */ +static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) +{ + TCGv_i64 r = tcg_temp_new_i64(); + + switch (esz) { + case 0: + tcg_gen_ld8u_i64(r, base, ofs); + break; + case 1: + tcg_gen_ld16u_i64(r, base, ofs); + break; + case 2: + tcg_gen_ld32u_i64(r, base, ofs); + break; + case 3: + tcg_gen_ld_i64(r, base, ofs); + break; + default: + g_assert_not_reached(); + } + return r; +} + +/* Load an unsigned element of ESZ from RM[LAST]. 
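+ * LAST is a byte offset within the vector, as produced by
+ * find_last_active; it is combined with the register base below.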
*/ +static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, + int rm, int esz) +{ + TCGv_ptr p = tcg_temp_new_ptr(); + TCGv_i64 r; + + /* Convert offset into vector into offset into ENV. + * The final adjustment for the vector register base + * is added via constant offset to the load. + */ +#if HOST_BIG_ENDIAN + /* Adjust for element ordering. See vec_reg_offset. */ + if (esz < 3) { + tcg_gen_xori_i32(last, last, 8 - (1 << esz)); + } +#endif + tcg_gen_ext_i32_ptr(p, last); + tcg_gen_add_ptr(p, p, cpu_env); + + r = load_esz(p, vec_full_reg_offset(s, rm), esz); + tcg_temp_free_ptr(p); + + return r; +} + +/* Compute CLAST for a Zreg. */ +static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) +{ + TCGv_i32 last; + TCGLabel *over; + TCGv_i64 ele; + unsigned vsz, esz = a->esz; + + if (!sve_access_check(s)) { + return true; + } + + last = tcg_temp_local_new_i32(); + over = gen_new_label(); + + find_last_active(s, last, esz, a->pg); + + /* There is of course no movcond for a 2048-bit vector, + * so we must branch over the actual store. + */ + tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); + + if (!before) { + incr_last_active(s, last, esz); + } + + ele = load_last_active(s, last, a->rm, esz); + tcg_temp_free_i32(last); + + vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); + tcg_temp_free_i64(ele); + + /* If this insn used MOVPRFX, we may need a second move. */ + if (a->rd != a->rn) { + TCGLabel *done = gen_new_label(); + tcg_gen_br(done); + + gen_set_label(over); + do_mov_z(s, a->rd, a->rn); + + gen_set_label(done); + } else { + gen_set_label(over); + } + return true; +} + +TRANS_FEAT(CLASTA_z, aa64_sve, do_clast_vector, a, false) +TRANS_FEAT(CLASTB_z, aa64_sve, do_clast_vector, a, true) + +/* Compute CLAST for a scalar. */ +static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, + bool before, TCGv_i64 reg_val) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ele, cmp; + + find_last_active(s, last, esz, pg); + + /* Extend the original value of last prior to incrementing. */ + cmp = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(cmp, last); + + if (!before) { + incr_last_active(s, last, esz); + } + + /* The conceit here is that while last < 0 indicates not found, after + * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address + * from which we can load garbage. We then discard the garbage with + * a conditional move. + */ + ele = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + + tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, tcg_constant_i64(0), + ele, reg_val); + + tcg_temp_free_i64(cmp); + tcg_temp_free_i64(ele); +} + +/* Compute CLAST for a Vreg. */ +static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + int esz = a->esz; + int ofs = vec_reg_offset(s, a->rd, 0, esz); + TCGv_i64 reg = load_esz(cpu_env, ofs, esz); + + do_clast_scalar(s, esz, a->pg, a->rn, before, reg); + write_fp_dreg(s, a->rd, reg); + tcg_temp_free_i64(reg); + } + return true; +} + +TRANS_FEAT(CLASTA_v, aa64_sve, do_clast_fp, a, false) +TRANS_FEAT(CLASTB_v, aa64_sve, do_clast_fp, a, true) + +/* Compute CLAST for a Xreg. 
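+ * The prior register value is zero-extended from the element size
+ * before serving as the fall-back result when no element is active.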
*/ +static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + TCGv_i64 reg; + + if (!sve_access_check(s)) { + return true; + } + + reg = cpu_reg(s, a->rd); + switch (a->esz) { + case 0: + tcg_gen_ext8u_i64(reg, reg); + break; + case 1: + tcg_gen_ext16u_i64(reg, reg); + break; + case 2: + tcg_gen_ext32u_i64(reg, reg); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); + return true; +} + +TRANS_FEAT(CLASTA_r, aa64_sve, do_clast_general, a, false) +TRANS_FEAT(CLASTB_r, aa64_sve, do_clast_general, a, true) + +/* Compute LAST for a scalar. */ +static TCGv_i64 do_last_scalar(DisasContext *s, int esz, + int pg, int rm, bool before) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ret; + + find_last_active(s, last, esz, pg); + if (before) { + wrap_last_active(s, last, esz); + } else { + incr_last_active(s, last, esz); + } + + ret = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + return ret; +} + +/* Compute LAST for a Vreg. */ +static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + write_fp_dreg(s, a->rd, val); + tcg_temp_free_i64(val); + } + return true; +} + +TRANS_FEAT(LASTA_v, aa64_sve, do_last_fp, a, false) +TRANS_FEAT(LASTB_v, aa64_sve, do_last_fp, a, true) + +/* Compute LAST for a Xreg. */ +static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + tcg_gen_mov_i64(cpu_reg(s, a->rd), val); + tcg_temp_free_i64(val); + } + return true; +} + +TRANS_FEAT(LASTA_r, aa64_sve, do_last_general, a, false) +TRANS_FEAT(LASTB_r, aa64_sve, do_last_general, a, true) + +static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int ofs = vec_reg_offset(s, a->rn, 0, a->esz); + TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); + tcg_temp_free_i64(t); + } + return true; +} + +static gen_helper_gvec_3 * const revb_fns[4] = { + NULL, gen_helper_sve_revb_h, + gen_helper_sve_revb_s, gen_helper_sve_revb_d, +}; +TRANS_FEAT(REVB, aa64_sve, gen_gvec_ool_arg_zpz, revb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const revh_fns[4] = { + NULL, NULL, gen_helper_sve_revh_s, gen_helper_sve_revh_d, +}; +TRANS_FEAT(REVH, aa64_sve, gen_gvec_ool_arg_zpz, revh_fns[a->esz], a, 0) + +TRANS_FEAT(REVW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? 
gen_helper_sve_revw_d : NULL, a, 0) + +TRANS_FEAT(REVD, aa64_sme, gen_gvec_ool_arg_zpz, gen_helper_sme_revd_q, a, 0) + +TRANS_FEAT(SPLICE, aa64_sve, gen_gvec_ool_arg_zpzz, + gen_helper_sve_splice, a, a->esz) + +TRANS_FEAT(SPLICE_sve2, aa64_sve2, gen_gvec_ool_zzzp, gen_helper_sve_splice, + a->rd, a->rn, (a->rn + 1) % 32, a->pg, a->esz) + +/* + *** SVE Integer Compare - Vectors Group + */ + +static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_flags_4 *gen_fn) +{ + TCGv_ptr pd, zn, zm, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_temp_new_i32(); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + zm = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, zm, pg, tcg_constant_i32(simd_desc(vsz, vsz, 0))); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZZ(NAME, name) \ + static gen_helper_gvec_flags_4 * const name##_ppzz_fns[4] = { \ + gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ + gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ + }; \ + TRANS_FEAT(NAME##_ppzz, aa64_sve, do_ppzz_flags, \ + a, name##_ppzz_fns[a->esz]) + +DO_PPZZ(CMPEQ, cmpeq) +DO_PPZZ(CMPNE, cmpne) +DO_PPZZ(CMPGT, cmpgt) +DO_PPZZ(CMPGE, cmpge) +DO_PPZZ(CMPHI, cmphi) +DO_PPZZ(CMPHS, cmphs) + +#undef DO_PPZZ + +#define DO_PPZW(NAME, name) \ + static gen_helper_gvec_flags_4 * const name##_ppzw_fns[4] = { \ + gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ + gen_helper_sve_##name##_ppzw_s, NULL \ + }; \ + TRANS_FEAT(NAME##_ppzw, aa64_sve, do_ppzz_flags, \ + a, name##_ppzw_fns[a->esz]) + +DO_PPZW(CMPEQ, cmpeq) +DO_PPZW(CMPNE, cmpne) +DO_PPZW(CMPGT, cmpgt) +DO_PPZW(CMPGE, cmpge) +DO_PPZW(CMPHI, cmphi) +DO_PPZW(CMPHS, cmphs) +DO_PPZW(CMPLT, cmplt) +DO_PPZW(CMPLE, cmple) +DO_PPZW(CMPLO, cmplo) +DO_PPZW(CMPLS, cmpls) + +#undef DO_PPZW + +/* + *** SVE Integer Compare - Immediate Groups + */ + +static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, + gen_helper_gvec_flags_3 *gen_fn) +{ + TCGv_ptr pd, zn, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_temp_new_i32(); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, pg, tcg_constant_i32(simd_desc(vsz, vsz, a->imm))); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZI(NAME, name) \ + static gen_helper_gvec_flags_3 * const name##_ppzi_fns[4] = { \ + gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ + gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ + }; \ + TRANS_FEAT(NAME##_ppzi, aa64_sve, do_ppzi_flags, a, \ + name##_ppzi_fns[a->esz]) + +DO_PPZI(CMPEQ, cmpeq) 
+DO_PPZI(CMPNE, cmpne) +DO_PPZI(CMPGT, cmpgt) +DO_PPZI(CMPGE, cmpge) +DO_PPZI(CMPHI, cmphi) +DO_PPZI(CMPHS, cmphs) +DO_PPZI(CMPLT, cmplt) +DO_PPZI(CMPLE, cmple) +DO_PPZI(CMPLO, cmplo) +DO_PPZI(CMPLS, cmpls) + +#undef DO_PPZI + +/* + *** SVE Partition Break Group + */ + +static bool do_brk3(DisasContext *s, arg_rprr_s *a, + gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr m = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + TCGv_i32 t = tcg_temp_new_i32(); + fn_s(t, d, n, m, g, desc); + do_pred_flags(t); + tcg_temp_free_i32(t); + } else { + fn(d, n, m, g, desc); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(m); + tcg_temp_free_ptr(g); + return true; +} + +static bool do_brk2(DisasContext *s, arg_rpr_s *a, + gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + TCGv_i32 t = tcg_temp_new_i32(); + fn_s(t, d, n, g, desc); + do_pred_flags(t); + tcg_temp_free_i32(t); + } else { + fn(d, n, g, desc); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(g); + return true; +} + +TRANS_FEAT(BRKPA, aa64_sve, do_brk3, a, + gen_helper_sve_brkpa, gen_helper_sve_brkpas) +TRANS_FEAT(BRKPB, aa64_sve, do_brk3, a, + gen_helper_sve_brkpb, gen_helper_sve_brkpbs) + +TRANS_FEAT(BRKA_m, aa64_sve, do_brk2, a, + gen_helper_sve_brka_m, gen_helper_sve_brkas_m) +TRANS_FEAT(BRKB_m, aa64_sve, do_brk2, a, + gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m) + +TRANS_FEAT(BRKA_z, aa64_sve, do_brk2, a, + gen_helper_sve_brka_z, gen_helper_sve_brkas_z) +TRANS_FEAT(BRKB_z, aa64_sve, do_brk2, a, + gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z) + +TRANS_FEAT(BRKN, aa64_sve, do_brk2, a, + gen_helper_sve_brkn, gen_helper_sve_brkns) + +/* + *** SVE Predicate Count Group + */ + +static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) +{ + unsigned psz = pred_full_reg_size(s); + + if (psz <= 8) { + uint64_t psz_mask; + + tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); + if (pn != pg) { + TCGv_i64 g = tcg_temp_new_i64(); + tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_and_i64(val, val, g); + tcg_temp_free_i64(g); + } + + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. 
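+ * The masked predicate is then popcounted to yield the element count.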
+ */ + psz_mask = MAKE_64BIT_MASK(0, psz * 8); + tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); + + tcg_gen_ctpop_i64(val, val); + } else { + TCGv_ptr t_pn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, psz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); + + tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + gen_helper_sve_cntp(val, t_pn, t_pg, tcg_constant_i32(desc)); + tcg_temp_free_ptr(t_pn); + tcg_temp_free_ptr(t_pg); + } +} + +static bool trans_CNTP(DisasContext *s, arg_CNTP *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); + } + return true; +} + +static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + if (a->d) { + tcg_gen_sub_i64(reg, reg, val); + } else { + tcg_gen_add_i64(reg, reg, val); + } + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 val = tcg_temp_new_i64(); + GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; + + do_cntp(s, val, a->esz, a->pg, a->pg); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), val, vsz, vsz); + } + return true; +} + +static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_32(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_64(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_temp_new_i64(); + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); + } + return true; +} + +/* + *** SVE Integer Compare Scalars Group + */ + +static bool trans_CTERM(DisasContext *s, arg_CTERM *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); + TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); + TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); + TCGv_i64 cmp = tcg_temp_new_i64(); + + tcg_gen_setcond_i64(cond, cmp, rn, rm); + tcg_gen_extrl_i64_i32(cpu_NF, cmp); + tcg_temp_free_i64(cmp); + + /* VF = !NF & !CF. */ + tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); + + /* Both NF and VF actually look at bit 31. 
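+ * Negating the 0/1 setcond results replicates them into bit 31.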
*/ + tcg_gen_neg_i32(cpu_NF, cpu_NF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + return true; +} + +static bool trans_WHILE(DisasContext *s, arg_WHILE *a) +{ + TCGv_i64 op0, op1, t0, t1, tmax; + TCGv_i32 t2; + TCGv_ptr ptr; + unsigned vsz = vec_full_reg_size(s); + unsigned desc = 0; + TCGCond cond; + uint64_t maxval; + /* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */ + bool eq = a->eq == a->lt; + + /* The greater-than conditions are all SVE2. */ + if (a->lt + ? !dc_isar_feature(aa64_sve, s) + : !dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + op0 = read_cpu_reg(s, a->rn, 1); + op1 = read_cpu_reg(s, a->rm, 1); + + if (!a->sf) { + if (a->u) { + tcg_gen_ext32u_i64(op0, op0); + tcg_gen_ext32u_i64(op1, op1); + } else { + tcg_gen_ext32s_i64(op0, op0); + tcg_gen_ext32s_i64(op1, op1); + } + } + + /* For the helper, compress the different conditions into a computation + * of how many iterations for which the condition is true. + */ + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + + if (a->lt) { + tcg_gen_sub_i64(t0, op1, op0); + if (a->u) { + maxval = a->sf ? UINT64_MAX : UINT32_MAX; + cond = eq ? TCG_COND_LEU : TCG_COND_LTU; + } else { + maxval = a->sf ? INT64_MAX : INT32_MAX; + cond = eq ? TCG_COND_LE : TCG_COND_LT; + } + } else { + tcg_gen_sub_i64(t0, op0, op1); + if (a->u) { + maxval = 0; + cond = eq ? TCG_COND_GEU : TCG_COND_GTU; + } else { + maxval = a->sf ? INT64_MIN : INT32_MIN; + cond = eq ? TCG_COND_GE : TCG_COND_GT; + } + } + + tmax = tcg_constant_i64(vsz >> a->esz); + if (eq) { + /* Equality means one more iteration. */ + tcg_gen_addi_i64(t0, t0, 1); + + /* + * For the less-than while, if op1 is maxval (and the only time + * the addition above could overflow), then we produce an all-true + * predicate by setting the count to the vector length. This is + * because the pseudocode is described as an increment + compare + * loop, and the maximum integer would always compare true. + * Similarly, the greater-than while has the same issue with the + * minimum integer due to the decrement + compare loop. + */ + tcg_gen_movi_i64(t1, maxval); + tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0); + } + + /* Bound to the maximum. */ + tcg_gen_umin_i64(t0, t0, tmax); + + /* Set the count to zero if the condition is false. */ + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + tcg_temp_free_i64(t1); + + /* Since we're bounded, pass as a 32-bit type. */ + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, t0); + tcg_temp_free_i64(t0); + + /* Scale elements to bits. 
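+ * Each element of 1 << esz bytes covers 1 << esz predicate bits.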
*/ + tcg_gen_shli_i32(t2, t2, a->esz); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + if (a->lt) { + gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); + } else { + gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc)); + } + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + return true; +} + +static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) +{ + TCGv_i64 op0, op1, diff, t1, tmax; + TCGv_i32 t2; + TCGv_ptr ptr; + unsigned vsz = vec_full_reg_size(s); + unsigned desc = 0; + + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + op0 = read_cpu_reg(s, a->rn, 1); + op1 = read_cpu_reg(s, a->rm, 1); + + tmax = tcg_constant_i64(vsz); + diff = tcg_temp_new_i64(); + + if (a->rw) { + /* WHILERW */ + /* diff = abs(op1 - op0), noting that op0/1 are unsigned. */ + t1 = tcg_temp_new_i64(); + tcg_gen_sub_i64(diff, op0, op1); + tcg_gen_sub_i64(t1, op1, op0); + tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1); + tcg_temp_free_i64(t1); + /* Round down to a multiple of ESIZE. */ + tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* If op1 == op0, diff == 0, and the condition is always true. */ + tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff); + } else { + /* WHILEWR */ + tcg_gen_sub_i64(diff, op1, op0); + /* Round down to a multiple of ESIZE. */ + tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* If op0 >= op1, diff <= 0, the condition is always true. */ + tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff); + } + + /* Bound to the maximum. */ + tcg_gen_umin_i64(diff, diff, tmax); + + /* Since we're bounded, pass as a 32-bit type. */ + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, diff); + tcg_temp_free_i64(diff); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + return true; +} + +/* + *** SVE Integer Wide Immediate - Unpredicated Group + */ + +static bool trans_FDUP(DisasContext *s, arg_FDUP *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + uint64_t imm; + + /* Decode the VFP immediate. 
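+ * The 8-bit encoding expands to a full-width constant, as for FMOV.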
*/ + imm = vfp_expand_imm(a->esz, a->imm); + tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm); + } + return true; +} + +static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm); + } + return true; +} + +TRANS_FEAT(ADD_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_addi, a) + +static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a) +{ + a->imm = -a->imm; + return trans_ADD_zzi(s, a); +} + +static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_sub_vec, 0 }; + static const GVecGen2s op[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_b, + .opt_opc = vecop_list, + .vece = MO_8, + .scalar_first = true }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_h, + .opt_opc = vecop_list, + .vece = MO_16, + .scalar_first = true }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_s, + .opt_opc = vecop_list, + .vece = MO_32, + .scalar_first = true }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_d, + .opt_opc = vecop_list, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64, + .scalar_first = true } + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, tcg_constant_i64(a->imm), &op[a->esz]); + } + return true; +} + +TRANS_FEAT(MUL_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_muli, a) + +static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, bool u, bool d) +{ + if (sve_access_check(s)) { + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, + tcg_constant_i64(a->imm), u, d); + } + return true; +} + +TRANS_FEAT(SQADD_zzi, aa64_sve, do_zzi_sat, a, false, false) +TRANS_FEAT(UQADD_zzi, aa64_sve, do_zzi_sat, a, true, false) +TRANS_FEAT(SQSUB_zzi, aa64_sve, do_zzi_sat, a, false, true) +TRANS_FEAT(UQSUB_zzi, aa64_sve, do_zzi_sat, a, true, true) + +static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + tcg_constant_i64(a->imm), vsz, vsz, 0, fn); + } + return true; +} + +#define DO_ZZI(NAME, name) \ + static gen_helper_gvec_2i * const name##i_fns[4] = { \ + gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ + gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ + }; \ + TRANS_FEAT(NAME##_zzi, aa64_sve, do_zzi_ool, a, name##i_fns[a->esz]) + +DO_ZZI(SMAX, smax) +DO_ZZI(UMAX, umax) +DO_ZZI(SMIN, smin) +DO_ZZI(UMIN, umin) + +#undef DO_ZZI + +static gen_helper_gvec_4 * const dot_fns[2][2] = { + { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, + { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } +}; +TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz, + dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0) + +/* + * SVE Multiply - Indexed + */ + +TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_b, a) +TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_h, a) +TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, + 
gen_helper_gvec_udot_idx_b, a) +TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_udot_idx_h, a) + +TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sudot_idx_b, a) +TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_usdot_idx_b, a) + +#define DO_SVE2_RRX(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ + a->rd, a->rn, a->rm, a->index) + +DO_SVE2_RRX(MUL_zzx_h, gen_helper_gvec_mul_idx_h) +DO_SVE2_RRX(MUL_zzx_s, gen_helper_gvec_mul_idx_s) +DO_SVE2_RRX(MUL_zzx_d, gen_helper_gvec_mul_idx_d) + +DO_SVE2_RRX(SQDMULH_zzx_h, gen_helper_sve2_sqdmulh_idx_h) +DO_SVE2_RRX(SQDMULH_zzx_s, gen_helper_sve2_sqdmulh_idx_s) +DO_SVE2_RRX(SQDMULH_zzx_d, gen_helper_sve2_sqdmulh_idx_d) + +DO_SVE2_RRX(SQRDMULH_zzx_h, gen_helper_sve2_sqrdmulh_idx_h) +DO_SVE2_RRX(SQRDMULH_zzx_s, gen_helper_sve2_sqrdmulh_idx_s) +DO_SVE2_RRX(SQRDMULH_zzx_d, gen_helper_sve2_sqrdmulh_idx_d) + +#undef DO_SVE2_RRX + +#define DO_SVE2_RRX_TB(NAME, FUNC, TOP) \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ + a->rd, a->rn, a->rm, (a->index << 1) | TOP) + +DO_SVE2_RRX_TB(SQDMULLB_zzx_s, gen_helper_sve2_sqdmull_idx_s, false) +DO_SVE2_RRX_TB(SQDMULLB_zzx_d, gen_helper_sve2_sqdmull_idx_d, false) +DO_SVE2_RRX_TB(SQDMULLT_zzx_s, gen_helper_sve2_sqdmull_idx_s, true) +DO_SVE2_RRX_TB(SQDMULLT_zzx_d, gen_helper_sve2_sqdmull_idx_d, true) + +DO_SVE2_RRX_TB(SMULLB_zzx_s, gen_helper_sve2_smull_idx_s, false) +DO_SVE2_RRX_TB(SMULLB_zzx_d, gen_helper_sve2_smull_idx_d, false) +DO_SVE2_RRX_TB(SMULLT_zzx_s, gen_helper_sve2_smull_idx_s, true) +DO_SVE2_RRX_TB(SMULLT_zzx_d, gen_helper_sve2_smull_idx_d, true) + +DO_SVE2_RRX_TB(UMULLB_zzx_s, gen_helper_sve2_umull_idx_s, false) +DO_SVE2_RRX_TB(UMULLB_zzx_d, gen_helper_sve2_umull_idx_d, false) +DO_SVE2_RRX_TB(UMULLT_zzx_s, gen_helper_sve2_umull_idx_s, true) +DO_SVE2_RRX_TB(UMULLT_zzx_d, gen_helper_sve2_umull_idx_d, true) + +#undef DO_SVE2_RRX_TB + +#define DO_SVE2_RRXR(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzxz, FUNC, a) + +DO_SVE2_RRXR(MLA_zzxz_h, gen_helper_gvec_mla_idx_h) +DO_SVE2_RRXR(MLA_zzxz_s, gen_helper_gvec_mla_idx_s) +DO_SVE2_RRXR(MLA_zzxz_d, gen_helper_gvec_mla_idx_d) + +DO_SVE2_RRXR(MLS_zzxz_h, gen_helper_gvec_mls_idx_h) +DO_SVE2_RRXR(MLS_zzxz_s, gen_helper_gvec_mls_idx_s) +DO_SVE2_RRXR(MLS_zzxz_d, gen_helper_gvec_mls_idx_d) + +DO_SVE2_RRXR(SQRDMLAH_zzxz_h, gen_helper_sve2_sqrdmlah_idx_h) +DO_SVE2_RRXR(SQRDMLAH_zzxz_s, gen_helper_sve2_sqrdmlah_idx_s) +DO_SVE2_RRXR(SQRDMLAH_zzxz_d, gen_helper_sve2_sqrdmlah_idx_d) + +DO_SVE2_RRXR(SQRDMLSH_zzxz_h, gen_helper_sve2_sqrdmlsh_idx_h) +DO_SVE2_RRXR(SQRDMLSH_zzxz_s, gen_helper_sve2_sqrdmlsh_idx_s) +DO_SVE2_RRXR(SQRDMLSH_zzxz_d, gen_helper_sve2_sqrdmlsh_idx_d) + +#undef DO_SVE2_RRXR + +#define DO_SVE2_RRXR_TB(NAME, FUNC, TOP) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ + a->rd, a->rn, a->rm, a->ra, (a->index << 1) | TOP) + +DO_SVE2_RRXR_TB(SQDMLALB_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, false) +DO_SVE2_RRXR_TB(SQDMLALB_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, false) +DO_SVE2_RRXR_TB(SQDMLALT_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, true) +DO_SVE2_RRXR_TB(SQDMLALT_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, true) + +DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, false) +DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, false) +DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, true) +DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, true) + 
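+
+/* As above, the helpers decompose the data field back into the index
+ * and the bottom/top selection; e.g. index 3 with TOP set arrives as
+ * (3 << 1) | 1.
+ */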
+DO_SVE2_RRXR_TB(SMLALB_zzxw_s, gen_helper_sve2_smlal_idx_s, false) +DO_SVE2_RRXR_TB(SMLALB_zzxw_d, gen_helper_sve2_smlal_idx_d, false) +DO_SVE2_RRXR_TB(SMLALT_zzxw_s, gen_helper_sve2_smlal_idx_s, true) +DO_SVE2_RRXR_TB(SMLALT_zzxw_d, gen_helper_sve2_smlal_idx_d, true) + +DO_SVE2_RRXR_TB(UMLALB_zzxw_s, gen_helper_sve2_umlal_idx_s, false) +DO_SVE2_RRXR_TB(UMLALB_zzxw_d, gen_helper_sve2_umlal_idx_d, false) +DO_SVE2_RRXR_TB(UMLALT_zzxw_s, gen_helper_sve2_umlal_idx_s, true) +DO_SVE2_RRXR_TB(UMLALT_zzxw_d, gen_helper_sve2_umlal_idx_d, true) + +DO_SVE2_RRXR_TB(SMLSLB_zzxw_s, gen_helper_sve2_smlsl_idx_s, false) +DO_SVE2_RRXR_TB(SMLSLB_zzxw_d, gen_helper_sve2_smlsl_idx_d, false) +DO_SVE2_RRXR_TB(SMLSLT_zzxw_s, gen_helper_sve2_smlsl_idx_s, true) +DO_SVE2_RRXR_TB(SMLSLT_zzxw_d, gen_helper_sve2_smlsl_idx_d, true) + +DO_SVE2_RRXR_TB(UMLSLB_zzxw_s, gen_helper_sve2_umlsl_idx_s, false) +DO_SVE2_RRXR_TB(UMLSLB_zzxw_d, gen_helper_sve2_umlsl_idx_d, false) +DO_SVE2_RRXR_TB(UMLSLT_zzxw_s, gen_helper_sve2_umlsl_idx_s, true) +DO_SVE2_RRXR_TB(UMLSLT_zzxw_d, gen_helper_sve2_umlsl_idx_d, true) + +#undef DO_SVE2_RRXR_TB + +#define DO_SVE2_RRXR_ROT(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ + a->rd, a->rn, a->rm, a->ra, (a->index << 2) | a->rot) + +DO_SVE2_RRXR_ROT(CMLA_zzxz_h, gen_helper_sve2_cmla_idx_h) +DO_SVE2_RRXR_ROT(CMLA_zzxz_s, gen_helper_sve2_cmla_idx_s) + +DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_h, gen_helper_sve2_sqrdcmlah_idx_h) +DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_s, gen_helper_sve2_sqrdcmlah_idx_s) + +DO_SVE2_RRXR_ROT(CDOT_zzxw_s, gen_helper_sve2_cdot_idx_s) +DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d) + +#undef DO_SVE2_RRXR_ROT + +/* + *** SVE Floating Point Multiply-Add Indexed Group + */ + +static bool do_FMLA_zzxz(DisasContext *s, arg_rrxr_esz *a, bool sub) +{ + static gen_helper_gvec_4_ptr * const fns[4] = { + NULL, + gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, + gen_helper_gvec_fmla_idx_d, + }; + return gen_gvec_fpst_zzzz(s, fns[a->esz], a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sub, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +TRANS_FEAT(FMLA_zzxz, aa64_sve, do_FMLA_zzxz, a, false) +TRANS_FEAT(FMLS_zzxz, aa64_sve, do_FMLA_zzxz, a, true) + +/* + *** SVE Floating Point Multiply Indexed Group + */ + +static gen_helper_gvec_3_ptr * const fmul_idx_fns[4] = { + NULL, gen_helper_gvec_fmul_idx_h, + gen_helper_gvec_fmul_idx_s, gen_helper_gvec_fmul_idx_d, +}; +TRANS_FEAT(FMUL_zzx, aa64_sve, gen_gvec_fpst_zzz, + fmul_idx_fns[a->esz], a->rd, a->rn, a->rm, a->index, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Fast Reduction Group + */ + +typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); + +static bool do_reduce(DisasContext *s, arg_rpr_esz *a, + gen_helper_fp_reduce *fn) +{ + unsigned vsz, p2vsz; + TCGv_i32 t_desc; + TCGv_ptr t_zn, t_pg, status; + TCGv_i64 temp; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + p2vsz = pow2ceil(vsz); + t_desc = tcg_constant_i32(simd_desc(vsz, vsz, p2vsz)); + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + fn(temp, t_zn, t_pg, status, t_desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(status); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); + return true; +} + +#define DO_VPZ(NAME, name) \ + static gen_helper_fp_reduce * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_reduce, a, name##_fns[a->esz]) + +DO_VPZ(FADDV, faddv) +DO_VPZ(FMINNMV, fminnmv) +DO_VPZ(FMAXNMV, fmaxnmv) +DO_VPZ(FMINV, fminv) +DO_VPZ(FMAXV, fmaxv) + +#undef DO_VPZ + +/* + *** SVE Floating Point Unary Operations - Unpredicated Group + */ + +static gen_helper_gvec_2_ptr * const frecpe_fns[] = { + NULL, gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d, +}; +TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_arg_zz, frecpe_fns[a->esz], a, 0) + +static gen_helper_gvec_2_ptr * const frsqrte_fns[] = { + NULL, gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d, +}; +TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_arg_zz, frsqrte_fns[a->esz], a, 0) + +/* + *** SVE Floating Point Compare with Zero Group + */ + +static bool do_ppz_fp(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = + fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + + tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_PPZ(NAME, name) \ + static gen_helper_gvec_3_ptr * const name##_fns[] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_ppz_fp, a, name##_fns[a->esz]) + +DO_PPZ(FCMGE_ppz0, fcmge0) +DO_PPZ(FCMGT_ppz0, fcmgt0) +DO_PPZ(FCMLE_ppz0, fcmle0) +DO_PPZ(FCMLT_ppz0, fcmlt0) +DO_PPZ(FCMEQ_ppz0, fcmeq0) +DO_PPZ(FCMNE_ppz0, fcmne0) + +#undef DO_PPZ + +/* + *** SVE floating-point trig multiply-add coefficient + */ + +static gen_helper_gvec_3_ptr * const ftmad_fns[4] = { + NULL, gen_helper_sve_ftmad_h, + gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d, +}; +TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz, + ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Accumulating Reduction Group + */ + +static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) +{ + typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + static fadda_fn * const fns[3] = { + gen_helper_sve_fadda_h, + gen_helper_sve_fadda_s, + gen_helper_sve_fadda_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_rm, t_pg, t_fpst; + TCGv_i64 t_val; + TCGv_i32 t_desc; + + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz); + t_rm = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + t_fpst = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + t_desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + + fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc); + + tcg_temp_free_ptr(t_fpst); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_rm); + + write_fp_dreg(s, a->rd, t_val); + tcg_temp_free_i64(t_val); + return true; +} + +/* + *** SVE Floating Point Arithmetic - Unpredicated Group + */ + +#define DO_FP3(NAME, name) \ + static gen_helper_gvec_3_ptr * const name##_fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0) + +DO_FP3(FADD_zzz, fadd) +DO_FP3(FSUB_zzz, fsub) +DO_FP3(FMUL_zzz, fmul) +DO_FP3(FRECPS, recps) +DO_FP3(FRSQRTS, rsqrts) + +#undef DO_FP3 + +static gen_helper_gvec_3_ptr * const ftsmul_fns[4] = { + NULL, gen_helper_gvec_ftsmul_h, + gen_helper_gvec_ftsmul_s, gen_helper_gvec_ftsmul_d +}; +TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, + ftsmul_fns[a->esz], a, 0) + +/* + *** SVE Floating Point Arithmetic - Predicated Group + */ + +#define DO_ZPZZ_FP(NAME, FEAT, name) \ + static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ + NULL, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a) + +DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd) +DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub) +DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul) +DO_ZPZZ_FP(FMIN_zpzz, aa64_sve, sve_fmin) +DO_ZPZZ_FP(FMAX_zpzz, aa64_sve, sve_fmax) +DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum) +DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) +DO_ZPZZ_FP(FABD, aa64_sve, sve_fabd) +DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn) +DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv) +DO_ZPZZ_FP(FMULX, aa64_sve, sve_fmulx) + +typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_i64, TCGv_ptr, TCGv_i32); + +static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16, + TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zd, t_zn, t_pg, status; + TCGv_i32 desc; + + t_zd = tcg_temp_new_ptr(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + status = fpstatus_ptr(is_fp16 ? 
FPST_FPCR_F16 : FPST_FPCR); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + fn(t_zd, t_zn, t_pg, scalar, status, desc); + + tcg_temp_free_ptr(status); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_zd); +} + +static bool do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, + gen_helper_sve_fp2scalar *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, + tcg_constant_i64(imm), fn); + } + return true; +} + +#define DO_FP_IMM(NAME, name, const0, const1) \ + static gen_helper_sve_fp2scalar * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d \ + }; \ + static uint64_t const name##_const[4][2] = { \ + { -1, -1 }, \ + { float16_##const0, float16_##const1 }, \ + { float32_##const0, float32_##const1 }, \ + { float64_##const0, float64_##const1 }, \ + }; \ + TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \ + name##_const[a->esz][a->imm], name##_fns[a->esz]) + +DO_FP_IMM(FADD, fadds, half, one) +DO_FP_IMM(FSUB, fsubs, half, one) +DO_FP_IMM(FMUL, fmuls, half, two) +DO_FP_IMM(FSUBR, fsubrs, half, one) +DO_FP_IMM(FMAXNM, fmaxnms, zero, one) +DO_FP_IMM(FMINNM, fminnms, zero, one) +DO_FP_IMM(FMAX, fmaxs, zero, one) +DO_FP_IMM(FMIN, fmins, zero, one) + +#undef DO_FP_IMM + +static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FPCMP(NAME, name) \ + static gen_helper_gvec_4_ptr * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + TRANS_FEAT(NAME##_ppzz, aa64_sve, do_fp_cmp, a, name##_fns[a->esz]) + +DO_FPCMP(FCMGE, fcmge) +DO_FPCMP(FCMGT, fcmgt) +DO_FPCMP(FCMEQ, fcmeq) +DO_FPCMP(FCMNE, fcmne) +DO_FPCMP(FCMUO, fcmuo) +DO_FPCMP(FACGE, facge) +DO_FPCMP(FACGT, facgt) + +#undef DO_FPCMP + +static gen_helper_gvec_4_ptr * const fcadd_fns[] = { + NULL, gen_helper_sve_fcadd_h, + gen_helper_sve_fcadd_s, gen_helper_sve_fcadd_d, +}; +TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz], + a->rd, a->rn, a->rm, a->pg, a->rot, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +#define DO_FMLA(NAME, name) \ + static gen_helper_gvec_5_ptr * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, name##_fns[a->esz], \ + a->rd, a->rn, a->rm, a->ra, a->pg, 0, \ + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +DO_FMLA(FMLA_zpzzz, fmla_zpzzz) +DO_FMLA(FMLS_zpzzz, fmls_zpzzz) +DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) +DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) + +#undef DO_FMLA + +static gen_helper_gvec_5_ptr * const fcmla_fns[4] = { + NULL, gen_helper_sve_fcmla_zpzzz_h, + gen_helper_sve_fcmla_zpzzz_s, gen_helper_sve_fcmla_zpzzz_d, +}; +TRANS_FEAT(FCMLA_zpzzz, aa64_sve, gen_gvec_fpst_zzzzp, fcmla_fns[a->esz], + a->rd, a->rn, a->rm, a->ra, a->pg, a->rot, + a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_4_ptr * const fcmla_idx_fns[4] = { + NULL, gen_helper_gvec_fcmlah_idx, gen_helper_gvec_fcmlas_idx, NULL +}; +TRANS_FEAT(FCMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, fcmla_idx_fns[a->esz], + a->rd, a->rn, a->rm, a->ra, a->index * 4 + a->rot, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Unary Operations Predicated Group + */ + +TRANS_FEAT(FCVT_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_sh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_hs, a, 0, FPST_FPCR) + +TRANS_FEAT(BFCVT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, + gen_helper_sve_bfcvt, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVT_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_dh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_hd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_ds, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTZS_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZS_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hs, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hs, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZS_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hd, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hd, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(FCVTZS_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_ss, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_ss, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZS_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_sd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_sd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZS_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_ds, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTZS_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_dd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_dd, a, 0, FPST_FPCR) + +static gen_helper_gvec_3_ptr * const frint_fns[] = { + NULL, + gen_helper_sve_frint_h, + gen_helper_sve_frint_s, + gen_helper_sve_frint_d +}; +TRANS_FEAT(FRINTI, aa64_sve, gen_gvec_fpst_arg_zpz, frint_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_3_ptr * const frintx_fns[] = { + NULL, + gen_helper_sve_frintx_h, + gen_helper_sve_frintx_s, + gen_helper_sve_frintx_d +}; +TRANS_FEAT(FRINTX, aa64_sve, gen_gvec_fpst_arg_zpz, frintx_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + +static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, + int mode, gen_helper_gvec_3_ptr *fn) +{ + unsigned vsz; + TCGv_i32 tmode; + TCGv_ptr status; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + tmode = tcg_const_i32(mode); + status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + gen_helper_set_rmode(tmode, tmode, status); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + + gen_helper_set_rmode(tmode, tmode, status); + tcg_temp_free_i32(tmode); + tcg_temp_free_ptr(status); + return true; +} + +TRANS_FEAT(FRINTN, aa64_sve, do_frint_mode, a, + float_round_nearest_even, frint_fns[a->esz]) +TRANS_FEAT(FRINTP, aa64_sve, do_frint_mode, a, + float_round_up, frint_fns[a->esz]) +TRANS_FEAT(FRINTM, aa64_sve, do_frint_mode, a, + float_round_down, frint_fns[a->esz]) +TRANS_FEAT(FRINTZ, aa64_sve, do_frint_mode, a, + float_round_to_zero, frint_fns[a->esz]) +TRANS_FEAT(FRINTA, aa64_sve, do_frint_mode, a, + float_round_ties_away, frint_fns[a->esz]) + +static gen_helper_gvec_3_ptr * const frecpx_fns[] = { + NULL, gen_helper_sve_frecpx_h, + gen_helper_sve_frecpx_s, gen_helper_sve_frecpx_d, +}; +TRANS_FEAT(FRECPX, aa64_sve, gen_gvec_fpst_arg_zpz, frecpx_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_3_ptr * const fsqrt_fns[] = { + NULL, gen_helper_sve_fsqrt_h, + gen_helper_sve_fsqrt_s, gen_helper_sve_fsqrt_d, +}; +TRANS_FEAT(FSQRT, aa64_sve, gen_gvec_fpst_arg_zpz, fsqrt_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +TRANS_FEAT(SCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(SCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_sh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(SCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_dh, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(SCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_ss, a, 0, FPST_FPCR) +TRANS_FEAT(SCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(SCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_sd, a, 0, FPST_FPCR) +TRANS_FEAT(SCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_dd, a, 0, FPST_FPCR) + +TRANS_FEAT(UCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(UCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_sh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(UCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_dh, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(UCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_ss, a, 0, FPST_FPCR) +TRANS_FEAT(UCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_ds, a, 0, FPST_FPCR) +TRANS_FEAT(UCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_dd, a, 0, FPST_FPCR) + +/* + *** SVE Memory - 32-bit Gather and Unsized Contiguous Group + */ + +/* Subroutine loading a vector register at VOFS of LEN bytes. + * The load should begin at the address Rn + IMM. 
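+ * Callers pass the full vector or predicate register size as LEN
+ * and scale IMM by that same size.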
+ */ + +void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) +{ + int len_align = QEMU_ALIGN_DOWN(len, 8); + int len_remain = len % 8; + int nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 dirty_addr, clean_addr, t0, t1; + + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + tcg_temp_free_i64(dirty_addr); + + /* + * Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. + */ + if (nparts <= 4) { + int i; + + t0 = tcg_temp_new_i64(); + for (i = 0; i < len_align; i += 8) { + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_st_i64(t0, base, vofs + i); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + tcg_temp_free_i64(t0); + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr tp, i = tcg_const_local_ptr(0); + + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = new_tmp_a64_local(s); + tcg_gen_mov_i64(clean_addr, t0); + + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + + gen_set_label(loop); + + t0 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + + tp = tcg_temp_new_ptr(); + tcg_gen_add_ptr(tp, base, i); + tcg_gen_addi_ptr(i, i, 8); + tcg_gen_st_i64(t0, tp, vofs); + tcg_temp_free_ptr(tp); + tcg_temp_free_i64(t0); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } + } + + /* + * Predicate register loads can be any multiple of 2. + * Note that we still store the entire 64-bit unit into cpu_env. + */ + if (len_remain) { + t0 = tcg_temp_new_i64(); + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); + break; + + case 6: + t1 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); + tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); + tcg_gen_deposit_i64(t0, t0, t1, 32, 32); + tcg_temp_free_i64(t1); + break; + + default: + g_assert_not_reached(); + } + tcg_gen_st_i64(t0, base, vofs + len_align); + tcg_temp_free_i64(t0); + } +} + +/* Similarly for stores. */ +void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) +{ + int len_align = QEMU_ALIGN_DOWN(len, 8); + int len_remain = len % 8; + int nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 dirty_addr, clean_addr, t0; + + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + tcg_temp_free_i64(dirty_addr); + + /* Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. There is no nice way to force + * a little-endian store for aarch64_be-linux-user out of line. + * + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. 
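+ * At most four stores are emitted inline; anything larger loops
+ * over 8-byte units.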
+ */ + if (nparts <= 4) { + int i; + + t0 = tcg_temp_new_i64(); + for (i = 0; i < len_align; i += 8) { + tcg_gen_ld_i64(t0, base, vofs + i); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + tcg_temp_free_i64(t0); + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr tp, i = tcg_const_local_ptr(0); + + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = new_tmp_a64_local(s); + tcg_gen_mov_i64(clean_addr, t0); + + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + + gen_set_label(loop); + + t0 = tcg_temp_new_i64(); + tp = tcg_temp_new_ptr(); + tcg_gen_add_ptr(tp, base, i); + tcg_gen_ld_i64(t0, tp, vofs); + tcg_gen_addi_ptr(i, i, 8); + tcg_temp_free_ptr(tp); + + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_temp_free_i64(t0); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } + } + + /* Predicate register stores can be any multiple of 2. */ + if (len_remain) { + t0 = tcg_temp_new_i64(); + tcg_gen_ld_i64(t0, base, vofs + len_align); + + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_st_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); + break; + + case 6: + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); + break; + + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(t0); + } +} + +static bool trans_LDR_zri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_LDR_pri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_zri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_pri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +/* + *** SVE Memory - Contiguous Load Group + */ + +/* The memory mode of the dtype. */ +static const MemOp dtype_mop[16] = { + MO_UB, MO_UB, MO_UB, MO_UB, + MO_SL, MO_UW, MO_UW, MO_UW, + MO_SW, MO_SW, MO_UL, MO_UL, + MO_SB, MO_SB, MO_SB, MO_UQ +}; + +#define dtype_msz(x) (dtype_mop[x] & MO_SIZE) + +/* The vector element size of dtype. 
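+ * Indexed by the same 4-bit dtype field as dtype_mop above.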
*/ +static const uint8_t dtype_esz[16] = { + 0, 1, 2, 3, + 3, 1, 2, 3, + 3, 2, 2, 3, + 3, 2, 1, 3 +}; + +static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int dtype, uint32_t mte_n, bool is_write, + gen_helper_gvec_mem *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + int desc = 0; + + /* + * For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ + if (s->mte_active[0]) { + int msz = dtype_msz(dtype); + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } else { + addr = clean_data_tbi(s, addr); + } + + desc = simd_desc(vsz, vsz, zt | desc); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fn(cpu_env, t_pg, addr, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_pg); +} + +/* Indexed by [mte][be][dtype][nreg] */ +static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { + { /* mte inactive, little-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r, + gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r }, + { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r, + gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r }, + { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, + gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, + + /* mte inactive, big-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r, + gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r }, + { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r, + gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r }, + { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, + gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } }, + + { /* mte active, little-endian */ + { { gen_helper_sve_ld1bb_r_mte, + 
gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_le_r_mte, + gen_helper_sve_ld2hh_le_r_mte, + gen_helper_sve_ld3hh_le_r_mte, + gen_helper_sve_ld4hh_le_r_mte }, + { gen_helper_sve_ld1hsu_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r_mte, + gen_helper_sve_ld2ss_le_r_mte, + gen_helper_sve_ld3ss_le_r_mte, + gen_helper_sve_ld4ss_le_r_mte }, + { gen_helper_sve_ld1sdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r_mte, + gen_helper_sve_ld2dd_le_r_mte, + gen_helper_sve_ld3dd_le_r_mte, + gen_helper_sve_ld4dd_le_r_mte } }, + + /* mte active, big-endian */ + { { gen_helper_sve_ld1bb_r_mte, + gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r_mte, + gen_helper_sve_ld2hh_be_r_mte, + gen_helper_sve_ld3hh_be_r_mte, + gen_helper_sve_ld4hh_be_r_mte }, + { gen_helper_sve_ld1hsu_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r_mte, + gen_helper_sve_ld2ss_be_r_mte, + gen_helper_sve_ld3ss_be_r_mte, + gen_helper_sve_ld4ss_be_r_mte }, + { gen_helper_sve_ld1sdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r_mte, + gen_helper_sve_ld2dd_be_r_mte, + gen_helper_sve_ld3dd_be_r_mte, + gen_helper_sve_ld4dd_be_r_mte } } }, +}; + +static void do_ld_zpa(DisasContext *s, int zt, int pg, + TCGv_i64 addr, int dtype, int nreg) +{ + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][nreg]; + + /* + * While there are holes in the table, they are not + * accessible via the instruction encoding. 
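+     * (LD2/LD3/LD4 exist only for msz == esz, so e.g. the NULL at
+     * nreg == 1 for dtype 1, a widening byte load, is unreachable.)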
+ */ + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); +} + +static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) + << dtype_msz(a->dtype)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a) +{ + static gen_helper_gvec_mem * const fns[2][2][16] = { + { /* mte inactive, little-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_le_r, + gen_helper_sve_ldff1hh_le_r, + gen_helper_sve_ldff1hsu_le_r, + gen_helper_sve_ldff1hdu_le_r, + + gen_helper_sve_ldff1hds_le_r, + gen_helper_sve_ldff1hss_le_r, + gen_helper_sve_ldff1ss_le_r, + gen_helper_sve_ldff1sdu_le_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_le_r }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_be_r, + gen_helper_sve_ldff1hh_be_r, + gen_helper_sve_ldff1hsu_be_r, + gen_helper_sve_ldff1hdu_be_r, + + gen_helper_sve_ldff1hds_be_r, + gen_helper_sve_ldff1hss_be_r, + gen_helper_sve_ldff1ss_be_r, + gen_helper_sve_ldff1sdu_be_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_be_r } }, + + { /* mte active, little-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_le_r_mte, + gen_helper_sve_ldff1hh_le_r_mte, + gen_helper_sve_ldff1hsu_le_r_mte, + gen_helper_sve_ldff1hdu_le_r_mte, + + gen_helper_sve_ldff1hds_le_r_mte, + gen_helper_sve_ldff1hss_le_r_mte, + gen_helper_sve_ldff1ss_le_r_mte, + gen_helper_sve_ldff1sdu_le_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_le_r_mte }, + + /* mte active, big-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_be_r_mte, + gen_helper_sve_ldff1hh_be_r_mte, + gen_helper_sve_ldff1hsu_be_r_mte, + gen_helper_sve_ldff1hdu_be_r_mte, + + gen_helper_sve_ldff1hds_be_r_mte, + gen_helper_sve_ldff1hss_be_r_mte, + gen_helper_sve_ldff1ss_be_r_mte, + gen_helper_sve_ldff1sdu_be_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_be_r_mte } }, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), 
+                         dtype_msz(a->dtype));
+        tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+        do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false,
+                   fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]);
+    }
+    return true;
+}
+
+static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a)
+{
+    static gen_helper_gvec_mem * const fns[2][2][16] = {
+        { /* mte inactive, little-endian */
+          { gen_helper_sve_ldnf1bb_r,
+            gen_helper_sve_ldnf1bhu_r,
+            gen_helper_sve_ldnf1bsu_r,
+            gen_helper_sve_ldnf1bdu_r,
+
+            gen_helper_sve_ldnf1sds_le_r,
+            gen_helper_sve_ldnf1hh_le_r,
+            gen_helper_sve_ldnf1hsu_le_r,
+            gen_helper_sve_ldnf1hdu_le_r,
+
+            gen_helper_sve_ldnf1hds_le_r,
+            gen_helper_sve_ldnf1hss_le_r,
+            gen_helper_sve_ldnf1ss_le_r,
+            gen_helper_sve_ldnf1sdu_le_r,
+
+            gen_helper_sve_ldnf1bds_r,
+            gen_helper_sve_ldnf1bss_r,
+            gen_helper_sve_ldnf1bhs_r,
+            gen_helper_sve_ldnf1dd_le_r },
+
+          /* mte inactive, big-endian */
+          { gen_helper_sve_ldnf1bb_r,
+            gen_helper_sve_ldnf1bhu_r,
+            gen_helper_sve_ldnf1bsu_r,
+            gen_helper_sve_ldnf1bdu_r,
+
+            gen_helper_sve_ldnf1sds_be_r,
+            gen_helper_sve_ldnf1hh_be_r,
+            gen_helper_sve_ldnf1hsu_be_r,
+            gen_helper_sve_ldnf1hdu_be_r,
+
+            gen_helper_sve_ldnf1hds_be_r,
+            gen_helper_sve_ldnf1hss_be_r,
+            gen_helper_sve_ldnf1ss_be_r,
+            gen_helper_sve_ldnf1sdu_be_r,
+
+            gen_helper_sve_ldnf1bds_r,
+            gen_helper_sve_ldnf1bss_r,
+            gen_helper_sve_ldnf1bhs_r,
+            gen_helper_sve_ldnf1dd_be_r } },
+
+        { /* mte active, little-endian */
+          { gen_helper_sve_ldnf1bb_r_mte,
+            gen_helper_sve_ldnf1bhu_r_mte,
+            gen_helper_sve_ldnf1bsu_r_mte,
+            gen_helper_sve_ldnf1bdu_r_mte,
+
+            gen_helper_sve_ldnf1sds_le_r_mte,
+            gen_helper_sve_ldnf1hh_le_r_mte,
+            gen_helper_sve_ldnf1hsu_le_r_mte,
+            gen_helper_sve_ldnf1hdu_le_r_mte,
+
+            gen_helper_sve_ldnf1hds_le_r_mte,
+            gen_helper_sve_ldnf1hss_le_r_mte,
+            gen_helper_sve_ldnf1ss_le_r_mte,
+            gen_helper_sve_ldnf1sdu_le_r_mte,
+
+            gen_helper_sve_ldnf1bds_r_mte,
+            gen_helper_sve_ldnf1bss_r_mte,
+            gen_helper_sve_ldnf1bhs_r_mte,
+            gen_helper_sve_ldnf1dd_le_r_mte },
+
+          /* mte active, big-endian */
+          { gen_helper_sve_ldnf1bb_r_mte,
+            gen_helper_sve_ldnf1bhu_r_mte,
+            gen_helper_sve_ldnf1bsu_r_mte,
+            gen_helper_sve_ldnf1bdu_r_mte,
+
+            gen_helper_sve_ldnf1sds_be_r_mte,
+            gen_helper_sve_ldnf1hh_be_r_mte,
+            gen_helper_sve_ldnf1hsu_be_r_mte,
+            gen_helper_sve_ldnf1hdu_be_r_mte,
+
+            gen_helper_sve_ldnf1hds_be_r_mte,
+            gen_helper_sve_ldnf1hss_be_r_mte,
+            gen_helper_sve_ldnf1ss_be_r_mte,
+            gen_helper_sve_ldnf1sdu_be_r_mte,
+
+            gen_helper_sve_ldnf1bds_r_mte,
+            gen_helper_sve_ldnf1bss_r_mte,
+            gen_helper_sve_ldnf1bhs_r_mte,
+            gen_helper_sve_ldnf1dd_be_r_mte } },
+    };
+
+    if (!dc_isar_feature(aa64_sve, s)) {
+        return false;
+    }
+    s->is_nonstreaming = true;
+    if (sve_access_check(s)) {
+        int vsz = vec_full_reg_size(s);
+        int elements = vsz >> dtype_esz[a->dtype];
+        int off = (a->imm * elements) << dtype_msz(a->dtype);
+        TCGv_i64 addr = new_tmp_a64(s);
+
+        tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off);
+        do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false,
+                   fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]);
+    }
+    return true;
+}
+
+static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype)
+{
+    unsigned vsz = vec_full_reg_size(s);
+    TCGv_ptr t_pg;
+    int poff;
+
+    /* Load the first quadword using the normal predicated load helpers. */
+    poff = pred_full_reg_offset(s, pg);
+    if (vsz > 16) {
+        /*
+         * Zero-extend the first 16 bits of the predicate into a temporary.
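+         * (A predicate has one bit per vector byte, so 16 bits cover
+         * exactly the one quadword being loaded.)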
+ * This avoids triggering an assert making sure we don't have bits + * set within a predicate beyond VQ, but we have lowered VQ to 1 + * for this load operation. + */ + TCGv_i64 tmp = tcg_temp_new_i64(); +#if HOST_BIG_ENDIAN + poff += 6; +#endif + tcg_gen_ld16u_i64(tmp, cpu_env, poff); + + poff = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_st_i64(tmp, cpu_env, poff); + tcg_temp_free_i64(tmp); + } + + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_pg, cpu_env, poff); + + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; + fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(16, 16, zt))); + + tcg_temp_free_ptr(t_pg); + + /* Replicate that first quadword. */ + if (vsz > 16) { + int doff = vec_full_reg_offset(s, zt); + tcg_gen_gvec_dup_mem(4, doff + 16, doff, vsz - 16, vsz - 16); + } +} + +static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int msz = dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldrq(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16); + do_ldrq(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned vsz_r32; + TCGv_ptr t_pg; + int poff, doff; + + if (vsz < 32) { + /* + * Note that this UNDEFINED check comes after CheckSVEEnabled() + * in the ARM pseudocode, which is the sve_access_check() done + * in our caller. We should not now return false from the caller. + */ + unallocated_encoding(s); + return; + } + + /* Load the first octaword using the normal predicated load helpers. */ + + poff = pred_full_reg_offset(s, pg); + if (vsz > 32) { + /* + * Zero-extend the first 32 bits of the predicate into a temporary. + * This avoids triggering an assert making sure we don't have bits + * set within a predicate beyond VQ, but we have lowered VQ to 2 + * for this load operation. + */ + TCGv_i64 tmp = tcg_temp_new_i64(); +#if HOST_BIG_ENDIAN + poff += 4; +#endif + tcg_gen_ld32u_i64(tmp, cpu_env, poff); + + poff = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_st_i64(tmp, cpu_env, poff); + tcg_temp_free_i64(tmp); + } + + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_pg, cpu_env, poff); + + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; + fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(32, 32, zt))); + + tcg_temp_free_ptr(t_pg); + + /* + * Replicate that first octaword. + * The replication happens in units of 32; if the full vector size + * is not a multiple of 32, the final bits are zeroed. 
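+     * E.g. at a 48-byte vector length, bytes [0,32) hold the octaword
+     * and bytes [32,48) are zeroed rather than partially copied.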
+ */ + doff = vec_full_reg_offset(s, zt); + vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32); + if (vsz >= 64) { + tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz_r32 - 32); + } + vsz -= vsz_r32; + if (vsz) { + tcg_gen_gvec_dup_imm(MO_64, doff + vsz_r32, vsz, vsz, 0); + } +} + +static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (!dc_isar_feature(aa64_sve_f64mm, s)) { + return false; + } + if (a->rm == 31) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldro(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve_f64mm, s)) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32); + do_ldro(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +/* Load and broadcast element. */ +static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned psz = pred_full_reg_size(s); + unsigned esz = dtype_esz[a->dtype]; + unsigned msz = dtype_msz(a->dtype); + TCGLabel *over; + TCGv_i64 temp, clean_addr; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + over = gen_new_label(); + + /* If the guarding predicate has no bits set, no load occurs. */ + if (psz <= 8) { + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. + */ + uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8); + temp = tcg_temp_new_i64(); + tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg)); + tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask); + tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over); + tcg_temp_free_i64(temp); + } else { + TCGv_i32 t32 = tcg_temp_new_i32(); + find_last_active(s, t32, esz, a->pg); + tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over); + tcg_temp_free_i32(t32); + } + + /* Load the data. */ + temp = tcg_temp_new_i64(); + tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz); + clean_addr = gen_mte_check1(s, temp, false, true, msz); + + tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), + finalize_memop(s, dtype_mop[a->dtype])); + + /* Broadcast to *all* elements. */ + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, temp); + tcg_temp_free_i64(temp); + + /* Zero the inactive elements. 
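+     * This also runs when the branch above skipped the load entirely,
+     * so an all-false predicate leaves the destination zeroed.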
*/ + gen_set_label(over); + return do_movz_zpz(s, a->rd, a->rd, a->pg, esz, false); +} + +static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int msz, int esz, int nreg) +{ + static gen_helper_gvec_mem * const fn_single[2][2][4][4] = { + { { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_le_r, + gen_helper_sve_st1hs_le_r, + gen_helper_sve_st1hd_le_r }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r, + gen_helper_sve_st1sd_le_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r } }, + { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_be_r, + gen_helper_sve_st1hs_be_r, + gen_helper_sve_st1hd_be_r }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r, + gen_helper_sve_st1sd_be_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r } } }, + + { { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_le_r_mte, + gen_helper_sve_st1hs_le_r_mte, + gen_helper_sve_st1hd_le_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r_mte, + gen_helper_sve_st1sd_le_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r_mte } }, + { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_be_r_mte, + gen_helper_sve_st1hs_be_r_mte, + gen_helper_sve_st1hd_be_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r_mte, + gen_helper_sve_st1sd_be_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r_mte } } }, + }; + static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = { + { { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_le_r, + gen_helper_sve_st2ss_le_r, + gen_helper_sve_st2dd_le_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_le_r, + gen_helper_sve_st3ss_le_r, + gen_helper_sve_st3dd_le_r }, + { gen_helper_sve_st4bb_r, + gen_helper_sve_st4hh_le_r, + gen_helper_sve_st4ss_le_r, + gen_helper_sve_st4dd_le_r } }, + { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_be_r, + gen_helper_sve_st2ss_be_r, + gen_helper_sve_st2dd_be_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_be_r, + gen_helper_sve_st3ss_be_r, + gen_helper_sve_st3dd_be_r }, + { gen_helper_sve_st4bb_r, + gen_helper_sve_st4hh_be_r, + gen_helper_sve_st4ss_be_r, + gen_helper_sve_st4dd_be_r } } }, + { { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_le_r_mte, + gen_helper_sve_st2ss_le_r_mte, + gen_helper_sve_st2dd_le_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_le_r_mte, + gen_helper_sve_st3ss_le_r_mte, + gen_helper_sve_st3dd_le_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_le_r_mte, + gen_helper_sve_st4ss_le_r_mte, + gen_helper_sve_st4dd_le_r_mte } }, + { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_be_r_mte, + gen_helper_sve_st2ss_be_r_mte, + gen_helper_sve_st2dd_be_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_be_r_mte, + gen_helper_sve_st3ss_be_r_mte, + gen_helper_sve_st3dd_be_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_be_r_mte, + gen_helper_sve_st4ss_be_r_mte, + gen_helper_sve_st4dd_be_r_mte } } }, + }; + gen_helper_gvec_mem *fn; + int be = s->be_data == MO_BE; + + if (nreg == 0) { + /* ST1 */ + fn = fn_single[s->mte_active[0]][be][msz][esz]; + nreg = 1; + } else { + /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ + assert(msz == esz); + fn 
= fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; + } + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); +} + +static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (a->rm == 31 || a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> a->esz; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) << a->msz); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +/* + *** SVE gather loads / scatter stores + */ + +static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, + int scale, TCGv_i64 scalar, int msz, bool is_write, + gen_helper_gvec_mem_scatter *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zm = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_ptr t_zt = tcg_temp_new_ptr(); + int desc = 0; + + if (s->mte_active[0]) { + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << msz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } + desc = simd_desc(vsz, vsz, desc | scale); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm)); + tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt)); + fn(cpu_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_zt); + tcg_temp_free_ptr(t_zm); + tcg_temp_free_ptr(t_pg); +} + +/* Indexed by [mte][be][ff][xs][u][msz]. 
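+ *
+ * ff:  first-fault (LDFF1) versus normal gather load.
+ * xs:  0 for zero-extended, 1 for sign-extended 32-bit offsets.
+ * u:   1 for a zero-extending load, 0 for a sign-extending one.
+ * msz: log2 of the memory element size.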
*/ +static gen_helper_gvec_mem_scatter * const +gather_load_fn32[2][2][2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_le_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_le_zsu, + gen_helper_sve_ldss_le_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_le_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_le_zss, + gen_helper_sve_ldss_le_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_le_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_le_zsu, + gen_helper_sve_ldffss_le_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_le_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_le_zss, + gen_helper_sve_ldffss_le_zss, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_be_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_be_zsu, + gen_helper_sve_ldss_be_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_be_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_be_zss, + gen_helper_sve_ldss_be_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_be_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_be_zsu, + gen_helper_sve_ldffss_be_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_be_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_be_zss, + gen_helper_sve_ldffss_be_zss, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_le_zsu_mte, + gen_helper_sve_ldss_le_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + gen_helper_sve_ldhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_le_zss_mte, + gen_helper_sve_ldss_le_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_le_zsu_mte, + gen_helper_sve_ldffss_le_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_le_zss_mte, + gen_helper_sve_ldffss_le_zss_mte, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_be_zsu_mte, + gen_helper_sve_ldss_be_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + gen_helper_sve_ldhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_be_zss_mte, + gen_helper_sve_ldss_be_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_be_zsu_mte, + gen_helper_sve_ldffss_be_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_be_zss_mte, + gen_helper_sve_ldffss_be_zss_mte, } } } } }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. 
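+ * (xs=0 and xs=1 remain the zero- and sign-extended 32-bit offset
+ * forms, as in gather_load_fn32; xs=2 selects the _zd helpers that
+ * use the 64-bit vector elements as offsets directly.)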
*/ +static gen_helper_gvec_mem_scatter * const +gather_load_fn64[2][2][2][3][2][4] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_le_zsu, + gen_helper_sve_ldsds_le_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_le_zsu, + gen_helper_sve_ldsdu_le_zsu, + gen_helper_sve_lddd_le_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_le_zss, + gen_helper_sve_ldsds_le_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_le_zss, + gen_helper_sve_ldsdu_le_zss, + gen_helper_sve_lddd_le_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_le_zd, + gen_helper_sve_ldsds_le_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_le_zd, + gen_helper_sve_ldsdu_le_zd, + gen_helper_sve_lddd_le_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_le_zsu, + gen_helper_sve_ldffsds_le_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_le_zsu, + gen_helper_sve_ldffsdu_le_zsu, + gen_helper_sve_ldffdd_le_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_le_zss, + gen_helper_sve_ldffsds_le_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_le_zss, + gen_helper_sve_ldffsdu_le_zss, + gen_helper_sve_ldffdd_le_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_le_zd, + gen_helper_sve_ldffsds_le_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_le_zd, + gen_helper_sve_ldffsdu_le_zd, + gen_helper_sve_ldffdd_le_zd, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_be_zsu, + gen_helper_sve_ldsds_be_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_be_zsu, + gen_helper_sve_ldsdu_be_zsu, + gen_helper_sve_lddd_be_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_be_zss, + gen_helper_sve_ldsds_be_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_be_zss, + gen_helper_sve_ldsdu_be_zss, + gen_helper_sve_lddd_be_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_be_zd, + gen_helper_sve_ldsds_be_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_be_zd, + gen_helper_sve_ldsdu_be_zd, + gen_helper_sve_lddd_be_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_be_zsu, + gen_helper_sve_ldffsds_be_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_be_zsu, + gen_helper_sve_ldffsdu_be_zsu, + gen_helper_sve_ldffdd_be_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_be_zss, + gen_helper_sve_ldffsds_be_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_be_zss, + gen_helper_sve_ldffsdu_be_zss, + gen_helper_sve_ldffdd_be_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_be_zd, + gen_helper_sve_ldffsds_be_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_be_zd, + gen_helper_sve_ldffsdu_be_zd, + gen_helper_sve_ldffdd_be_zd, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_le_zsu_mte, + gen_helper_sve_ldsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_le_zsu_mte, + gen_helper_sve_ldsdu_le_zsu_mte, + gen_helper_sve_lddd_le_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_le_zss_mte, + gen_helper_sve_ldsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_le_zss_mte, 
+ gen_helper_sve_ldsdu_le_zss_mte, + gen_helper_sve_lddd_le_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_le_zd_mte, + gen_helper_sve_ldsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_le_zd_mte, + gen_helper_sve_ldsdu_le_zd_mte, + gen_helper_sve_lddd_le_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_le_zsu_mte, + gen_helper_sve_ldffsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_le_zsu_mte, + gen_helper_sve_ldffsdu_le_zsu_mte, + gen_helper_sve_ldffdd_le_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_le_zss_mte, + gen_helper_sve_ldffsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_le_zss_mte, + gen_helper_sve_ldffsdu_le_zss_mte, + gen_helper_sve_ldffdd_le_zss_mte, } }, + { { gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_le_zd_mte, + gen_helper_sve_ldffsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_le_zd_mte, + gen_helper_sve_ldffsdu_le_zd_mte, + gen_helper_sve_ldffdd_le_zd_mte, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_be_zsu_mte, + gen_helper_sve_ldsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_be_zsu_mte, + gen_helper_sve_ldsdu_be_zsu_mte, + gen_helper_sve_lddd_be_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_be_zss_mte, + gen_helper_sve_ldsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_be_zss_mte, + gen_helper_sve_ldsdu_be_zss_mte, + gen_helper_sve_lddd_be_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_be_zd_mte, + gen_helper_sve_ldsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_be_zd_mte, + gen_helper_sve_ldsdu_be_zd_mte, + gen_helper_sve_lddd_be_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_be_zsu_mte, + gen_helper_sve_ldffsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_be_zsu_mte, + gen_helper_sve_ldffsdu_be_zsu_mte, + gen_helper_sve_ldffdd_be_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_be_zss_mte, + gen_helper_sve_ldffsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_be_zss_mte, + gen_helper_sve_ldffsdu_be_zss_mte, + gen_helper_sve_ldffdd_be_zss_mte, } }, + { { gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_be_zd_mte, + gen_helper_sve_ldffsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_be_zd_mte, + gen_helper_sve_ldffsdu_be_zd_mte, + gen_helper_sve_ldffdd_be_zd_mte, } } } } }, +}; + +static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][a->ff][a->xs][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), a->msz, false, fn); + return true; +} + +static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) +{ + 
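+    /* Gather load, vector-plus-immediate addressing,
+     * e.g. LD1D { Z0.D }, P0/Z, [Z1.D, #8].
+     */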
gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { + return false; + } + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][a->ff][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][a->ff][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + tcg_constant_i64(a->imm << a->msz), a->msz, false, fn); + return true; +} + +static bool trans_LDNT1_zprz(DisasContext *s, arg_LD1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz + !a->u) { + return false; + } + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][0][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][0][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), a->msz, false, fn); + return true; +} + +/* Indexed by [mte][be][xs][msz]. */ +static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_le_zsu, + gen_helper_sve_stss_le_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_le_zss, + gen_helper_sve_stss_le_zss, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_be_zsu, + gen_helper_sve_stss_be_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_be_zss, + gen_helper_sve_stss_be_zss, } } }, + { /* MTE Active */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_le_zsu_mte, + gen_helper_sve_stss_le_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_le_zss_mte, + gen_helper_sve_stss_le_zss_mte, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_be_zsu_mte, + gen_helper_sve_stss_be_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_be_zss_mte, + gen_helper_sve_stss_be_zss_mte, } } }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. 
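+ * (hence three xs rows in scatter_store_fn64 below, versus two in
+ * scatter_store_fn32 above.)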
+ */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = {
+    { /* MTE Inactive */
+         { /* Little-endian */
+           { gen_helper_sve_stbd_zsu,
+             gen_helper_sve_sthd_le_zsu,
+             gen_helper_sve_stsd_le_zsu,
+             gen_helper_sve_stdd_le_zsu, },
+           { gen_helper_sve_stbd_zss,
+             gen_helper_sve_sthd_le_zss,
+             gen_helper_sve_stsd_le_zss,
+             gen_helper_sve_stdd_le_zss, },
+           { gen_helper_sve_stbd_zd,
+             gen_helper_sve_sthd_le_zd,
+             gen_helper_sve_stsd_le_zd,
+             gen_helper_sve_stdd_le_zd, } },
+         { /* Big-endian */
+           { gen_helper_sve_stbd_zsu,
+             gen_helper_sve_sthd_be_zsu,
+             gen_helper_sve_stsd_be_zsu,
+             gen_helper_sve_stdd_be_zsu, },
+           { gen_helper_sve_stbd_zss,
+             gen_helper_sve_sthd_be_zss,
+             gen_helper_sve_stsd_be_zss,
+             gen_helper_sve_stdd_be_zss, },
+           { gen_helper_sve_stbd_zd,
+             gen_helper_sve_sthd_be_zd,
+             gen_helper_sve_stsd_be_zd,
+             gen_helper_sve_stdd_be_zd, } } },
+    { /* MTE Active */
+         { /* Little-endian */
+           { gen_helper_sve_stbd_zsu_mte,
+             gen_helper_sve_sthd_le_zsu_mte,
+             gen_helper_sve_stsd_le_zsu_mte,
+             gen_helper_sve_stdd_le_zsu_mte, },
+           { gen_helper_sve_stbd_zss_mte,
+             gen_helper_sve_sthd_le_zss_mte,
+             gen_helper_sve_stsd_le_zss_mte,
+             gen_helper_sve_stdd_le_zss_mte, },
+           { gen_helper_sve_stbd_zd_mte,
+             gen_helper_sve_sthd_le_zd_mte,
+             gen_helper_sve_stsd_le_zd_mte,
+             gen_helper_sve_stdd_le_zd_mte, } },
+         { /* Big-endian */
+           { gen_helper_sve_stbd_zsu_mte,
+             gen_helper_sve_sthd_be_zsu_mte,
+             gen_helper_sve_stsd_be_zsu_mte,
+             gen_helper_sve_stdd_be_zsu_mte, },
+           { gen_helper_sve_stbd_zss_mte,
+             gen_helper_sve_sthd_be_zss_mte,
+             gen_helper_sve_stsd_be_zss_mte,
+             gen_helper_sve_stdd_be_zss_mte, },
+           { gen_helper_sve_stbd_zd_mte,
+             gen_helper_sve_sthd_be_zd_mte,
+             gen_helper_sve_stsd_be_zd_mte,
+             gen_helper_sve_stdd_be_zd_mte, } } },
+};
+
+static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a)
+{
+    gen_helper_gvec_mem_scatter *fn;
+    bool be = s->be_data == MO_BE;
+    bool mte = s->mte_active[0];
+
+    if (a->esz < a->msz || (a->msz == 0 && a->scale)) {
+        return false;
+    }
+    if (!dc_isar_feature(aa64_sve, s)) {
+        return false;
+    }
+    s->is_nonstreaming = true;
+    if (!sve_access_check(s)) {
+        return true;
+    }
+    switch (a->esz) {
+    case MO_32:
+        fn = scatter_store_fn32[mte][be][a->xs][a->msz];
+        break;
+    case MO_64:
+        fn = scatter_store_fn64[mte][be][a->xs][a->msz];
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
+               cpu_reg_sp(s, a->rn), a->msz, true, fn);
+    return true;
+}
+
+static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a)
+{
+    gen_helper_gvec_mem_scatter *fn = NULL;
+    bool be = s->be_data == MO_BE;
+    bool mte = s->mte_active[0];
+
+    if (a->esz < a->msz) {
+        return false;
+    }
+    if (!dc_isar_feature(aa64_sve, s)) {
+        return false;
+    }
+    s->is_nonstreaming = true;
+    if (!sve_access_check(s)) {
+        return true;
+    }
+
+    switch (a->esz) {
+    case MO_32:
+        fn = scatter_store_fn32[mte][be][0][a->msz];
+        break;
+    case MO_64:
+        fn = scatter_store_fn64[mte][be][2][a->msz];
+        break;
+    }
+    assert(fn != NULL);
+
+    /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x])
+     * by loading the immediate into the scalar parameter.
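+     * The helper adds the scalar base to each active element of Zn,
+     * so no separate addressing form is required.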
+ */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + tcg_constant_i64(a->imm << a->msz), a->msz, true, fn); + return true; +} + +static bool trans_STNT1_zprz(DisasContext *s, arg_ST1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz) { + return false; + } + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[mte][be][0][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[mte][be][2][a->msz]; + break; + default: + g_assert_not_reached(); + } + + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), a->msz, true, fn); + return true; +} + +/* + * Prefetches + */ + +static bool trans_PRF(DisasContext *s, arg_PRF *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +static bool trans_PRF_ns(DisasContext *s, arg_PRF_ns *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + s->is_nonstreaming = true; + (void)sve_access_check(s); + return true; +} + +/* + * Move Prefix + * + * TODO: The implementation so far could handle predicated merging movprfx. + * The helper functions as written take an extra source register to + * use in the operation, but the result is only written when predication + * succeeds. For unpredicated movprfx, we need to rearrange the helpers + * to allow the final write back to the destination to be unconditional. + * For predicated zeroing movprfx, we need to rearrange the helpers to + * allow the final write back to zero inactives. + * + * In the meantime, just emit the moves. 
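+ *
+ * This is architecturally valid: a MOVPRFX that is not fused with
+ * the following instruction still behaves as the move it encodes,
+ * so we lose only the fusion opportunity, not correctness.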
+ */ + +TRANS_FEAT(MOVPRFX, aa64_sve, do_mov_z, a->rd, a->rn) +TRANS_FEAT(MOVPRFX_m, aa64_sve, do_sel_z, a->rd, a->rn, a->rd, a->pg, a->esz) +TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false) + +/* + * SVE2 Integer Multiply - Unpredicated + */ + +TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a) + +static gen_helper_gvec_3 * const smulh_zzz_fns[4] = { + gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h, + gen_helper_gvec_smulh_s, gen_helper_gvec_smulh_d, +}; +TRANS_FEAT(SMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smulh_zzz_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const umulh_zzz_fns[4] = { + gen_helper_gvec_umulh_b, gen_helper_gvec_umulh_h, + gen_helper_gvec_umulh_s, gen_helper_gvec_umulh_d, +}; +TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umulh_zzz_fns[a->esz], a, 0) + +TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + gen_helper_gvec_pmul_b, a, 0) + +static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = { + gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h, + gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d, +}; +TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmulh_zzz_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = { + gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h, + gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d, +}; +TRANS_FEAT(SQRDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqrdmulh_zzz_fns[a->esz], a, 0) + +/* + * SVE2 Integer - Predicated + */ + +static gen_helper_gvec_4 * const sadlp_fns[4] = { + NULL, gen_helper_sve2_sadalp_zpzz_h, + gen_helper_sve2_sadalp_zpzz_s, gen_helper_sve2_sadalp_zpzz_d, +}; +TRANS_FEAT(SADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, + sadlp_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const uadlp_fns[4] = { + NULL, gen_helper_sve2_uadalp_zpzz_h, + gen_helper_sve2_uadalp_zpzz_s, gen_helper_sve2_uadalp_zpzz_d, +}; +TRANS_FEAT(UADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, + uadlp_fns[a->esz], a, 0) + +/* + * SVE2 integer unary operations (predicated) + */ + +TRANS_FEAT(URECPE, aa64_sve2, gen_gvec_ool_arg_zpz, + a->esz == 2 ? gen_helper_sve2_urecpe_s : NULL, a, 0) + +TRANS_FEAT(URSQRTE, aa64_sve2, gen_gvec_ool_arg_zpz, + a->esz == 2 ? 
gen_helper_sve2_ursqrte_s : NULL, a, 0) + +static gen_helper_gvec_3 * const sqabs_fns[4] = { + gen_helper_sve2_sqabs_b, gen_helper_sve2_sqabs_h, + gen_helper_sve2_sqabs_s, gen_helper_sve2_sqabs_d, +}; +TRANS_FEAT(SQABS, aa64_sve2, gen_gvec_ool_arg_zpz, sqabs_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sqneg_fns[4] = { + gen_helper_sve2_sqneg_b, gen_helper_sve2_sqneg_h, + gen_helper_sve2_sqneg_s, gen_helper_sve2_sqneg_d, +}; +TRANS_FEAT(SQNEG, aa64_sve2, gen_gvec_ool_arg_zpz, sqneg_fns[a->esz], a, 0) + +DO_ZPZZ(SQSHL, aa64_sve2, sve2_sqshl) +DO_ZPZZ(SQRSHL, aa64_sve2, sve2_sqrshl) +DO_ZPZZ(SRSHL, aa64_sve2, sve2_srshl) + +DO_ZPZZ(UQSHL, aa64_sve2, sve2_uqshl) +DO_ZPZZ(UQRSHL, aa64_sve2, sve2_uqrshl) +DO_ZPZZ(URSHL, aa64_sve2, sve2_urshl) + +DO_ZPZZ(SHADD, aa64_sve2, sve2_shadd) +DO_ZPZZ(SRHADD, aa64_sve2, sve2_srhadd) +DO_ZPZZ(SHSUB, aa64_sve2, sve2_shsub) + +DO_ZPZZ(UHADD, aa64_sve2, sve2_uhadd) +DO_ZPZZ(URHADD, aa64_sve2, sve2_urhadd) +DO_ZPZZ(UHSUB, aa64_sve2, sve2_uhsub) + +DO_ZPZZ(ADDP, aa64_sve2, sve2_addp) +DO_ZPZZ(SMAXP, aa64_sve2, sve2_smaxp) +DO_ZPZZ(UMAXP, aa64_sve2, sve2_umaxp) +DO_ZPZZ(SMINP, aa64_sve2, sve2_sminp) +DO_ZPZZ(UMINP, aa64_sve2, sve2_uminp) + +DO_ZPZZ(SQADD_zpzz, aa64_sve2, sve2_sqadd) +DO_ZPZZ(UQADD_zpzz, aa64_sve2, sve2_uqadd) +DO_ZPZZ(SQSUB_zpzz, aa64_sve2, sve2_sqsub) +DO_ZPZZ(UQSUB_zpzz, aa64_sve2, sve2_uqsub) +DO_ZPZZ(SUQADD, aa64_sve2, sve2_suqadd) +DO_ZPZZ(USQADD, aa64_sve2, sve2_usqadd) + +/* + * SVE2 Widening Integer Arithmetic + */ + +static gen_helper_gvec_3 * const saddl_fns[4] = { + NULL, gen_helper_sve2_saddl_h, + gen_helper_sve2_saddl_s, gen_helper_sve2_saddl_d, +}; +TRANS_FEAT(SADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 0) +TRANS_FEAT(SADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 3) +TRANS_FEAT(SADDLBT, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 2) + +static gen_helper_gvec_3 * const ssubl_fns[4] = { + NULL, gen_helper_sve2_ssubl_h, + gen_helper_sve2_ssubl_s, gen_helper_sve2_ssubl_d, +}; +TRANS_FEAT(SSUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 0) +TRANS_FEAT(SSUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 3) +TRANS_FEAT(SSUBLBT, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 2) +TRANS_FEAT(SSUBLTB, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const sabdl_fns[4] = { + NULL, gen_helper_sve2_sabdl_h, + gen_helper_sve2_sabdl_s, gen_helper_sve2_sabdl_d, +}; +TRANS_FEAT(SABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + sabdl_fns[a->esz], a, 0) +TRANS_FEAT(SABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + sabdl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const uaddl_fns[4] = { + NULL, gen_helper_sve2_uaddl_h, + gen_helper_sve2_uaddl_s, gen_helper_sve2_uaddl_d, +}; +TRANS_FEAT(UADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + uaddl_fns[a->esz], a, 0) +TRANS_FEAT(UADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + uaddl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const usubl_fns[4] = { + NULL, gen_helper_sve2_usubl_h, + gen_helper_sve2_usubl_s, gen_helper_sve2_usubl_d, +}; +TRANS_FEAT(USUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, + usubl_fns[a->esz], a, 0) +TRANS_FEAT(USUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, + usubl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const uabdl_fns[4] = { + NULL, gen_helper_sve2_uabdl_h, + gen_helper_sve2_uabdl_s, gen_helper_sve2_uabdl_d, +}; +TRANS_FEAT(UABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + uabdl_fns[a->esz], a, 0) +TRANS_FEAT(UABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + uabdl_fns[a->esz], a, 
3) + +static gen_helper_gvec_3 * const sqdmull_fns[4] = { + NULL, gen_helper_sve2_sqdmull_zzz_h, + gen_helper_sve2_sqdmull_zzz_s, gen_helper_sve2_sqdmull_zzz_d, +}; +TRANS_FEAT(SQDMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmull_fns[a->esz], a, 0) +TRANS_FEAT(SQDMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const smull_fns[4] = { + NULL, gen_helper_sve2_smull_zzz_h, + gen_helper_sve2_smull_zzz_s, gen_helper_sve2_smull_zzz_d, +}; +TRANS_FEAT(SMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smull_fns[a->esz], a, 0) +TRANS_FEAT(SMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const umull_fns[4] = { + NULL, gen_helper_sve2_umull_zzz_h, + gen_helper_sve2_umull_zzz_s, gen_helper_sve2_umull_zzz_d, +}; +TRANS_FEAT(UMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umull_fns[a->esz], a, 0) +TRANS_FEAT(UMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const eoril_fns[4] = { + gen_helper_sve2_eoril_b, gen_helper_sve2_eoril_h, + gen_helper_sve2_eoril_s, gen_helper_sve2_eoril_d, +}; +TRANS_FEAT(EORBT, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 2) +TRANS_FEAT(EORTB, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 1) + +static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h, + NULL, gen_helper_sve2_pmull_d, + }; + + if (a->esz == 0) { + if (!dc_isar_feature(aa64_sve2_pmull128, s)) { + return false; + } + s->is_nonstreaming = true; + } else if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return gen_gvec_ool_arg_zzz(s, fns[a->esz], a, sel); +} + +TRANS_FEAT(PMULLB, aa64_sve2, do_trans_pmull, a, false) +TRANS_FEAT(PMULLT, aa64_sve2, do_trans_pmull, a, true) + +static gen_helper_gvec_3 * const saddw_fns[4] = { + NULL, gen_helper_sve2_saddw_h, + gen_helper_sve2_saddw_s, gen_helper_sve2_saddw_d, +}; +TRANS_FEAT(SADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 0) +TRANS_FEAT(SADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const ssubw_fns[4] = { + NULL, gen_helper_sve2_ssubw_h, + gen_helper_sve2_ssubw_s, gen_helper_sve2_ssubw_d, +}; +TRANS_FEAT(SSUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 0) +TRANS_FEAT(SSUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const uaddw_fns[4] = { + NULL, gen_helper_sve2_uaddw_h, + gen_helper_sve2_uaddw_s, gen_helper_sve2_uaddw_d, +}; +TRANS_FEAT(UADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 0) +TRANS_FEAT(UADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const usubw_fns[4] = { + NULL, gen_helper_sve2_usubw_h, + gen_helper_sve2_usubw_s, gen_helper_sve2_usubw_d, +}; +TRANS_FEAT(USUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 0) +TRANS_FEAT(USUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 1) + +static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) +{ + int top = imm & 1; + int shl = imm >> 1; + int halfbits = 4 << vece; + + if (top) { + if (shl == halfbits) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_sari_vec(vece, d, n, halfbits); + tcg_gen_shli_vec(vece, d, d, shl); + } + } else { + 
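+        /* Bottom half: shift the low half to the top, then sign-extend
+         * it back down with an arithmetic shift by halfbits - shl. */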
tcg_gen_shli_vec(vece, d, n, halfbits); + tcg_gen_sari_vec(vece, d, d, halfbits - shl); + } +} + +static void gen_ushll_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int imm) +{ + int halfbits = 4 << vece; + int top = imm & 1; + int shl = (imm >> 1); + int shift; + uint64_t mask; + + mask = MAKE_64BIT_MASK(0, halfbits); + mask <<= shl; + mask = dup_const(vece, mask); + + shift = shl - top * halfbits; + if (shift < 0) { + tcg_gen_shri_i64(d, n, -shift); + } else { + tcg_gen_shli_i64(d, n, shift); + } + tcg_gen_andi_i64(d, d, mask); +} + +static void gen_ushll16_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_16, d, n, imm); +} + +static void gen_ushll32_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_32, d, n, imm); +} + +static void gen_ushll64_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_64, d, n, imm); +} + +static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) +{ + int halfbits = 4 << vece; + int top = imm & 1; + int shl = imm >> 1; + + if (top) { + if (shl == halfbits) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_shri_vec(vece, d, n, halfbits); + tcg_gen_shli_vec(vece, d, d, shl); + } + } else { + if (shl == 0) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_shli_vec(vece, d, n, halfbits); + tcg_gen_shri_vec(vece, d, d, halfbits - shl); + } + } +} + +static bool do_shll_tb(DisasContext *s, arg_rri_esz *a, + const GVecGen2i ops[3], bool sel) +{ + + if (a->esz < 0 || a->esz > 2) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, (a->imm << 1) | sel, + &ops[a->esz]); + } + return true; +} + +static const TCGOpcode sshll_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, 0 +}; +static const GVecGen2i sshll_ops[3] = { + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_h, + .vece = MO_16 }, + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_s, + .vece = MO_32 }, + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_d, + .vece = MO_64 } +}; +TRANS_FEAT(SSHLLB, aa64_sve2, do_shll_tb, a, sshll_ops, false) +TRANS_FEAT(SSHLLT, aa64_sve2, do_shll_tb, a, sshll_ops, true) + +static const TCGOpcode ushll_list[] = { + INDEX_op_shli_vec, INDEX_op_shri_vec, 0 +}; +static const GVecGen2i ushll_ops[3] = { + { .fni8 = gen_ushll16_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_h, + .vece = MO_16 }, + { .fni8 = gen_ushll32_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_s, + .vece = MO_32 }, + { .fni8 = gen_ushll64_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_d, + .vece = MO_64 }, +}; +TRANS_FEAT(USHLLB, aa64_sve2, do_shll_tb, a, ushll_ops, false) +TRANS_FEAT(USHLLT, aa64_sve2, do_shll_tb, a, ushll_ops, true) + +static gen_helper_gvec_3 * const bext_fns[4] = { + gen_helper_sve2_bext_b, gen_helper_sve2_bext_h, + gen_helper_sve2_bext_s, gen_helper_sve2_bext_d, +}; +TRANS_FEAT_NONSTREAMING(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bext_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const bdep_fns[4] = { + 
gen_helper_sve2_bdep_b, gen_helper_sve2_bdep_h, + gen_helper_sve2_bdep_s, gen_helper_sve2_bdep_d, +}; +TRANS_FEAT_NONSTREAMING(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bdep_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const bgrp_fns[4] = { + gen_helper_sve2_bgrp_b, gen_helper_sve2_bgrp_h, + gen_helper_sve2_bgrp_s, gen_helper_sve2_bgrp_d, +}; +TRANS_FEAT_NONSTREAMING(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bgrp_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const cadd_fns[4] = { + gen_helper_sve2_cadd_b, gen_helper_sve2_cadd_h, + gen_helper_sve2_cadd_s, gen_helper_sve2_cadd_d, +}; +TRANS_FEAT(CADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, + cadd_fns[a->esz], a, 0) +TRANS_FEAT(CADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, + cadd_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const sqcadd_fns[4] = { + gen_helper_sve2_sqcadd_b, gen_helper_sve2_sqcadd_h, + gen_helper_sve2_sqcadd_s, gen_helper_sve2_sqcadd_d, +}; +TRANS_FEAT(SQCADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, + sqcadd_fns[a->esz], a, 0) +TRANS_FEAT(SQCADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, + sqcadd_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const sabal_fns[4] = { + NULL, gen_helper_sve2_sabal_h, + gen_helper_sve2_sabal_s, gen_helper_sve2_sabal_d, +}; +TRANS_FEAT(SABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 0) +TRANS_FEAT(SABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const uabal_fns[4] = { + NULL, gen_helper_sve2_uabal_h, + gen_helper_sve2_uabal_s, gen_helper_sve2_uabal_d, +}; +TRANS_FEAT(UABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 0) +TRANS_FEAT(UABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 1) + +static bool do_adcl(DisasContext *s, arg_rrrr_esz *a, bool sel) +{ + static gen_helper_gvec_4 * const fns[2] = { + gen_helper_sve2_adcl_s, + gen_helper_sve2_adcl_d, + }; + /* + * Note that in this case the ESZ field encodes both size and sign. + * Split out 'subtract' into bit 1 of the data field for the helper. 
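+     * (fns[a->esz & 1] selects the 32- vs 64-bit helper, and
+     * a->esz & 2 supplies that 'subtract' bit.)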
+ */ + return gen_gvec_ool_arg_zzzz(s, fns[a->esz & 1], a, (a->esz & 2) | sel); +} + +TRANS_FEAT(ADCLB, aa64_sve2, do_adcl, a, false) +TRANS_FEAT(ADCLT, aa64_sve2, do_adcl, a, true) + +TRANS_FEAT(SSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ssra, a) +TRANS_FEAT(USRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_usra, a) +TRANS_FEAT(SRSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_srsra, a) +TRANS_FEAT(URSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ursra, a) +TRANS_FEAT(SRI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sri, a) +TRANS_FEAT(SLI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sli, a) + +TRANS_FEAT(SABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_saba, a) +TRANS_FEAT(UABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_uaba, a) + +static bool do_narrow_extract(DisasContext *s, arg_rri_esz *a, + const GVecGen2 ops[3]) +{ + if (a->esz < 0 || a->esz > MO_32 || a->imm != 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, &ops[a->esz]); + } + return true; +} + +static const TCGOpcode sqxtn_list[] = { + INDEX_op_shli_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 +}; + +static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t mask = (1ull << halfbits) - 1; + int64_t min = -1ull << (halfbits - 1); + int64_t max = -min - 1; + + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, d, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, d, d, t); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_and_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtnb_ops[3] = { + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_h, + .vece = MO_16 }, + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_s, + .vece = MO_32 }, + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops) + +static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t mask = (1ull << halfbits) - 1; + int64_t min = -1ull << (halfbits - 1); + int64_t max = -min - 1; + + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtnt_ops[3] = { + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_h, + .vece = MO_16 }, + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_s, + .vece = MO_32 }, + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTNT, aa64_sve2, do_narrow_extract, a, sqxtnt_ops) + +static const TCGOpcode uqxtn_list[] = { + INDEX_op_shli_vec, INDEX_op_umin_vec, 0 +}; + +static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 
uqxtnb_ops[3] = { + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_h, + .vece = MO_16 }, + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_s, + .vece = MO_32 }, + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops) + +static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 uqxtnt_ops[3] = { + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_h, + .vece = MO_16 }, + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_s, + .vece = MO_32 }, + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQXTNT, aa64_sve2, do_narrow_extract, a, uqxtnt_ops) + +static const TCGOpcode sqxtun_list[] = { + INDEX_op_shli_vec, INDEX_op_umin_vec, INDEX_op_smax_vec, 0 +}; + +static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, d, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtunb_ops[3] = { + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_h, + .vece = MO_16 }, + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_s, + .vece = MO_32 }, + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops) + +static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtunt_ops[3] = { + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_h, + .vece = MO_16 }, + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_s, + .vece = MO_32 }, + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTUNT, aa64_sve2, do_narrow_extract, a, sqxtunt_ops) + +static bool do_shr_narrow(DisasContext *s, arg_rri_esz *a, + const GVecGen2i ops[3]) +{ + if (a->esz < 0 || a->esz > MO_32) { + return false; + } + assert(a->imm > 0 && a->imm <= (8 << a->esz)); + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, a->imm, &ops[a->esz]); + } + return true; +} + +static void 
gen_shrnb_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) +{ + int halfbits = 4 << vece; + uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); + + tcg_gen_shri_i64(d, n, shr); + tcg_gen_andi_i64(d, d, mask); +} + +static void gen_shrnb16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_16, d, n, shr); +} + +static void gen_shrnb32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_32, d, n, shr); +} + +static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_64, d, n, shr); +} + +static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + uint64_t mask = MAKE_64BIT_MASK(0, halfbits); + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 }; +static const GVecGen2i shrnb_ops[3] = { + { .fni8 = gen_shrnb16_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_h, + .vece = MO_16 }, + { .fni8 = gen_shrnb32_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_s, + .vece = MO_32 }, + { .fni8 = gen_shrnb64_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SHRNB, aa64_sve2, do_shr_narrow, a, shrnb_ops) + +static void gen_shrnt_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) +{ + int halfbits = 4 << vece; + uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); + + tcg_gen_shli_i64(n, n, halfbits - shr); + tcg_gen_andi_i64(n, n, ~mask); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_or_i64(d, d, n); +} + +static void gen_shrnt16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnt_i64(MO_16, d, n, shr); +} + +static void gen_shrnt32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnt_i64(MO_32, d, n, shr); +} + +static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + tcg_gen_shri_i64(n, n, shr); + tcg_gen_deposit_i64(d, d, n, 32, 32); +} + +static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + uint64_t mask = MAKE_64BIT_MASK(0, halfbits); + + tcg_gen_shli_vec(vece, n, n, halfbits - shr); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 }; +static const GVecGen2i shrnt_ops[3] = { + { .fni8 = gen_shrnt16_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_h, + .vece = MO_16 }, + { .fni8 = gen_shrnt32_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_s, + .vece = MO_32 }, + { .fni8 = gen_shrnt64_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SHRNT, aa64_sve2, do_shr_narrow, a, shrnt_ops) + +static const GVecGen2i rshrnb_ops[3] = { + { .fno = gen_helper_sve2_rshrnb_h }, + { .fno = gen_helper_sve2_rshrnb_s }, + { .fno = gen_helper_sve2_rshrnb_d }, +}; +TRANS_FEAT(RSHRNB, aa64_sve2, do_shr_narrow, a, rshrnb_ops) + +static const GVecGen2i rshrnt_ops[3] = { + { .fno = gen_helper_sve2_rshrnt_h }, + { .fno = gen_helper_sve2_rshrnt_s }, + { .fno = gen_helper_sve2_rshrnt_d }, +}; 
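+ +/* + * Example for the narrowing shifts above: with .vece = MO_16 (halfbits = 8) + * and shr = 4, a 16-bit source element 0xabcd narrows to + * (0xabcd >> 4) & 0xff = 0xbc. The B forms write this to the even byte and + * zero the odd byte; the T forms write the odd byte and preserve the even + * byte (hence .load_dest). The rounding forms add 1 << (shr - 1) before + * shifting and are implemented only via the out-of-line helpers. + */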
+TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops) + +static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrunb_vec_list[] = { + INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i sqshrunb_ops[3] = { + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_h, + .vece = MO_16 }, + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_s, + .vece = MO_32 }, + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops) + +static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrunt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, + INDEX_op_smax_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i sqshrunt_ops[3] = { + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_h, + .vece = MO_16 }, + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_s, + .vece = MO_32 }, + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRUNT, aa64_sve2, do_shr_narrow, a, sqshrunt_ops) + +static const GVecGen2i sqrshrunb_ops[3] = { + { .fno = gen_helper_sve2_sqrshrunb_h }, + { .fno = gen_helper_sve2_sqrshrunb_s }, + { .fno = gen_helper_sve2_sqrshrunb_d }, +}; +TRANS_FEAT(SQRSHRUNB, aa64_sve2, do_shr_narrow, a, sqrshrunb_ops) + +static const GVecGen2i sqrshrunt_ops[3] = { + { .fno = gen_helper_sve2_sqrshrunt_h }, + { .fno = gen_helper_sve2_sqrshrunt_s }, + { .fno = gen_helper_sve2_sqrshrunt_d }, +}; +TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops) + +static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); + int64_t min = -max - 1; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrnb_vec_list[] = { + INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_smin_vec, 0 +}; +static const GVecGen2i sqshrnb_ops[3] = { + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = gen_helper_sve2_sqshrnb_h, + .vece = MO_16 }, + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = 
gen_helper_sve2_sqshrnb_s, + .vece = MO_32 }, + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = gen_helper_sve2_sqshrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops) + +static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); + int64_t min = -max - 1; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrnt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, + INDEX_op_smax_vec, INDEX_op_smin_vec, 0 +}; +static const GVecGen2i sqshrnt_ops[3] = { + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_h, + .vece = MO_16 }, + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_s, + .vece = MO_32 }, + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRNT, aa64_sve2, do_shr_narrow, a, sqshrnt_ops) + +static const GVecGen2i sqrshrnb_ops[3] = { + { .fno = gen_helper_sve2_sqrshrnb_h }, + { .fno = gen_helper_sve2_sqrshrnb_s }, + { .fno = gen_helper_sve2_sqrshrnb_d }, +}; +TRANS_FEAT(SQRSHRNB, aa64_sve2, do_shr_narrow, a, sqrshrnb_ops) + +static const GVecGen2i sqrshrnt_ops[3] = { + { .fno = gen_helper_sve2_sqrshrnt_h }, + { .fno = gen_helper_sve2_sqrshrnt_s }, + { .fno = gen_helper_sve2_sqrshrnt_d }, +}; +TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops) + +static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode uqshrnb_vec_list[] = { + INDEX_op_shri_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i uqshrnb_ops[3] = { + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_h, + .vece = MO_16 }, + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_s, + .vece = MO_32 }, + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops) + +static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode uqshrnt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_shri_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i uqshrnt_ops[3] = { + { .fniv = gen_uqshrnt_vec, + .opt_opc = uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_h, + .vece = MO_16 }, + { .fniv = gen_uqshrnt_vec, + .opt_opc = 
uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_s, + .vece = MO_32 }, + { .fniv = gen_uqshrnt_vec, + .opt_opc = uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQSHRNT, aa64_sve2, do_shr_narrow, a, uqshrnt_ops) + +static const GVecGen2i uqrshrnb_ops[3] = { + { .fno = gen_helper_sve2_uqrshrnb_h }, + { .fno = gen_helper_sve2_uqrshrnb_s }, + { .fno = gen_helper_sve2_uqrshrnb_d }, +}; +TRANS_FEAT(UQRSHRNB, aa64_sve2, do_shr_narrow, a, uqrshrnb_ops) + +static const GVecGen2i uqrshrnt_ops[3] = { + { .fno = gen_helper_sve2_uqrshrnt_h }, + { .fno = gen_helper_sve2_uqrshrnt_s }, + { .fno = gen_helper_sve2_uqrshrnt_d }, +}; +TRANS_FEAT(UQRSHRNT, aa64_sve2, do_shr_narrow, a, uqrshrnt_ops) + +#define DO_SVE2_ZZZ_NARROW(NAME, name) \ + static gen_helper_gvec_3 * const name##_fns[4] = { \ + NULL, gen_helper_sve2_##name##_h, \ + gen_helper_sve2_##name##_s, gen_helper_sve2_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzz, \ + name##_fns[a->esz], a, 0) + +DO_SVE2_ZZZ_NARROW(ADDHNB, addhnb) +DO_SVE2_ZZZ_NARROW(ADDHNT, addhnt) +DO_SVE2_ZZZ_NARROW(RADDHNB, raddhnb) +DO_SVE2_ZZZ_NARROW(RADDHNT, raddhnt) + +DO_SVE2_ZZZ_NARROW(SUBHNB, subhnb) +DO_SVE2_ZZZ_NARROW(SUBHNT, subhnt) +DO_SVE2_ZZZ_NARROW(RSUBHNB, rsubhnb) +DO_SVE2_ZZZ_NARROW(RSUBHNT, rsubhnt) + +static gen_helper_gvec_flags_4 * const match_fns[4] = { + gen_helper_sve2_match_ppzz_b, gen_helper_sve2_match_ppzz_h, NULL, NULL +}; +TRANS_FEAT_NONSTREAMING(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz]) + +static gen_helper_gvec_flags_4 * const nmatch_fns[4] = { + gen_helper_sve2_nmatch_ppzz_b, gen_helper_sve2_nmatch_ppzz_h, NULL, NULL +}; +TRANS_FEAT_NONSTREAMING(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz]) + +static gen_helper_gvec_4 * const histcnt_fns[4] = { + NULL, NULL, gen_helper_sve2_histcnt_s, gen_helper_sve2_histcnt_d +}; +TRANS_FEAT_NONSTREAMING(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz, + histcnt_fns[a->esz], a, 0) + +TRANS_FEAT_NONSTREAMING(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz, + a->esz == 0 ? 
gen_helper_sve2_histseg : NULL, a, 0) + +DO_ZPZZ_FP(FADDP, aa64_sve2, sve2_faddp_zpzz) +DO_ZPZZ_FP(FMAXNMP, aa64_sve2, sve2_fmaxnmp_zpzz) +DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz) +DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz) +DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz) + +/* + * SVE Integer Multiply-Add (unpredicated) + */ + +TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) +TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) + +static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = { + NULL, gen_helper_sve2_sqdmlal_zzzw_h, + gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d, +}; +TRANS_FEAT(SQDMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SQDMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 3) +TRANS_FEAT(SQDMLALBT, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 2) + +static gen_helper_gvec_4 * const sqdmlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_sqdmlsl_zzzw_h, + gen_helper_sve2_sqdmlsl_zzzw_s, gen_helper_sve2_sqdmlsl_zzzw_d, +}; +TRANS_FEAT(SQDMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SQDMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 3) +TRANS_FEAT(SQDMLSLBT, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 2) + +static gen_helper_gvec_4 * const sqrdmlah_fns[] = { + gen_helper_sve2_sqrdmlah_b, gen_helper_sve2_sqrdmlah_h, + gen_helper_sve2_sqrdmlah_s, gen_helper_sve2_sqrdmlah_d, +}; +TRANS_FEAT(SQRDMLAH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqrdmlah_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const sqrdmlsh_fns[] = { + gen_helper_sve2_sqrdmlsh_b, gen_helper_sve2_sqrdmlsh_h, + gen_helper_sve2_sqrdmlsh_s, gen_helper_sve2_sqrdmlsh_d, +}; +TRANS_FEAT(SQRDMLSH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqrdmlsh_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const smlal_zzzw_fns[] = { + NULL, gen_helper_sve2_smlal_zzzw_h, + gen_helper_sve2_smlal_zzzw_s, gen_helper_sve2_smlal_zzzw_d, +}; +TRANS_FEAT(SMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlal_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const umlal_zzzw_fns[] = { + NULL, gen_helper_sve2_umlal_zzzw_h, + gen_helper_sve2_umlal_zzzw_s, gen_helper_sve2_umlal_zzzw_d, +}; +TRANS_FEAT(UMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(UMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlal_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const smlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_smlsl_zzzw_h, + gen_helper_sve2_smlsl_zzzw_s, gen_helper_sve2_smlsl_zzzw_d, +}; +TRANS_FEAT(SMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlsl_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const umlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_umlsl_zzzw_h, + gen_helper_sve2_umlsl_zzzw_s, gen_helper_sve2_umlsl_zzzw_d, +}; +TRANS_FEAT(UMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(UMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlsl_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const cmla_fns[] = { + gen_helper_sve2_cmla_zzzz_b, gen_helper_sve2_cmla_zzzz_h, + 
gen_helper_sve2_cmla_zzzz_s, gen_helper_sve2_cmla_zzzz_d, +}; +TRANS_FEAT(CMLA_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + cmla_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +static gen_helper_gvec_4 * const cdot_fns[] = { + NULL, NULL, gen_helper_sve2_cdot_zzzz_s, gen_helper_sve2_cdot_zzzz_d +}; +TRANS_FEAT(CDOT_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + cdot_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +static gen_helper_gvec_4 * const sqrdcmlah_fns[] = { + gen_helper_sve2_sqrdcmlah_zzzz_b, gen_helper_sve2_sqrdcmlah_zzzz_h, + gen_helper_sve2_sqrdcmlah_zzzz_s, gen_helper_sve2_sqrdcmlah_zzzz_d, +}; +TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0) + +TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, + gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt) + +TRANS_FEAT_NONSTREAMING(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, false) +TRANS_FEAT_NONSTREAMING(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, true) + +TRANS_FEAT_NONSTREAMING(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4e, a, 0) +TRANS_FEAT_NONSTREAMING(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4ekey, a, 0) + +TRANS_FEAT_NONSTREAMING(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz, + gen_gvec_rax1, a) + +TRANS_FEAT(FCVTNT_sh, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtnt_sh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTNT_ds, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtnt_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(BFCVTNT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, + gen_helper_sve_bfcvtnt, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTLT_hs, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtlt_hs, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTLT_sd, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtlt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTX_ds, aa64_sve2, do_frint_mode, a, + float_round_to_odd, gen_helper_sve_fcvt_ds) +TRANS_FEAT(FCVTXNT_ds, aa64_sve2, do_frint_mode, a, + float_round_to_odd, gen_helper_sve2_fcvtnt_ds) + +static gen_helper_gvec_3_ptr * const flogb_fns[] = { + NULL, gen_helper_flogb_h, + gen_helper_flogb_s, gen_helper_flogb_d +}; +TRANS_FEAT(FLOGB, aa64_sve2, gen_gvec_fpst_arg_zpz, flogb_fns[a->esz], + a, 0, a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) + +static bool do_FMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sub, bool sel) +{ + return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzzw_s, + a->rd, a->rn, a->rm, a->ra, + (sel << 1) | sub, cpu_env); +} + +TRANS_FEAT(FMLALB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, false) +TRANS_FEAT(FMLALT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, true) +TRANS_FEAT(FMLSLB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, false) +TRANS_FEAT(FMLSLT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, true) + +static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel) +{ + return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s, + a->rd, a->rn, a->rm, a->ra, + (a->index << 2) | (sel << 1) | sub, cpu_env); +} + +TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false) +TRANS_FEAT(FMLALT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, true) +TRANS_FEAT(FMLSLB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, false) +TRANS_FEAT(FMLSLT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, true) + +TRANS_FEAT_NONSTREAMING(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_smmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_usmmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_ummla_b, a, 0) + +TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_bfdot, a, 0) +TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_bfdot_idx, a) + +TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_bfmmla, a, 0) + +static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) +{ + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal, + a->rd, a->rn, a->rm, a->ra, sel, FPST_FPCR); +} + +TRANS_FEAT(BFMLALB_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, false) +TRANS_FEAT(BFMLALT_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, true) + +static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) +{ + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal_idx, + a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sel, FPST_FPCR); +} + +TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) +TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true) + +static bool trans_PSEL(DisasContext *s, arg_psel *a) +{ + int vl = vec_full_reg_size(s); + int pl = pred_gvec_reg_size(s); + int elements = vl >> a->esz; + TCGv_i64 tmp, didx, dbit; + TCGv_ptr ptr; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + dbit = tcg_temp_new_i64(); + didx = tcg_temp_new_i64(); + ptr = tcg_temp_new_ptr(); + + /* Compute the predicate element. */ + tcg_gen_addi_i64(tmp, cpu_reg(s, a->rv), a->imm); + if (is_power_of_2(elements)) { + tcg_gen_andi_i64(tmp, tmp, elements - 1); + } else { + tcg_gen_remu_i64(tmp, tmp, tcg_constant_i64(elements)); + } + + /* Extract the predicate byte and bit indices. */ + tcg_gen_shli_i64(tmp, tmp, a->esz); + tcg_gen_andi_i64(dbit, tmp, 7); + tcg_gen_shri_i64(didx, tmp, 3); + if (HOST_BIG_ENDIAN) { + tcg_gen_xori_i64(didx, didx, 7); + } + + /* Load the predicate word. */ + tcg_gen_trunc_i64_ptr(ptr, didx); + tcg_gen_add_ptr(ptr, ptr, cpu_env); + tcg_gen_ld8u_i64(tmp, ptr, pred_full_reg_offset(s, a->pm)); + + /* Extract the predicate bit and replicate to MO_64. */ + tcg_gen_shr_i64(tmp, tmp, dbit); + tcg_gen_andi_i64(tmp, tmp, 1); + tcg_gen_neg_i64(tmp, tmp); + + /* Apply to either copy the source, or write zeros. 
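+ * (tmp is 0 or all-ones here, so the AND below either zeroes Pd or + * copies Pn into Pd.)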
*/ + tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd), + pred_full_reg_offset(s, a->pn), tmp, pl, pl); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(dbit); + tcg_temp_free_i64(didx); + tcg_temp_free_ptr(ptr); + return true; +} + +static void gen_sclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_smax_i32(d, a, n); + tcg_gen_smin_i32(d, d, m); +} + +static void gen_sclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_smax_i64(d, a, n); + tcg_gen_smin_i64(d, d, m); +} + +static void gen_sclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_smax_vec(vece, d, a, n); + tcg_gen_smin_vec(vece, d, d, m); +} + +static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_sclamp_i32, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_sclamp_i64, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a) + +static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_umax_i32(d, a, n); + tcg_gen_umin_i32(d, d, m); +} + +static void gen_uclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_umax_i64(d, a, n); + tcg_gen_umin_i64(d, d, m); +} + +static void gen_uclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_umax_vec(vece, d, a, n); + tcg_gen_umin_vec(vece, d, d, m); +} + +static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_uclamp_i32, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_uclamp_i64, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a) diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c new file mode 100644 index 0000000..5c5d58d --- /dev/null +++ b/target/arm/tcg/translate-vfp.c @@ -0,0 +1,3619 @@ +/* + * ARM translation: AArch32 VFP instructions + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2005-2007 CodeSourcery + * Copyright (c) 2007 OpenedHand, Ltd. + * Copyright (c) 2019 Linaro, Ltd. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +/* Include the generated VFP decoder */ +#include "decode-vfp.c.inc" +#include "decode-vfp-uncond.c.inc" + +static inline void vfp_load_reg64(TCGv_i64 var, int reg) +{ + tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg)); +} + +static inline void vfp_store_reg64(TCGv_i64 var, int reg) +{ + tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg)); +} + +static inline void vfp_load_reg32(TCGv_i32 var, int reg) +{ + tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg)); +} + +static inline void vfp_store_reg32(TCGv_i32 var, int reg) +{ + tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg)); +} + +/* + * The imm8 encodes the sign bit, enough bits to represent an exponent in + * the range 01....1xx to 10....0xx, and the most significant 4 bits of + * the mantissa; see VFPExpandImm() in the v8 ARM ARM. + */ +uint64_t vfp_expand_imm(int size, uint8_t imm8) +{ + uint64_t imm; + + switch (size) { + case MO_64: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3fc0 : 0x4000) | + extract32(imm8, 0, 6); + imm <<= 48; + break; + case MO_32: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3e00 : 0x4000) | + (extract32(imm8, 0, 6) << 3); + imm <<= 16; + break; + case MO_16: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3000 : 0x4000) | + (extract32(imm8, 0, 6) << 6); + break; + default: + g_assert_not_reached(); + } + return imm; +} + +/* + * Return the offset of a 16-bit half of the specified VFP single-precision + * register. If top is true, returns the top 16 bits; otherwise the bottom + * 16 bits. + */ +static inline long vfp_f16_offset(unsigned reg, bool top) +{ + long offs = vfp_reg_offset(false, reg); +#if HOST_BIG_ENDIAN + if (!top) { + offs += 2; + } +#else + if (top) { + offs += 2; + } +#endif + return offs; +} + +/* + * Generate code for M-profile lazy FP state preservation if needed; + * this corresponds to the pseudocode PreserveFPState() function. + */ +static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update) +{ + if (s->v7m_lspact) { + /* + * Lazy state saving affects external memory and also the NVIC, + * so we must mark it as an IO operation for icount (and cause + * this to be the last insn in the TB). + */ + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + s->base.is_jmp = DISAS_UPDATE_EXIT; + gen_io_start(); + } + gen_helper_v7m_preserve_fp_state(cpu_env); + /* + * If the preserve_fp_state helper doesn't throw an exception + * then it will clear LSPACT; we don't need to repeat this for + * any further FP insns in this TB. 
+ */ + s->v7m_lspact = false; + /* + * The helper might have zeroed VPR, so we do not know the + * correct value for the MVE_NO_PRED TB flag any more. + * If we're about to create a new fp context then that + * will precisely determine the MVE_NO_PRED value (see + * gen_update_fp_context()). Otherwise, we must: + * - set s->mve_no_pred to false, so this instruction + * is generated to use helper functions + * - end the TB now, without chaining to the next TB + */ + if (skip_context_update || !s->v7m_new_fp_ctxt_needed) { + s->mve_no_pred = false; + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } + } +} + +/* + * Generate code for M-profile FP context handling: update the + * ownership of the FP context, and create a new context if + * necessary. This corresponds to the parts of the pseudocode + * ExecuteFPCheck() after the initial PreserveFPState() call. + */ +static void gen_update_fp_context(DisasContext *s) +{ + /* Update ownership of FP context: set FPCCR.S to match current state */ + if (s->v8m_fpccr_s_wrong) { + TCGv_i32 tmp; + + tmp = load_cpu_field(v7m.fpccr[M_REG_S]); + if (s->v8m_secure) { + tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); + } else { + tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); + } + store_cpu_field(tmp, v7m.fpccr[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v8m_fpccr_s_wrong = false; + } + + if (s->v7m_new_fp_ctxt_needed) { + /* + * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, + * the FPSCR, and VPR. + */ + TCGv_i32 control, fpscr; + uint32_t bits = R_V7M_CONTROL_FPCA_MASK; + + fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + if (dc_isar_feature(aa32_mve, s)) { + store_cpu_field(tcg_constant_i32(0), v7m.vpr); + } + /* + * We just updated the FPSCR and VPR. Some of this state is cached + * in the MVE_NO_PRED TB flag. We want to avoid having to end the + * TB here, which means we need the new value of the MVE_NO_PRED + * flag to be exactly known here and the same for all executions. + * Luckily FPDSCR.LTPSIZE is always constant 4 and the VPR is + * always set to 0, so the new MVE_NO_PRED flag is always 1 + * if and only if we have MVE. + * + * (The other FPSCR state cached in TB flags is VECLEN and VECSTRIDE, + * but those do not exist for M-profile, so are not relevant here.) + */ + s->mve_no_pred = dc_isar_feature(aa32_mve, s); + + if (s->v8m_secure) { + bits |= R_V7M_CONTROL_SFPA_MASK; + } + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_ori_i32(control, control, bits); + store_cpu_field(control, v7m.control[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v7m_new_fp_ctxt_needed = false; + } +} + +/* + * Check that VFP access is enabled, A-profile specific version. + * + * If VFP is enabled, return true. If not, emit code to generate an + * appropriate exception and return false. + * The ignore_vfp_enabled argument specifies that we should ignore + * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX + * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns. + */ +static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) +{ + if (s->fp_excp_el) { + /* + * The full syndrome is only used for HSR when HCPTR traps: + * For v8, when TA==0, coproc is RES0. 
+ * For v7, any use of a Floating-point instruction or access + * to a Floating-point Extension register that is trapped to + * Hyp mode because of a trap configured in the HCPTR sets + * this field to 0xA. + */ + int coproc = arm_dc_feature(s, ARM_FEATURE_V8) ? 0 : 0xa; + uint32_t syn = syn_fp_access_trap(1, 0xe, false, coproc); + + gen_exception_insn_el(s, 0, EXCP_UDEF, syn, s->fp_excp_el); + return false; + } + + /* + * Note that rebuild_hflags_a32 has already accounted for being in EL0 + * and the higher EL in A64 mode, etc. Unlike A64 mode, there do not + * appear to be any insns which touch VFP which are allowed. + */ + if (s->sme_trap_nonstreaming) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_Streaming, + curr_insn_len(s) == 2)); + return false; + } + + if (!s->vfp_enabled && !ignore_vfp_enabled) { + assert(!arm_dc_feature(s, ARM_FEATURE_M)); + unallocated_encoding(s); + return false; + } + return true; +} + +/* + * Check that VFP access is enabled, M-profile specific version. + * + * If VFP is enabled, do the necessary M-profile lazy-FP handling and then + * return true. If not, emit code to generate an appropriate exception and + * return false. + * skip_context_update is true to skip the "update FP context" part of this. + */ +bool vfp_access_check_m(DisasContext *s, bool skip_context_update) +{ + if (s->fp_excp_el) { + /* + * M-profile mostly catches the "FPU disabled" case early, in + * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) + * which do coprocessor-checks are outside the large ranges of + * the encoding space handled by the patterns in m-nocp.decode, + * and for them we may need to raise NOCP here. + */ + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return false; + } + + /* Handle M-profile lazy FP state mechanics */ + + /* Trigger lazy-state preservation if necessary */ + gen_preserve_fp_state(s, skip_context_update); + + if (!skip_context_update) { + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); + } + + return true; +} + +/* + * The most usual kind of VFP access check, for everything except + * FMXR/FMRX to the always-available special registers. 
+ */ +bool vfp_access_check(DisasContext *s) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return vfp_access_check_m(s, false); + } else { + return vfp_access_check_a(s, false); + } +} + +static bool trans_VSEL(DisasContext *s, arg_VSEL *a) +{ + uint32_t rd, rn, rm; + int sz = a->sz; + + if (!dc_isar_feature(aa32_vsel, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && + ((a->vm | a->vn | a->vd) & 0x10)) { + return false; + } + + rd = a->vd; + rn = a->vn; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 3) { + TCGv_i64 frn, frm, dest; + TCGv_i64 tmp, zero, zf, nf, vf; + + zero = tcg_constant_i64(0); + + frn = tcg_temp_new_i64(); + frm = tcg_temp_new_i64(); + dest = tcg_temp_new_i64(); + + zf = tcg_temp_new_i64(); + nf = tcg_temp_new_i64(); + vf = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(zf, cpu_ZF); + tcg_gen_ext_i32_i64(nf, cpu_NF); + tcg_gen_ext_i32_i64(vf, cpu_VF); + + vfp_load_reg64(frn, rn); + vfp_load_reg64(frm, rm); + switch (a->cc) { + case 0: /* eq: Z */ + tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero, frn, frm); + break; + case 1: /* vs: V */ + tcg_gen_movcond_i64(TCG_COND_LT, dest, vf, zero, frn, frm); + break; + case 2: /* ge: N == V -> N ^ V == 0 */ + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, vf, nf); + tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, frn, frm); + tcg_temp_free_i64(tmp); + break; + case 3: /* gt: !Z && N == V */ + tcg_gen_movcond_i64(TCG_COND_NE, dest, zf, zero, frn, frm); + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, vf, nf); + tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, dest, frm); + tcg_temp_free_i64(tmp); + break; + } + vfp_store_reg64(dest, rd); + tcg_temp_free_i64(frn); + tcg_temp_free_i64(frm); + tcg_temp_free_i64(dest); + + tcg_temp_free_i64(zf); + tcg_temp_free_i64(nf); + tcg_temp_free_i64(vf); + } else { + TCGv_i32 frn, frm, dest; + TCGv_i32 tmp, zero; + + zero = tcg_constant_i32(0); + + frn = tcg_temp_new_i32(); + frm = tcg_temp_new_i32(); + dest = tcg_temp_new_i32(); + vfp_load_reg32(frn, rn); + vfp_load_reg32(frm, rm); + switch (a->cc) { + case 0: /* eq: Z */ + tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero, frn, frm); + break; + case 1: /* vs: V */ + tcg_gen_movcond_i32(TCG_COND_LT, dest, cpu_VF, zero, frn, frm); + break; + case 2: /* ge: N == V -> N ^ V == 0 */ + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); + tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, frn, frm); + tcg_temp_free_i32(tmp); + break; + case 3: /* gt: !Z && N == V */ + tcg_gen_movcond_i32(TCG_COND_NE, dest, cpu_ZF, zero, frn, frm); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); + tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, dest, frm); + tcg_temp_free_i32(tmp); + break; + } + /* For fp16 the top half is always zeroes */ + if (sz == 1) { + tcg_gen_andi_i32(dest, dest, 0xffff); + } + vfp_store_reg32(dest, rd); + tcg_temp_free_i32(frn); + tcg_temp_free_i32(frm); + tcg_temp_free_i32(dest); + } + + return true; +} + +/* + * Table for converting the most common AArch32 encoding of + * rounding mode to arm_fprounding order (which matches the + * common AArch64 order); see ARM ARM pseudocode FPDecodeRM(). 
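+ * (For example, the 0b00 encoding used by VRINTA maps to + * FPROUNDING_TIEAWAY.)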
+ */ +static const uint8_t fp_decode_rm[] = { + FPROUNDING_TIEAWAY, + FPROUNDING_TIEEVEN, + FPROUNDING_POSINF, + FPROUNDING_NEGINF, +}; + +static bool trans_VRINT(DisasContext *s, arg_VRINT *a) +{ + uint32_t rd, rm; + int sz = a->sz; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode; + int rounding = fp_decode_rm[a->rm]; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && + ((a->vm | a->vd) & 0x10)) { + return false; + } + + rd = a->vd; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 1) { + fpst = fpstatus_ptr(FPST_FPCR_F16); + } else { + fpst = fpstatus_ptr(FPST_FPCR); + } + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + + if (sz == 3) { + TCGv_i64 tcg_op; + TCGv_i64 tcg_res; + tcg_op = tcg_temp_new_i64(); + tcg_res = tcg_temp_new_i64(); + vfp_load_reg64(tcg_op, rm); + gen_helper_rintd(tcg_res, tcg_op, fpst); + vfp_store_reg64(tcg_res, rd); + tcg_temp_free_i64(tcg_op); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op; + TCGv_i32 tcg_res; + tcg_op = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + vfp_load_reg32(tcg_op, rm); + if (sz == 1) { + gen_helper_rinth(tcg_res, tcg_op, fpst); + } else { + gen_helper_rints(tcg_res, tcg_op, fpst); + } + vfp_store_reg32(tcg_res, rd); + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT(DisasContext *s, arg_VCVT *a) +{ + uint32_t rd, rm; + int sz = a->sz; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode, tcg_shift; + int rounding = fp_decode_rm[a->rm]; + bool is_signed = a->op; + + if (!dc_isar_feature(aa32_vcvt_dr, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + rd = a->vd; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 1) { + fpst = fpstatus_ptr(FPST_FPCR_F16); + } else { + fpst = fpstatus_ptr(FPST_FPCR); + } + + tcg_shift = tcg_constant_i32(0); + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + + if (sz == 3) { + TCGv_i64 tcg_double, tcg_res; + TCGv_i32 tcg_tmp; + tcg_double = tcg_temp_new_i64(); + tcg_res = tcg_temp_new_i64(); + tcg_tmp = tcg_temp_new_i32(); + vfp_load_reg64(tcg_double, rm); + if (is_signed) { + gen_helper_vfp_tosld(tcg_res, tcg_double, tcg_shift, fpst); + } else { + gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst); + } + tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res); + vfp_store_reg32(tcg_tmp, rd); + tcg_temp_free_i32(tcg_tmp); + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_double); + } else { + TCGv_i32 tcg_single, tcg_res; + tcg_single = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + vfp_load_reg32(tcg_single, rm); + if (sz == 1) { + if (is_signed) { + gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst); + } else { + gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst); + } + } else { + if (is_signed) { + 
gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst); + } else { + gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst); + } + } + vfp_store_reg32(tcg_res, rd); + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_single); + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + + tcg_temp_free_ptr(fpst); + + return true; +} + +bool mve_skip_vmov(DisasContext *s, int vn, int index, int size) +{ + /* + * In a CPU with MVE, the VMOV (vector lane to general-purpose register) + * and VMOV (general-purpose register to vector lane) insns are not + * predicated, but they are subject to beatwise execution if they are + * not in an IT block. + * + * Since our implementation always executes all 4 beats in one tick, + * this means only that if PSR.ECI says we should not be executing + * the beat corresponding to the lane of the vector register being + * accessed then we should skip performing the move, and that we need + * to do the usual check for bad ECI state and advance of ECI state. + * + * Note that if PSR.ECI is non-zero then we cannot be in an IT block. + * + * Return true if this VMOV scalar <-> gpreg should be skipped because + * the MVE PSR.ECI state says we skip the beat where the store happens. + */ + + /* Calculate the byte offset into Qn which we're going to access */ + int ofs = (index << size) + ((vn & 1) * 8); + + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + return ofs < 4; + case ECI_A0A1: + return ofs < 8; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return ofs < 12; + default: + g_assert_not_reached(); + } +} + +static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) +{ + /* VMOV scalar to general purpose register */ + TCGv_i32 tmp; + + /* + * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has + * all sizes, whether the CPU has fp or not. + */ + if (!dc_isar_feature(aa32_mve, s)) { + if (a->size == MO_32 + ? !dc_isar_feature(aa32_fpsp_v2, s) + : !arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, a->index, + a->size | (a->u ? 0 : MO_SIGN)); + store_reg(s, a->rt, tmp); + } + + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } + return true; +} + +static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a) +{ + /* VMOV general purpose register to scalar */ + TCGv_i32 tmp; + + /* + * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has + * all sizes, whether the CPU has fp or not. + */ + if (!dc_isar_feature(aa32_mve, s)) { + if (a->size == MO_32 + ? 
!dc_isar_feature(aa32_fpsp_v2, s) + : !arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = load_reg(s, a->rt); + write_neon_element32(tmp, a->vn, a->index, a->size); + tcg_temp_free_i32(tmp); + } + + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } + return true; +} + +static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ + /* VDUP (general purpose register) */ + TCGv_i32 tmp; + int size, vec_size; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (a->b && a->e) { + return false; + } + + if (a->q && (a->vn & 1)) { + return false; + } + + vec_size = a->q ? 16 : 8; + if (a->b) { + size = 0; + } else if (a->e) { + size = 1; + } else { + size = 2; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = load_reg(s, a->rt); + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(a->vn), + vec_size, vec_size, tmp); + tcg_temp_free_i32(tmp); + + return true; +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + TCGv_i32 tmp; + bool ignore_vfp_enabled = false; + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + /* M profile version was already handled in m-nocp.decode */ + return false; + } + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + switch (a->reg) { + case ARM_VFP_FPSID: + /* + * VFPv2 allows access to FPSID from userspace; VFPv3 restricts + * all ID registers to privileged access only. + */ + if (IS_USER(s) && dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_MVFR)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_MVFR2: + if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_FPSCR: + break; + case ARM_VFP_FPEXC: + if (IS_USER(s)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + /* Not present in VFPv3 */ + if (IS_USER(s) || dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + break; + default: + return false; + } + + /* + * Call vfp_access_check_a() directly, because we need to tell + * it to ignore FPEXC.EN for some register accesses. 
+ */ + if (!vfp_access_check_a(s, ignore_vfp_enabled)) { + return true; + } + + if (a->l) { + /* VMRS, move VFP special register to gp register */ + switch (a->reg) { + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + case ARM_VFP_MVFR2: + case ARM_VFP_FPSID: + if (s->current_el == 1) { + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_check_hcr_el2_trap(cpu_env, + tcg_constant_i32(a->rt), + tcg_constant_i32(a->reg)); + } + /* fall through */ + case ARM_VFP_FPEXC: + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + tmp = load_cpu_field(vfp.xregs[a->reg]); + break; + case ARM_VFP_FPSCR: + if (a->rt == 15) { + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + } else { + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + } + break; + default: + g_assert_not_reached(); + } + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR. */ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, a->rt, tmp); + } + } else { + /* VMSR, move gp register to VFP special register */ + switch (a->reg) { + case ARM_VFP_FPSID: + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + case ARM_VFP_MVFR2: + /* Writes are ignored. */ + break; + case ARM_VFP_FPSCR: + tmp = load_reg(s, a->rt); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPEXC: + /* + * TODO: VFP subarchitecture support. + * For now, keep the EN bit only + */ + tmp = load_reg(s, a->rt); + tcg_gen_andi_i32(tmp, tmp, 1 << 30); + store_cpu_field(tmp, vfp.xregs[a->reg]); + gen_lookup_tb(s); + break; + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + tmp = load_reg(s, a->rt); + store_cpu_field(tmp, vfp.xregs[a->reg]); + break; + default: + g_assert_not_reached(); + } + } + + return true; +} + + +static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->l) { + /* VFP to general purpose register */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vn); + tcg_gen_andi_i32(tmp, tmp, 0xffff); + store_reg(s, a->rt, tmp); + } else { + /* general purpose register to VFP */ + tmp = load_reg(s, a->rt); + tcg_gen_andi_i32(tmp, tmp, 0xffff); + vfp_store_reg32(tmp, a->vn); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->l) { + /* VFP to general purpose register */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vn); + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR. 
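+ * (gen_set_nzcv() takes N, Z, C and V from bits [31:28] of the value.)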
*/ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, a->rt, tmp); + } + } else { + /* general purpose register to VFP */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vn); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_64_sp(DisasContext *s, arg_VMOV_64_sp *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* + * VMOV between two general-purpose registers and two single precision + * floating point registers + */ + if (!vfp_access_check(s)) { + return true; + } + + if (a->op) { + /* fpreg to gpreg */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + store_reg(s, a->rt, tmp); + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm + 1); + store_reg(s, a->rt2, tmp); + } else { + /* gpreg to fpreg */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vm); + tcg_temp_free_i32(tmp); + tmp = load_reg(s, a->rt2); + vfp_store_reg32(tmp, a->vm + 1); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a) +{ + TCGv_i32 tmp; + + /* + * VMOV between two general-purpose registers and one double precision + * floating point register. Note that this does not require support + * for double precision arithmetic. + */ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->op) { + /* fpreg to gpreg */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm * 2); + store_reg(s, a->rt, tmp); + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm * 2 + 1); + store_reg(s, a->rt2, tmp); + } else { + /* gpreg to fpreg */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vm * 2); + tcg_temp_free_i32(tmp); + tmp = load_reg(s, a->rt2); + vfp_store_reg32(tmp, a->vm * 2 + 1); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */ + offset = a->imm << 1; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i32(); + if (a->l) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); + vfp_store_reg32(tmp, a->vd); + } else { + vfp_load_reg32(tmp, a->vd); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + offset = a->imm << 2; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. 
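+ * For A32 this is the literal form: add_reg_for_lit() substitutes
+ * the PC-relative base when rn == 15, so one code path serves
+ * both encodings.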
*/ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i32(); + if (a->l) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + vfp_store_reg32(tmp, a->vd); + } else { + vfp_load_reg32(tmp, a->vd); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDR_VSTR_dp(DisasContext *s, arg_VLDR_VSTR_dp *a) +{ + uint32_t offset; + TCGv_i32 addr; + TCGv_i64 tmp; + + /* Note that this does not require support for double arithmetic. */ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + offset = a->imm << 2; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i64(); + if (a->l) { + gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + vfp_store_reg64(tmp, a->vd); + } else { + vfp_load_reg64(tmp, a->vd); + gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + } + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + int i, n; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + n = a->imm; + + if (n == 0 || (a->vd + n) > 32) { + /* + * UNPREDICTABLE cases for bad immediates: we choose to + * UNDEF to avoid generating huge numbers of TCG ops + */ + return false; + } + if (a->rn == 15 && a->w) { + /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ + return false; + } + + s->eci_handled = true; + + if (!vfp_access_check(s)) { + return true; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, 0); + if (a->p) { + /* pre-decrement */ + tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Here 'addr' is the lowest address we will store to, + * and is either the old SP (if post-increment) or + * the new SP (if pre-decrement). For post-increment + * where the old value is below the limit and the new + * value is above, it is UNKNOWN whether the limit check + * triggers; we choose to trigger. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + offset = 4; + tmp = tcg_temp_new_i32(); + for (i = 0; i < n; i++) { + if (a->l) { + /* load */ + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + vfp_store_reg32(tmp, a->vd + i); + } else { + /* store */ + vfp_load_reg32(tmp, a->vd + i); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + } + tcg_gen_addi_i32(addr, addr, offset); + } + tcg_temp_free_i32(tmp); + if (a->w) { + /* writeback */ + if (a->p) { + offset = -offset * n; + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + + clear_eci_state(s); + return true; +} + +static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a) +{ + uint32_t offset; + TCGv_i32 addr; + TCGv_i64 tmp; + int i, n; + + /* Note that this does not require support for double arithmetic. 
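+ * VLDM/VSTM of D registers only move bit patterns, so they must
+ * also work on single-precision-only FPUs and on MVE cores, which
+ * have the register file but not VFPv2 (hence the OR of the two
+ * feature tests below).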
*/ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + n = a->imm >> 1; + + if (n == 0 || (a->vd + n) > 32 || n > 16) { + /* + * UNPREDICTABLE cases for bad immediates: we choose to + * UNDEF to avoid generating huge numbers of TCG ops + */ + return false; + } + if (a->rn == 15 && a->w) { + /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd + n) > 16) { + return false; + } + + s->eci_handled = true; + + if (!vfp_access_check(s)) { + return true; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, 0); + if (a->p) { + /* pre-decrement */ + tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Here 'addr' is the lowest address we will store to, + * and is either the old SP (if post-increment) or + * the new SP (if pre-decrement). For post-increment + * where the old value is below the limit and the new + * value is above, it is UNKNOWN whether the limit check + * triggers; we choose to trigger. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + offset = 8; + tmp = tcg_temp_new_i64(); + for (i = 0; i < n; i++) { + if (a->l) { + /* load */ + gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + vfp_store_reg64(tmp, a->vd + i); + } else { + /* store */ + vfp_load_reg64(tmp, a->vd + i); + gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + } + tcg_gen_addi_i32(addr, addr, offset); + } + tcg_temp_free_i64(tmp); + if (a->w) { + /* writeback */ + if (a->p) { + offset = -offset * n; + } else if (a->imm & 1) { + offset = 4; + } else { + offset = 0; + } + + if (offset != 0) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + + clear_eci_state(s); + return true; +} + +/* + * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp(). + * The callback should emit code to write a value to vd. If + * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd + * will contain the old value of the relevant VFP register; + * otherwise it must be written to only. + */ +typedef void VFPGen3OpSPFn(TCGv_i32 vd, + TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst); +typedef void VFPGen3OpDPFn(TCGv_i64 vd, + TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst); + +/* + * Types for callbacks for do_vfp_2op_sp() and do_vfp_2op_dp(). + * The callback should emit code to write a value to vd (which + * should be written to only). 
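+ *
+ * gen_VSQRT_sp() further down is a typical instance: it simply
+ * forwards to the helper, while do_vfp_2op_sp() supplies the
+ * register loads, the store and the short-vector iteration.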
+ */ +typedef void VFPGen2OpSPFn(TCGv_i32 vd, TCGv_i32 vm); +typedef void VFPGen2OpDPFn(TCGv_i64 vd, TCGv_i64 vm); + +/* + * Return true if the specified S reg is in a scalar bank + * (ie if it is s0..s7) + */ +static inline bool vfp_sreg_is_scalar(int reg) +{ + return (reg & 0x18) == 0; +} + +/* + * Return true if the specified D reg is in a scalar bank + * (ie if it is d0..d3 or d16..d19) + */ +static inline bool vfp_dreg_is_scalar(int reg) +{ + return (reg & 0xc) == 0; +} + +/* + * Advance the S reg number forwards by delta within its bank + * (ie increment the low 3 bits but leave the rest the same) + */ +static inline int vfp_advance_sreg(int reg, int delta) +{ + return ((reg + delta) & 0x7) | (reg & ~0x7); +} + +/* + * Advance the D reg number forwards by delta within its bank + * (ie increment the low 2 bits but leave the rest the same) + */ +static inline int vfp_advance_dreg(int reg, int delta) +{ + return ((reg + delta) & 0x3) | (reg & ~0x3); +} + +/* + * Perform a 3-operand VFP data processing instruction. fn is the + * callback to do the actual operation; this function deals with the + * code to handle looping around for VFP vector processing. + */ +static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + + if (vfp_sreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i32(); + f1 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR); + + vfp_load_reg32(f0, vn); + vfp_load_reg32(f1, vm); + + for (;;) { + if (reads_vd) { + vfp_load_reg32(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + vn = vfp_advance_sreg(vn, delta_d); + vfp_load_reg32(f0, vn); + if (delta_m) { + vm = vfp_advance_sreg(vm, delta_m); + vfp_load_reg32(f1, vm); + } + } + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(f1); + tcg_temp_free_i32(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + /* + * Do a half-precision operation. 
Functionally this is + * the same as do_vfp_3op_sp(), except: + * - it uses the FPST_FPCR_F16 + * - it doesn't need the VFP vector handling (fp16 is a + * v8 feature, and in v8 VFP vectors don't exist) + * - it does the aa32_fp16_arith feature test + */ + TCGv_i32 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + f0 = tcg_temp_new_i32(); + f1 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR_F16); + + vfp_load_reg32(f0, vn); + vfp_load_reg32(f1, vm); + + if (reads_vd) { + vfp_load_reg32(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg32(fd, vd); + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(f1); + tcg_temp_free_i32(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vn | vm) & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_dreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = (s->vec_stride >> 1) + 1; + + if (vfp_dreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i64(); + f1 = tcg_temp_new_i64(); + fd = tcg_temp_new_i64(); + fpst = fpstatus_ptr(FPST_FPCR); + + vfp_load_reg64(f0, vn); + vfp_load_reg64(f1, vm); + + for (;;) { + if (reads_vd) { + vfp_load_reg64(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg64(fd, vd); + + if (veclen == 0) { + break; + } + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_dreg(vd, delta_d); + vn = vfp_advance_dreg(vn, delta_d); + vfp_load_reg64(f0, vn); + if (delta_m) { + vm = vfp_advance_dreg(vm, delta_m); + vfp_load_reg64(f1, vm); + } + } + + tcg_temp_free_i64(f0); + tcg_temp_free_i64(f1); + tcg_temp_free_i64(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 f0, fd; + + /* Note that the caller must check the aa32_fpsp_v2 feature. */ + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
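+ * A scalar-bank destination (s0..s7) means a plain scalar
+ * operation regardless of LEN/STRIDE; a scalar-bank Vm with a
+ * vector destination is the one-to-many broadcast handled by the
+ * delta_m == 0 branch in the loop below.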
*/ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + + if (vfp_sreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + + vfp_load_reg32(f0, vm); + + for (;;) { + fn(fd, f0); + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + if (delta_m == 0) { + /* single source one-many */ + while (veclen--) { + vd = vfp_advance_sreg(vd, delta_d); + vfp_store_reg32(fd, vd); + } + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + vm = vfp_advance_sreg(vm, delta_m); + vfp_load_reg32(f0, vm); + } + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(fd); + + return true; +} + +static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) +{ + /* + * Do a half-precision operation. Functionally this is + * the same as do_vfp_2op_sp(), except: + * - it doesn't need the VFP vector handling (fp16 is a + * v8 feature, and in v8 VFP vectors don't exist) + * - it does the aa32_fp16_arith feature test + */ + TCGv_i32 f0; + + /* Note that the caller must check the aa32_fp16_arith feature */ + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + f0 = tcg_temp_new_i32(); + vfp_load_reg32(f0, vm); + fn(f0, f0); + vfp_store_reg32(f0, vd); + tcg_temp_free_i32(f0); + + return true; +} + +static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 f0, fd; + + /* Note that the caller must check the aa32_fpdp_v2 feature. */ + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vm) & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
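+ * Same rules as the single-precision case, but the deltas are in
+ * D registers: the stride is halved and vfp_advance_dreg() wraps
+ * within a 4-register rather than an 8-register bank.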
*/
+        if (vfp_dreg_is_scalar(vd)) {
+            /* scalar */
+            veclen = 0;
+        } else {
+            delta_d = (s->vec_stride >> 1) + 1;
+
+            if (vfp_dreg_is_scalar(vm)) {
+                /* mixed scalar/vector */
+                delta_m = 0;
+            } else {
+                /* vector */
+                delta_m = delta_d;
+            }
+        }
+    }
+
+    f0 = tcg_temp_new_i64();
+    fd = tcg_temp_new_i64();
+
+    vfp_load_reg64(f0, vm);
+
+    for (;;) {
+        fn(fd, f0);
+        vfp_store_reg64(fd, vd);
+
+        if (veclen == 0) {
+            break;
+        }
+
+        if (delta_m == 0) {
+            /* single source one-many */
+            while (veclen--) {
+                vd = vfp_advance_dreg(vd, delta_d);
+                vfp_store_reg64(fd, vd);
+            }
+            break;
+        }
+
+        /* Set up the operands for the next iteration */
+        veclen--;
+        vd = vfp_advance_dreg(vd, delta_d);
+        vm = vfp_advance_dreg(vm, delta_m);
+        vfp_load_reg64(f0, vm);
+    }
+
+    tcg_temp_free_i64(f0);
+    tcg_temp_free_i64(fd);
+
+    return true;
+}
+
+static void gen_VMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* Note that order of inputs to the add matters for NaNs */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+    gen_helper_vfp_addh(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_hp(DisasContext *s, arg_VMLA_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_VMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* Note that order of inputs to the add matters for NaNs */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_muls(tmp, vn, vm, fpst);
+    gen_helper_vfp_adds(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /* Note that order of inputs to the add matters for NaNs */
+    TCGv_i64 tmp = tcg_temp_new_i64();
+
+    gen_helper_vfp_muld(tmp, vn, vm, fpst);
+    gen_helper_vfp_addd(vd, vd, tmp, fpst);
+    tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /*
+     * VMLS: vd = vd + -(vn * vm)
+     * Note that order of inputs to the add matters for NaNs.
+     */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+    gen_helper_vfp_negh(tmp, tmp);
+    gen_helper_vfp_addh(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLS_hp(DisasContext *s, arg_VMLS_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_VMLS_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /*
+     * VMLS: vd = vd + -(vn * vm)
+     * Note that order of inputs to the add matters for NaNs.
+     */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_muls(tmp, vn, vm, fpst);
+    gen_helper_vfp_negs(tmp, tmp);
+    gen_helper_vfp_adds(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLS_sp(DisasContext *s, arg_VMLS_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_VMLS_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /*
+     * VMLS: vd = vd + -(vn * vm)
+     * Note that order of inputs to the add matters for NaNs.
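+     * (Which input NaN propagates through the add depends on
+     * operand order, so vd + (-product) is not interchangeable
+     * with (-product) + vd.)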
+ */ + TCGv_i64 tmp = tcg_temp_new_i64(); + + gen_helper_vfp_muld(tmp, vn, vm, fpst); + gen_helper_vfp_negd(tmp, tmp); + gen_helper_vfp_addd(vd, vd, tmp, fpst); + tcg_temp_free_i64(tmp); +} + +static bool trans_VMLS_dp(DisasContext *s, arg_VMLS_dp *a) +{ + return do_vfp_3op_dp(s, gen_VMLS_dp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_mulh(tmp, vn, vm, fpst); + gen_helper_vfp_negh(vd, vd); + gen_helper_vfp_addh(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLS_hp(DisasContext *s, arg_VNMLS_sp *a) +{ + return do_vfp_3op_hp(s, gen_VNMLS_hp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_muls(tmp, vn, vm, fpst); + gen_helper_vfp_negs(vd, vd); + gen_helper_vfp_adds(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLS_sp(DisasContext *s, arg_VNMLS_sp *a) +{ + return do_vfp_3op_sp(s, gen_VNMLS_sp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. 
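+ * (For example with fd = NaN, -fd + x must produce the NaN with
+ * its sign bit flipped by the negation, whereas x - fd would
+ * propagate it unmodified.)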
+ */
+    TCGv_i64 tmp = tcg_temp_new_i64();
+
+    gen_helper_vfp_muld(tmp, vn, vm, fpst);
+    gen_helper_vfp_negd(vd, vd);
+    gen_helper_vfp_addd(vd, vd, tmp, fpst);
+    tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VNMLS_dp(DisasContext *s, arg_VNMLS_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_VNMLS_dp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* VNMLA: -fd + -(fn * fm) */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+    gen_helper_vfp_negh(tmp, tmp);
+    gen_helper_vfp_negh(vd, vd);
+    gen_helper_vfp_addh(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLA_hp(DisasContext *s, arg_VNMLA_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_VNMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* VNMLA: -fd + -(fn * fm) */
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    gen_helper_vfp_muls(tmp, vn, vm, fpst);
+    gen_helper_vfp_negs(tmp, tmp);
+    gen_helper_vfp_negs(vd, vd);
+    gen_helper_vfp_adds(vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLA_sp(DisasContext *s, arg_VNMLA_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_VNMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VNMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /* VNMLA: -fd + -(fn * fm) */
+    TCGv_i64 tmp = tcg_temp_new_i64();
+
+    gen_helper_vfp_muld(tmp, vn, vm, fpst);
+    gen_helper_vfp_negd(tmp, tmp);
+    gen_helper_vfp_negd(vd, vd);
+    gen_helper_vfp_addd(vd, vd, tmp, fpst);
+    tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VNMLA_dp(DisasContext *s, arg_VNMLA_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_VNMLA_dp, a->vd, a->vn, a->vm, true);
+}
+
+static bool trans_VMUL_hp(DisasContext *s, arg_VMUL_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_helper_vfp_mulh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMUL_dp(DisasContext *s, arg_VMUL_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_helper_vfp_muld, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* VNMUL: -(fn * fm) */
+    gen_helper_vfp_mulh(vd, vn, vm, fpst);
+    gen_helper_vfp_negh(vd, vd);
+}
+
+static bool trans_VNMUL_hp(DisasContext *s, arg_VNMUL_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_VNMUL_hp, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* VNMUL: -(fn * fm) */
+    gen_helper_vfp_muls(vd, vn, vm, fpst);
+    gen_helper_vfp_negs(vd, vd);
+}
+
+static bool trans_VNMUL_sp(DisasContext *s, arg_VNMUL_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_VNMUL_sp, a->vd, a->vn, a->vm, false);
+}
+
+static void gen_VNMUL_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /* VNMUL: -(fn * fm) */
+    gen_helper_vfp_muld(vd, vn, vm, fpst);
+    gen_helper_vfp_negd(vd, vd);
+}
+
+static bool trans_VNMUL_dp(DisasContext *s, arg_VNMUL_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_VNMUL_dp, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_hp(DisasContext *s, arg_VADD_sp *a)
+{
+    return do_vfp_3op_hp(s, gen_helper_vfp_addh, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_sp(DisasContext *s, arg_VADD_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_helper_vfp_adds, a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VADD_dp(DisasContext *s, arg_VADD_dp *a)
+{
+    return do_vfp_3op_dp(s, gen_helper_vfp_addd, a->vd,
a->vn, a->vm, false); +} + +static bool trans_VSUB_hp(DisasContext *s, arg_VSUB_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_subh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VSUB_sp(DisasContext *s, arg_VSUB_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_subs, a->vd, a->vn, a->vm, false); +} + +static bool trans_VSUB_dp(DisasContext *s, arg_VSUB_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_subd, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_hp(DisasContext *s, arg_VDIV_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_divh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_sp(DisasContext *s, arg_VDIV_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_divs, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_hp(DisasContext *s, arg_VMINNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_hp(s, gen_helper_vfp_minnumh, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_hp(DisasContext *s, arg_VMAXNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_hp(s, gen_helper_vfp_maxnumh, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_sp(DisasContext *s, arg_VMINNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_sp(s, gen_helper_vfp_minnums, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_sp(DisasContext *s, arg_VMAXNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_sp(s, gen_helper_vfp_maxnums, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_dp(DisasContext *s, arg_VMINNM_dp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_dp(s, gen_helper_vfp_minnumd, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_dp(s, gen_helper_vfp_maxnumd, + a->vd, a->vn, a->vm, false); +} + +static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i32 vn, vm, vd; + + /* + * Present in VFPv4 only, and only with the FP16 extension. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. 
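+ * (The field in question is MVFR1.SIMDFMAC, which also covers the
+ * Advanced SIMD VFMA; hence the extra fp16-arith and fpsp checks
+ * in the condition below.)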
+ */ + if (!dc_isar_feature(aa32_fp16_arith, s) || + !dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i32(); + + vfp_load_reg32(vn, a->vn); + vfp_load_reg32(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negh(vn, vn); + } + vfp_load_reg32(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negh(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst); + vfp_store_reg32(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(vn); + tcg_temp_free_i32(vm); + tcg_temp_free_i32(vd); + + return true; +} + +static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i32 vn, vm, vd; + + /* + * Present in VFPv4 only. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. + */ + if (!dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + /* + * In v7A, UNPREDICTABLE with non-zero vector length/stride; from + * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. + */ + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i32(); + + vfp_load_reg32(vn, a->vn); + vfp_load_reg32(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negs(vn, vn); + } + vfp_load_reg32(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negs(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_vfp_muladds(vd, vn, vm, vd, fpst); + vfp_store_reg32(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(vn); + tcg_temp_free_i32(vm); + tcg_temp_free_i32(vd); + + return true; +} + +static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i64 vn, vm, vd; + + /* + * Present in VFPv4 only. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. + */ + if (!dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + /* + * In v7A, UNPREDICTABLE with non-zero vector length/stride; from + * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. 
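+ * (UNPREDICTABLE leaves UNDEF as a permitted choice, and it also
+ * matches the mandatory v8A behaviour, so one check covers both
+ * architecture versions.)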
+ */ + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i64(); + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i64(); + + vfp_load_reg64(vn, a->vn); + vfp_load_reg64(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negd(vn, vn); + } + vfp_load_reg64(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negd(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_vfp_muladdd(vd, vn, vm, vd, fpst); + vfp_store_reg64(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(vn); + tcg_temp_free_i64(vm); + tcg_temp_free_i64(vd); + + return true; +} + +#define MAKE_ONE_VFM_TRANS_FN(INSN, PREC, NEGN, NEGD) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + return do_vfm_##PREC(s, a, NEGN, NEGD); \ + } + +#define MAKE_VFM_TRANS_FNS(PREC) \ + MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \ + MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \ + MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \ + MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true) + +MAKE_VFM_TRANS_FNS(hp) +MAKE_VFM_TRANS_FNS(sp) +MAKE_VFM_TRANS_FNS(dp) + +static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a) +{ + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vfp_store_reg32(tcg_constant_i32(vfp_expand_imm(MO_16, a->imm)), a->vd); + return true; +} + +static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a) +{ + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 fd; + uint32_t vd; + + vd = a->vd; + + if (!dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + } + } + + fd = tcg_constant_i32(vfp_expand_imm(MO_32, a->imm)); + + for (;;) { + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + } + + return true; +} + +static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a) +{ + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 fd; + uint32_t vd; + + vd = a->vd; + + if (!dc_isar_feature(aa32_fpdp_v3, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (vd & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
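+ * VMOV immediate has no Vm operand, so the only question is
+ * whether the destination is scalar or vector; there is no
+ * delta_m to compute.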
*/ + if (vfp_dreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = (s->vec_stride >> 1) + 1; + } + } + + fd = tcg_constant_i64(vfp_expand_imm(MO_64, a->imm)); + + for (;;) { + vfp_store_reg64(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_dreg(vd, delta_d); + } + + return true; +} + +#define DO_VFP_2OP(INSN, PREC, FN, CHECK) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + if (!dc_isar_feature(CHECK, s)) { \ + return false; \ + } \ + return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ + } + +#define DO_VFP_VMOV(INSN, PREC, FN) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + if (!dc_isar_feature(aa32_fp##PREC##_v2, s) && \ + !dc_isar_feature(aa32_mve, s)) { \ + return false; \ + } \ + return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ + } + +DO_VFP_VMOV(VMOV_reg, sp, tcg_gen_mov_i32) +DO_VFP_VMOV(VMOV_reg, dp, tcg_gen_mov_i64) + +DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh, aa32_fp16_arith) +DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss, aa32_fpsp_v2) +DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd, aa32_fpdp_v2) + +DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh, aa32_fp16_arith) +DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs, aa32_fpsp_v2) +DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd, aa32_fpdp_v2) + +static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm) +{ + gen_helper_vfp_sqrth(vd, vm, cpu_env); +} + +static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm) +{ + gen_helper_vfp_sqrts(vd, vm, cpu_env); +} + +static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm) +{ + gen_helper_vfp_sqrtd(vd, vm, cpu_env); +} + +DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp, aa32_fp16_arith) +DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp, aa32_fpsp_v2) +DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp, aa32_fpdp_v2) + +static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a) +{ + TCGv_i32 vd, vm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + + vfp_load_reg32(vd, a->vd); + if (a->z) { + tcg_gen_movi_i32(vm, 0); + } else { + vfp_load_reg32(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmpeh(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmph(vd, vm, cpu_env); + } + + tcg_temp_free_i32(vd); + tcg_temp_free_i32(vm); + + return true; +} + +static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a) +{ + TCGv_i32 vd, vm; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + + vfp_load_reg32(vd, a->vd); + if (a->z) { + tcg_gen_movi_i32(vm, 0); + } else { + vfp_load_reg32(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmpes(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmps(vd, vm, cpu_env); + } + + tcg_temp_free_i32(vd); + tcg_temp_free_i32(vm); + + return true; +} + +static bool trans_VCMP_dp(DisasContext *s, arg_VCMP_dp *a) +{ + TCGv_i64 vd, vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
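+ * (Bit 4 of the D register number selects D16..D31, which are
+ * only present when the 32-register SIMD set is implemented.)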
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i64(); + vm = tcg_temp_new_i64(); + + vfp_load_reg64(vd, a->vd); + if (a->z) { + tcg_gen_movi_i64(vm, 0); + } else { + vfp_load_reg64(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmped(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmpd(vd, vm, cpu_env); + } + + tcg_temp_free_i64(vd); + tcg_temp_free_i64(vm); + + return true; +} + +static bool trans_VCVT_f32_f16(DisasContext *s, arg_VCVT_f32_f16 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + /* The T bit tells us if we want the low or high 16 bits of Vm */ + tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); + gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp_mode); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + TCGv_i64 vd; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fp16_dpconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + /* The T bit tells us if we want the low or high 16 bits of Vm */ + tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); + vd = tcg_temp_new_i64(); + gen_helper_vfp_fcvt_f16_to_f64(vd, tmp, fpst, ahp_mode); + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + tcg_temp_free_i64(vd); + return true; +} + +static bool trans_VCVT_b16_f32(DisasContext *s, arg_VCVT_b16_f32 *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + tmp = tcg_temp_new_i32(); + + vfp_load_reg32(tmp, a->vm); + gen_helper_bfcvt(tmp, tmp, fpst); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + + vfp_load_reg32(tmp, a->vm); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp_mode); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + TCGv_i64 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fp16_dpconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + vm = tcg_temp_new_i64(); + + vfp_load_reg64(vm, a->vm); + gen_helper_vfp_fcvt_f64_to_f16(tmp, vm, fpst, ahp_mode); + tcg_temp_free_i64(vm); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_rinth(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rints(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rintd(tmp, tmp, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + TCGv_i32 tcg_rmode; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rinth(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + TCGv_i32 tcg_rmode; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rints(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + TCGv_i32 tcg_rmode; + + if 
(!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rintd(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(tcg_rmode); + return true; +} + +static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_rinth_exact(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rints_exact(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTX_dp(DisasContext *s, arg_VRINTX_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rintd_exact(tmp, tmp, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VCVT_sp(DisasContext *s, arg_VCVT_sp *a) +{ + TCGv_i64 vd; + TCGv_i32 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i64(); + vfp_load_reg32(vm, a->vm); + gen_helper_vfp_fcvtds(vd, vm, cpu_env); + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_i64(vd); + return true; +} + +static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a) +{ + TCGv_i64 vm; + TCGv_i32 vd; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i64(); + vfp_load_reg64(vm, a->vm); + gen_helper_vfp_fcvtsd(vd, vm, cpu_env); + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_i64(vm); + return true; +} + +static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + if (a->s) { + /* i32 -> f16 */ + gen_helper_vfp_sitoh(vm, vm, fpst); + } else { + /* u32 -> f16 */ + gen_helper_vfp_uitoh(vm, vm, fpst); + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + if (a->s) { + /* i32 -> f32 */ + gen_helper_vfp_sitos(vm, vm, fpst); + } else { + /* u32 -> f32 */ + gen_helper_vfp_uitos(vm, vm, fpst); + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a) +{ + TCGv_i32 vm; + TCGv_i64 vd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i64(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + if (a->s) { + /* i32 -> f64 */ + gen_helper_vfp_sitod(vd, vm, fpst); + } else { + /* u32 -> f64 */ + gen_helper_vfp_uitod(vd, vm, fpst); + } + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_i64(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a) +{ + TCGv_i32 vd; + TCGv_i64 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_jscvt, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i32(); + vfp_load_reg64(vm, a->vm); + gen_helper_vjcvt(vd, vm, cpu_env); + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i64(vm); + tcg_temp_free_i32(vd); + return true; +} + +static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a) +{ + TCGv_i32 vd, shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i32(); + vfp_load_reg32(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR_F16); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a) +{ + TCGv_i32 vd, shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i32(); + vfp_load_reg32(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtos_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltos_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtos_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultos_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshs_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_tosls_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhs_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_touls_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a) +{ + TCGv_i64 vd; + TCGv_i32 shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fpdp_v3, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i64(); + vfp_load_reg64(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtod_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltod_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtod_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultod_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshd_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_tosld_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhd_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_tould_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i64(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizh(vm, vm, fpst); + } else { + gen_helper_vfp_tosih(vm, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizh(vm, vm, fpst); + } else { + gen_helper_vfp_touih(vm, vm, fpst); + } + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizs(vm, vm, fpst); + } else { + gen_helper_vfp_tosis(vm, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizs(vm, vm, fpst); + } else { + gen_helper_vfp_touis(vm, vm, fpst); + } + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a) +{ + TCGv_i32 vd; + TCGv_i64 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i32(); + vfp_load_reg64(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizd(vd, vm, fpst); + } else { + gen_helper_vfp_tosid(vd, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizd(vd, vm, fpst); + } else { + gen_helper_vfp_touid(vd, vm, fpst); + } + } + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_i64(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VINS(DisasContext *s, arg_VINS *a) +{ + TCGv_i32 rd, rm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Insert low half of Vm into high half of Vd */ + rm = tcg_temp_new_i32(); + rd = tcg_temp_new_i32(); + vfp_load_reg32(rm, a->vm); + vfp_load_reg32(rd, a->vd); + tcg_gen_deposit_i32(rd, rd, rm, 16, 16); + vfp_store_reg32(rd, a->vd); + tcg_temp_free_i32(rm); + tcg_temp_free_i32(rd); + return true; +} + +static bool trans_VMOVX(DisasContext *s, arg_VINS *a) +{ + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Set Vd to high half of Vm */ + rm = tcg_temp_new_i32(); + vfp_load_reg32(rm, a->vm); + tcg_gen_shri_i32(rm, rm, 16); + vfp_store_reg32(rm, a->vd); + tcg_temp_free_i32(rm); + return true; +} diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c new file mode 100644 index 0000000..c23a346 --- /dev/null +++ b/target/arm/tcg/translate.c @@ -0,0 +1,9990 @@ +/* + * ARM translation + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2005-2007 CodeSourcery + * Copyright (c) 2007 OpenedHand, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "internals.h" +#include "disas/disas.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "qemu/log.h" +#include "qemu/bitops.h" +#include "arm_ldst.h" +#include "semihosting/semihost.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "cpregs.h" + + +#define ENABLE_ARCH_4T arm_dc_feature(s, ARM_FEATURE_V4T) +#define ENABLE_ARCH_5 arm_dc_feature(s, ARM_FEATURE_V5) +/* currently all emulated v5 cores are also v5TE, so don't bother */ +#define ENABLE_ARCH_5TE arm_dc_feature(s, ARM_FEATURE_V5) +#define ENABLE_ARCH_5J dc_isar_feature(aa32_jazelle, s) +#define ENABLE_ARCH_6 arm_dc_feature(s, ARM_FEATURE_V6) +#define ENABLE_ARCH_6K arm_dc_feature(s, ARM_FEATURE_V6K) +#define ENABLE_ARCH_6T2 arm_dc_feature(s, ARM_FEATURE_THUMB2) +#define ENABLE_ARCH_7 arm_dc_feature(s, ARM_FEATURE_V7) +#define ENABLE_ARCH_8 arm_dc_feature(s, ARM_FEATURE_V8) + +#include "translate.h" +#include "translate-a32.h" + +/* These are TCG temporaries used only by the legacy iwMMXt decoder */ +static TCGv_i64 cpu_V0, cpu_V1, cpu_M0; +/* These are TCG globals which alias CPUARMState fields */ +static TCGv_i32 cpu_R[16]; +TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF; +TCGv_i64 cpu_exclusive_addr; +TCGv_i64 cpu_exclusive_val; + +#include "exec/gen-icount.h" + +static const char * const regnames[] = + { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" }; + + +/* initialize TCG globals. */ +void arm_translate_init(void) +{ + int i; + + for (i = 0; i < 16; i++) { + cpu_R[i] = tcg_global_mem_new_i32(cpu_env, + offsetof(CPUARMState, regs[i]), + regnames[i]); + } + cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF"); + cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF"); + cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF"); + cpu_ZF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, ZF), "ZF"); + + cpu_exclusive_addr = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, exclusive_addr), "exclusive_addr"); + cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, exclusive_val), "exclusive_val"); + + a64_translate_init(); +} + +uint64_t asimd_imm_const(uint32_t imm, int cmode, int op) +{ + /* Expand the encoded constant as per AdvSIMDExpandImm pseudocode */ + switch (cmode) { + case 0: case 1: + /* no-op */ + break; + case 2: case 3: + imm <<= 8; + break; + case 4: case 5: + imm <<= 16; + break; + case 6: case 7: + imm <<= 24; + break; + case 8: case 9: + imm |= imm << 16; + break; + case 10: case 11: + imm = (imm << 8) | (imm << 24); + break; + case 12: + imm = (imm << 8) | 0xff; + break; + case 13: + imm = (imm << 16) | 0xffff; + break; + case 14: + if (op) { + /* + * This and cmode == 15 op == 1 are the only cases where + * the top and bottom 32 bits of the encoded constant differ. 
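+             * Each bit n of imm selects whether byte n of the result is
+             * 0xff or 0x00; e.g. imm == 0x81 expands to 0xff000000000000ff.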
+ */ + uint64_t imm64 = 0; + int n; + + for (n = 0; n < 8; n++) { + if (imm & (1 << n)) { + imm64 |= (0xffULL << (n * 8)); + } + } + return imm64; + } + imm |= (imm << 8) | (imm << 16) | (imm << 24); + break; + case 15: + if (op) { + /* Reserved encoding for AArch32; valid for AArch64 */ + uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48; + if (imm & 0x80) { + imm64 |= 0x8000000000000000ULL; + } + if (imm & 0x40) { + imm64 |= 0x3fc0000000000000ULL; + } else { + imm64 |= 0x4000000000000000ULL; + } + return imm64; + } + imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19) + | ((imm & 0x40) ? (0x1f << 25) : (1 << 30)); + break; + } + if (op) { + imm = ~imm; + } + return dup_const(MO_32, imm); +} + +/* Generate a label used for skipping this instruction */ +void arm_gen_condlabel(DisasContext *s) +{ + if (!s->condjmp) { + s->condlabel = gen_disas_label(s); + s->condjmp = 1; + } +} + +/* Flags for the disas_set_da_iss info argument: + * lower bits hold the Rt register number, higher bits are flags. + */ +typedef enum ISSInfo { + ISSNone = 0, + ISSRegMask = 0x1f, + ISSInvalid = (1 << 5), + ISSIsAcqRel = (1 << 6), + ISSIsWrite = (1 << 7), + ISSIs16Bit = (1 << 8), +} ISSInfo; + +/* + * Store var into env + offset to a member with size bytes. + * Free var after use. + */ +void store_cpu_offset(TCGv_i32 var, int offset, int size) +{ + switch (size) { + case 1: + tcg_gen_st8_i32(var, cpu_env, offset); + break; + case 4: + tcg_gen_st_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(var); +} + +/* Save the syndrome information for a Data Abort */ +static void disas_set_da_iss(DisasContext *s, MemOp memop, ISSInfo issinfo) +{ + uint32_t syn; + int sas = memop & MO_SIZE; + bool sse = memop & MO_SIGN; + bool is_acqrel = issinfo & ISSIsAcqRel; + bool is_write = issinfo & ISSIsWrite; + bool is_16bit = issinfo & ISSIs16Bit; + int srt = issinfo & ISSRegMask; + + if (issinfo & ISSInvalid) { + /* Some callsites want to conditionally provide ISS info, + * eg "only if this was not a writeback" + */ + return; + } + + if (srt == 15) { + /* For AArch32, insns where the src/dest is R15 never generate + * ISS information. Catching that here saves checking at all + * the call sites. + */ + return; + } + + syn = syn_data_abort_with_iss(0, sas, sse, srt, 0, is_acqrel, + 0, 0, 0, is_write, 0, is_16bit); + disas_set_insn_syndrome(s, syn); +} + +static inline int get_a32_user_mem_index(DisasContext *s) +{ + /* Return the core mmu_idx to use for A32/T32 "unprivileged load/store" + * insns: + * if PL2, UNPREDICTABLE (we choose to implement as if PL0) + * otherwise, access as if at PL0. + */ + switch (s->mmu_idx) { + case ARMMMUIdx_E3: + case ARMMMUIdx_E2: /* this one is UNPREDICTABLE */ + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + return arm_to_core_mmu_idx(ARMMMUIdx_E10_0); + case ARMMMUIdx_MUser: + case ARMMMUIdx_MPriv: + return arm_to_core_mmu_idx(ARMMMUIdx_MUser); + case ARMMMUIdx_MUserNegPri: + case ARMMMUIdx_MPrivNegPri: + return arm_to_core_mmu_idx(ARMMMUIdx_MUserNegPri); + case ARMMMUIdx_MSUser: + case ARMMMUIdx_MSPriv: + return arm_to_core_mmu_idx(ARMMMUIdx_MSUser); + case ARMMMUIdx_MSUserNegPri: + case ARMMMUIdx_MSPrivNegPri: + return arm_to_core_mmu_idx(ARMMMUIdx_MSUserNegPri); + default: + g_assert_not_reached(); + } +} + +/* The pc_curr difference for an architectural jump. */ +static target_long jmp_diff(DisasContext *s, target_long diff) +{ + return diff + (s->thumb ? 
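+                   /* PC reads as the insn address + 4 in Thumb, + 8 in ARM */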
4 : 8); +} + +static void gen_pc_plus_diff(DisasContext *s, TCGv_i32 var, target_long diff) +{ + assert(s->pc_save != -1); + if (TARGET_TB_PCREL) { + tcg_gen_addi_i32(var, cpu_R[15], (s->pc_curr - s->pc_save) + diff); + } else { + tcg_gen_movi_i32(var, s->pc_curr + diff); + } +} + +/* Set a variable to the value of a CPU register. */ +void load_reg_var(DisasContext *s, TCGv_i32 var, int reg) +{ + if (reg == 15) { + gen_pc_plus_diff(s, var, jmp_diff(s, 0)); + } else { + tcg_gen_mov_i32(var, cpu_R[reg]); + } +} + +/* + * Create a new temp, REG + OFS, except PC is ALIGN(PC, 4). + * This is used for load/store for which use of PC implies (literal), + * or ADD that implies ADR. + */ +TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + + if (reg == 15) { + /* + * This address is computed from an aligned PC: + * subtract off the low bits. + */ + gen_pc_plus_diff(s, tmp, jmp_diff(s, ofs - (s->pc_curr & 3))); + } else { + tcg_gen_addi_i32(tmp, cpu_R[reg], ofs); + } + return tmp; +} + +/* Set a CPU register. The source must be a temporary and will be + marked as dead. */ +void store_reg(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15) { + /* In Thumb mode, we must ignore bit 0. + * In ARM mode, for ARMv4 and ARMv5, it is UNPREDICTABLE if bits [1:0] + * are not 0b00, but for ARMv6 and above, we must ignore bits [1:0]. + * We choose to ignore [1:0] in ARM mode for all architecture versions. + */ + tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3); + s->base.is_jmp = DISAS_JUMP; + s->pc_save = -1; + } else if (reg == 13 && arm_dc_feature(s, ARM_FEATURE_M)) { + /* For M-profile SP bits [1:0] are always zero */ + tcg_gen_andi_i32(var, var, ~3); + } + tcg_gen_mov_i32(cpu_R[reg], var); + tcg_temp_free_i32(var); +} + +/* + * Variant of store_reg which applies v8M stack-limit checks before updating + * SP. If the check fails this will result in an exception being taken. + * We disable the stack checks for CONFIG_USER_ONLY because we have + * no idea what the stack limits should be in that case. + * If stack checking is not being done this just acts like store_reg(). + */ +static void store_sp_checked(DisasContext *s, TCGv_i32 var) +{ +#ifndef CONFIG_USER_ONLY + if (s->v8m_stackcheck) { + gen_helper_v8m_stackcheck(cpu_env, var); + } +#endif + store_reg(s, 13, var); +} + +/* Value extensions. */ +#define gen_uxtb(var) tcg_gen_ext8u_i32(var, var) +#define gen_uxth(var) tcg_gen_ext16u_i32(var, var) +#define gen_sxtb(var) tcg_gen_ext8s_i32(var, var) +#define gen_sxth(var) tcg_gen_ext16s_i32(var, var) + +#define gen_sxtb16(var) gen_helper_sxtb16(var, var) +#define gen_uxtb16(var) gen_helper_uxtb16(var, var) + +void gen_set_cpsr(TCGv_i32 var, uint32_t mask) +{ + gen_helper_cpsr_write(cpu_env, var, tcg_constant_i32(mask)); +} + +static void gen_rebuild_hflags(DisasContext *s, bool new_el) +{ + bool m_profile = arm_dc_feature(s, ARM_FEATURE_M); + + if (new_el) { + if (m_profile) { + gen_helper_rebuild_hflags_m32_newel(cpu_env); + } else { + gen_helper_rebuild_hflags_a32_newel(cpu_env); + } + } else { + TCGv_i32 tcg_el = tcg_constant_i32(s->current_el); + if (m_profile) { + gen_helper_rebuild_hflags_m32(cpu_env, tcg_el); + } else { + gen_helper_rebuild_hflags_a32(cpu_env, tcg_el); + } + } +} + +static void gen_exception_internal(int excp) +{ + assert(excp_is_internal(excp)); + gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); +} + +static void gen_singlestep_exception(DisasContext *s) +{ + /* We just completed step of an insn. 
Move from Active-not-pending + * to Active-pending, and then also take the swstep exception. + * This corresponds to making the (IMPDEF) choice to prioritize + * swstep exceptions over asynchronous exceptions taken to an exception + * level where debug is disabled. This choice has the advantage that + * we do not need to maintain internal state corresponding to the + * ISV/EX syndrome bits between completion of the step and generation + * of the exception, and our syndrome information is always correct. + */ + gen_ss_advance(s); + gen_swstep_exception(s, 1, s->is_ldex); + s->base.is_jmp = DISAS_NORETURN; +} + +void clear_eci_state(DisasContext *s) +{ + /* + * Clear any ECI/ICI state: used when a load multiple/store + * multiple insn executes. + */ + if (s->eci) { + store_cpu_field_constant(0, condexec_bits); + s->eci = 0; + } +} + +static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 tmp1 = tcg_temp_new_i32(); + TCGv_i32 tmp2 = tcg_temp_new_i32(); + tcg_gen_ext16s_i32(tmp1, a); + tcg_gen_ext16s_i32(tmp2, b); + tcg_gen_mul_i32(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + tcg_gen_sari_i32(a, a, 16); + tcg_gen_sari_i32(b, b, 16); + tcg_gen_mul_i32(b, b, a); + tcg_gen_mov_i32(a, tmp1); + tcg_temp_free_i32(tmp1); +} + +/* Byteswap each halfword. */ +void gen_rev16(TCGv_i32 dest, TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 mask = tcg_constant_i32(0x00ff00ff); + tcg_gen_shri_i32(tmp, var, 8); + tcg_gen_and_i32(tmp, tmp, mask); + tcg_gen_and_i32(var, var, mask); + tcg_gen_shli_i32(var, var, 8); + tcg_gen_or_i32(dest, var, tmp); + tcg_temp_free_i32(tmp); +} + +/* Byteswap low halfword and sign extend. */ +static void gen_revsh(TCGv_i32 dest, TCGv_i32 var) +{ + tcg_gen_bswap16_i32(var, var, TCG_BSWAP_OS); +} + +/* Dual 16-bit add. Result placed in t0 and t1 is marked as dead. + tmp = (t0 ^ t1) & 0x8000; + t0 &= ~0x8000; + t1 &= ~0x8000; + t0 = (t0 + t1) ^ tmp; + */ + +static void gen_add16(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andi_i32(tmp, tmp, 0x8000); + tcg_gen_andi_i32(t0, t0, ~0x8000); + tcg_gen_andi_i32(t1, t1, ~0x8000); + tcg_gen_add_i32(t0, t0, t1); + tcg_gen_xor_i32(dest, t0, tmp); + tcg_temp_free_i32(tmp); +} + +/* Set N and Z flags from var. */ +static inline void gen_logic_CC(TCGv_i32 var) +{ + tcg_gen_mov_i32(cpu_NF, var); + tcg_gen_mov_i32(cpu_ZF, var); +} + +/* dest = T0 + T1 + CF. */ +static void gen_add_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + tcg_gen_add_i32(dest, t0, t1); + tcg_gen_add_i32(dest, dest, cpu_CF); +} + +/* dest = T0 - T1 + CF - 1. */ +static void gen_sub_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + tcg_gen_sub_i32(dest, t0, t1); + tcg_gen_add_i32(dest, dest, cpu_CF); + tcg_gen_subi_i32(dest, dest, 1); +} + +/* dest = T0 + T1. Compute C, N, V and Z flags */ +static void gen_add_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_movi_i32(tmp, 0); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, t1, tmp); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 + T1 + CF. 
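+ * (A 33-bit sum: done with the host's add2 op when available, else via a 64-bit add.)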
Compute C, N, V and Z flags */ +static void gen_adc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + if (TCG_TARGET_HAS_add2_i32) { + tcg_gen_movi_i32(tmp, 0); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp); + tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp); + } else { + TCGv_i64 q0 = tcg_temp_new_i64(); + TCGv_i64 q1 = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(q0, t0); + tcg_gen_extu_i32_i64(q1, t1); + tcg_gen_add_i64(q0, q0, q1); + tcg_gen_extu_i32_i64(q1, cpu_CF); + tcg_gen_add_i64(q0, q0, q1); + tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0); + tcg_temp_free_i64(q0); + tcg_temp_free_i64(q1); + } + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 - T1. Compute C, N, V and Z flags */ +static void gen_sub_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp; + tcg_gen_sub_i32(cpu_NF, t0, t1); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 + ~T1 + CF. Compute C, N, V and Z flags */ +static void gen_sbc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_not_i32(tmp, t1); + gen_adc_CC(dest, t0, tmp); + tcg_temp_free_i32(tmp); +} + +#define GEN_SHIFT(name) \ +static void gen_##name(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) \ +{ \ + TCGv_i32 tmpd = tcg_temp_new_i32(); \ + TCGv_i32 tmp1 = tcg_temp_new_i32(); \ + TCGv_i32 zero = tcg_constant_i32(0); \ + tcg_gen_andi_i32(tmp1, t1, 0x1f); \ + tcg_gen_##name##_i32(tmpd, t0, tmp1); \ + tcg_gen_andi_i32(tmp1, t1, 0xe0); \ + tcg_gen_movcond_i32(TCG_COND_NE, dest, tmp1, zero, zero, tmpd); \ + tcg_temp_free_i32(tmpd); \ + tcg_temp_free_i32(tmp1); \ +} +GEN_SHIFT(shl) +GEN_SHIFT(shr) +#undef GEN_SHIFT + +static void gen_sar(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp1 = tcg_temp_new_i32(); + + tcg_gen_andi_i32(tmp1, t1, 0xff); + tcg_gen_umin_i32(tmp1, tmp1, tcg_constant_i32(31)); + tcg_gen_sar_i32(dest, t0, tmp1); + tcg_temp_free_i32(tmp1); +} + +static void shifter_out_im(TCGv_i32 var, int shift) +{ + tcg_gen_extract_i32(cpu_CF, var, shift, 1); +} + +/* Shift by immediate. Includes special handling for shift == 0. 
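+   (LSR/ASR with shift == 0 encode a shift of 32; ROR with shift == 0 is RRX.)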
*/ +static inline void gen_arm_shift_im(TCGv_i32 var, int shiftop, + int shift, int flags) +{ + switch (shiftop) { + case 0: /* LSL */ + if (shift != 0) { + if (flags) + shifter_out_im(var, 32 - shift); + tcg_gen_shli_i32(var, var, shift); + } + break; + case 1: /* LSR */ + if (shift == 0) { + if (flags) { + tcg_gen_shri_i32(cpu_CF, var, 31); + } + tcg_gen_movi_i32(var, 0); + } else { + if (flags) + shifter_out_im(var, shift - 1); + tcg_gen_shri_i32(var, var, shift); + } + break; + case 2: /* ASR */ + if (shift == 0) + shift = 32; + if (flags) + shifter_out_im(var, shift - 1); + if (shift == 32) + shift = 31; + tcg_gen_sari_i32(var, var, shift); + break; + case 3: /* ROR/RRX */ + if (shift != 0) { + if (flags) + shifter_out_im(var, shift - 1); + tcg_gen_rotri_i32(var, var, shift); break; + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_shli_i32(tmp, cpu_CF, 31); + if (flags) + shifter_out_im(var, 0); + tcg_gen_shri_i32(var, var, 1); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); + } + } +}; + +static inline void gen_arm_shift_reg(TCGv_i32 var, int shiftop, + TCGv_i32 shift, int flags) +{ + if (flags) { + switch (shiftop) { + case 0: gen_helper_shl_cc(var, cpu_env, var, shift); break; + case 1: gen_helper_shr_cc(var, cpu_env, var, shift); break; + case 2: gen_helper_sar_cc(var, cpu_env, var, shift); break; + case 3: gen_helper_ror_cc(var, cpu_env, var, shift); break; + } + } else { + switch (shiftop) { + case 0: + gen_shl(var, var, shift); + break; + case 1: + gen_shr(var, var, shift); + break; + case 2: + gen_sar(var, var, shift); + break; + case 3: tcg_gen_andi_i32(shift, shift, 0x1f); + tcg_gen_rotr_i32(var, var, shift); break; + } + } + tcg_temp_free_i32(shift); +} + +/* + * Generate a conditional based on ARM condition code cc. + * This is common between ARM and Aarch64 targets. + */ +void arm_test_cc(DisasCompare *cmp, int cc) +{ + TCGv_i32 value; + TCGCond cond; + bool global = true; + + switch (cc) { + case 0: /* eq: Z */ + case 1: /* ne: !Z */ + cond = TCG_COND_EQ; + value = cpu_ZF; + break; + + case 2: /* cs: C */ + case 3: /* cc: !C */ + cond = TCG_COND_NE; + value = cpu_CF; + break; + + case 4: /* mi: N */ + case 5: /* pl: !N */ + cond = TCG_COND_LT; + value = cpu_NF; + break; + + case 6: /* vs: V */ + case 7: /* vc: !V */ + cond = TCG_COND_LT; + value = cpu_VF; + break; + + case 8: /* hi: C && !Z */ + case 9: /* ls: !C || Z -> !(C && !Z) */ + cond = TCG_COND_NE; + value = tcg_temp_new_i32(); + global = false; + /* CF is 1 for C, so -CF is an all-bits-set mask for C; + ZF is non-zero for !Z; so AND the two subexpressions. */ + tcg_gen_neg_i32(value, cpu_CF); + tcg_gen_and_i32(value, value, cpu_ZF); + break; + + case 10: /* ge: N == V -> N ^ V == 0 */ + case 11: /* lt: N != V -> N ^ V != 0 */ + /* Since we're only interested in the sign bit, == 0 is >= 0. */ + cond = TCG_COND_GE; + value = tcg_temp_new_i32(); + global = false; + tcg_gen_xor_i32(value, cpu_VF, cpu_NF); + break; + + case 12: /* gt: !Z && N == V */ + case 13: /* le: Z || N != V */ + cond = TCG_COND_NE; + value = tcg_temp_new_i32(); + global = false; + /* (N == V) is equal to the sign bit of ~(NF ^ VF). Propagate + * the sign bit then AND with ZF to yield the result. */ + tcg_gen_xor_i32(value, cpu_VF, cpu_NF); + tcg_gen_sari_i32(value, value, 31); + tcg_gen_andc_i32(value, cpu_ZF, value); + break; + + case 14: /* always */ + case 15: /* always */ + /* Use the ALWAYS condition, which will fold early. + * It doesn't matter what we use for the value. 
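+     * (cpu_ZF is used purely as a convenient placeholder operand.)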
*/ + cond = TCG_COND_ALWAYS; + value = cpu_ZF; + goto no_invert; + + default: + fprintf(stderr, "Bad condition code 0x%x\n", cc); + abort(); + } + + if (cc & 1) { + cond = tcg_invert_cond(cond); + } + + no_invert: + cmp->cond = cond; + cmp->value = value; + cmp->value_global = global; +} + +void arm_free_cc(DisasCompare *cmp) +{ + if (!cmp->value_global) { + tcg_temp_free_i32(cmp->value); + } +} + +void arm_jump_cc(DisasCompare *cmp, TCGLabel *label) +{ + tcg_gen_brcondi_i32(cmp->cond, cmp->value, 0, label); +} + +void arm_gen_test_cc(int cc, TCGLabel *label) +{ + DisasCompare cmp; + arm_test_cc(&cmp, cc); + arm_jump_cc(&cmp, label); + arm_free_cc(&cmp); +} + +void gen_set_condexec(DisasContext *s) +{ + if (s->condexec_mask) { + uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1); + + store_cpu_field_constant(val, condexec_bits); + } +} + +void gen_update_pc(DisasContext *s, target_long diff) +{ + gen_pc_plus_diff(s, cpu_R[15], diff); + s->pc_save = s->pc_curr + diff; +} + +/* Set PC and Thumb state from var. var is marked as dead. */ +static inline void gen_bx(DisasContext *s, TCGv_i32 var) +{ + s->base.is_jmp = DISAS_JUMP; + tcg_gen_andi_i32(cpu_R[15], var, ~1); + tcg_gen_andi_i32(var, var, 1); + store_cpu_field(var, thumb); + s->pc_save = -1; +} + +/* + * Set PC and Thumb state from var. var is marked as dead. + * For M-profile CPUs, include logic to detect exception-return + * branches and handle them. This is needed for Thumb POP/LDM to PC, LDR to PC, + * and BX reg, and no others, and happens only for code in Handler mode. + * The Security Extension also requires us to check for the FNC_RETURN + * which signals a function return from non-secure state; this can happen + * in both Handler and Thread mode. + * To avoid having to do multiple comparisons in inline generated code, + * we make the check we do here loose, so it will match for EXC_RETURN + * in Thread mode. For system emulation do_v7m_exception_exit() checks + * for these spurious cases and returns without doing anything (giving + * the same behaviour as for a branch to a non-magic address). + * + * In linux-user mode it is unclear what the right behaviour for an + * attempted FNC_RETURN should be, because in real hardware this will go + * directly to Secure code (ie not the Linux kernel) which will then treat + * the error in any way it chooses. For QEMU we opt to make the FNC_RETURN + * attempt behave the way it would on a CPU without the security extension, + * which is to say "like a normal branch". That means we can simply treat + * all branches as normal with no magic address behaviour. + */ +static inline void gen_bx_excret(DisasContext *s, TCGv_i32 var) +{ + /* Generate the same code here as for a simple bx, but flag via + * s->base.is_jmp that we need to do the rest of the work later. + */ + gen_bx(s, var); +#ifndef CONFIG_USER_ONLY + if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY) || + (s->v7m_handler_mode && arm_dc_feature(s, ARM_FEATURE_M))) { + s->base.is_jmp = DISAS_BX_EXCRET; + } +#endif +} + +static inline void gen_bx_excret_final_code(DisasContext *s) +{ + /* Generate the code to finish possible exception return and end the TB */ + DisasLabel excret_label = gen_disas_label(s); + uint32_t min_magic; + + if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY)) { + /* Covers FNC_RETURN and EXC_RETURN magic */ + min_magic = FNC_RETURN_MIN_MAGIC; + } else { + /* EXC_RETURN magic only */ + min_magic = EXC_RETURN_MIN_MAGIC; + } + + /* Is the new PC value in the magic range indicating exception return? 
*/ + tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label.label); + /* No: end the TB as we would for a DISAS_JMP */ + if (s->ss_active) { + gen_singlestep_exception(s); + } else { + tcg_gen_exit_tb(NULL, 0); + } + set_disas_label(s, excret_label); + /* Yes: this is an exception return. + * At this point in runtime env->regs[15] and env->thumb will hold + * the exception-return magic number, which do_v7m_exception_exit() + * will read. Nothing else will be able to see those values because + * the cpu-exec main loop guarantees that we will always go straight + * from raising the exception to the exception-handling code. + * + * gen_ss_advance(s) does nothing on M profile currently but + * calling it is conceptually the right thing as we have executed + * this instruction (compare SWI, HVC, SMC handling). + */ + gen_ss_advance(s); + gen_exception_internal(EXCP_EXCEPTION_EXIT); +} + +static inline void gen_bxns(DisasContext *s, int rm) +{ + TCGv_i32 var = load_reg(s, rm); + + /* The bxns helper may raise an EXCEPTION_EXIT exception, so in theory + * we need to sync state before calling it, but: + * - we don't need to do gen_update_pc() because the bxns helper will + * always set the PC itself + * - we don't need to do gen_set_condexec() because BXNS is UNPREDICTABLE + * unless it's outside an IT block or the last insn in an IT block, + * so we know that condexec == 0 (already set at the top of the TB) + * is correct in the non-UNPREDICTABLE cases, and we can choose + * "zeroes the IT bits" as our UNPREDICTABLE behaviour otherwise. + */ + gen_helper_v7m_bxns(cpu_env, var); + tcg_temp_free_i32(var); + s->base.is_jmp = DISAS_EXIT; +} + +static inline void gen_blxns(DisasContext *s, int rm) +{ + TCGv_i32 var = load_reg(s, rm); + + /* We don't need to sync condexec state, for the same reason as bxns. + * We do however need to set the PC, because the blxns helper reads it. + * The blxns helper may throw an exception. + */ + gen_update_pc(s, curr_insn_len(s)); + gen_helper_v7m_blxns(cpu_env, var); + tcg_temp_free_i32(var); + s->base.is_jmp = DISAS_EXIT; +} + +/* Variant of store_reg which uses branch&exchange logic when storing + to r15 in ARM architecture v7 and above. The source must be a temporary + and will be marked as dead. */ +static inline void store_reg_bx(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15 && ENABLE_ARCH_7) { + gen_bx(s, var); + } else { + store_reg(s, reg, var); + } +} + +/* Variant of store_reg which uses branch&exchange logic when storing + * to r15 in ARM architecture v5T and above. This is used for storing + * the results of a LDR/LDM/POP into r15, and corresponds to the cases + * in the ARM ARM which use the LoadWritePC() pseudocode function. */ +static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15 && ENABLE_ARCH_5) { + gen_bx_excret(s, var); + } else { + store_reg(s, reg, var); + } +} + +#ifdef CONFIG_USER_ONLY +#define IS_USER_ONLY 1 +#else +#define IS_USER_ONLY 0 +#endif + +MemOp pow2_align(unsigned i) +{ + static const MemOp mop_align[] = { + 0, MO_ALIGN_2, MO_ALIGN_4, MO_ALIGN_8, MO_ALIGN_16, + /* + * FIXME: TARGET_PAGE_BITS_MIN affects TLB_FLAGS_MASK such + * that 256-bit alignment (MO_ALIGN_32) cannot be supported: + * see get_alignment_bits(). Enforce only 128-bit alignment for now. 
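+         * Hence the i == 5 (256-bit) entry below maps to MO_ALIGN_16.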
+ */ + MO_ALIGN_16 + }; + g_assert(i < ARRAY_SIZE(mop_align)); + return mop_align[i]; +} + +/* + * Abstractions of "generate code to do a guest load/store for + * AArch32", where a vaddr is always 32 bits (and is zero + * extended if we're a 64 bit core) and data is also + * 32 bits unless specifically doing a 64 bit access. + * These functions work like tcg_gen_qemu_{ld,st}* except + * that the address argument is TCGv_i32 rather than TCGv. + */ + +static TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, MemOp op) +{ + TCGv addr = tcg_temp_new(); + tcg_gen_extu_i32_tl(addr, a32); + + /* Not needed for user-mode BE32, where we use MO_BE instead. */ + if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) { + tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE))); + } + return addr; +} + +/* + * Internal routines are used for NEON cases where the endianness + * and/or alignment has already been taken into account and manipulated. + */ +void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_ld_i32(val, addr, index, opc); + tcg_temp_free(addr); +} + +void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_st_i32(val, addr, index, opc); + tcg_temp_free(addr); +} + +void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + + tcg_gen_qemu_ld_i64(val, addr, index, opc); + + /* Not needed for user-mode BE32, where we use MO_BE instead. */ + if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { + tcg_gen_rotri_i64(val, val, 32); + } + tcg_temp_free(addr); +} + +void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + + /* Not needed for user-mode BE32, where we use MO_BE instead. 
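+     * In BE32 the two words of a doubleword are stored swapped, so
+     * rotate the value by 32 before the store, mirroring the load path.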
*/ + if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { + TCGv_i64 tmp = tcg_temp_new_i64(); + tcg_gen_rotri_i64(tmp, val, 32); + tcg_gen_qemu_st_i64(tmp, addr, index, opc); + tcg_temp_free_i64(tmp); + } else { + tcg_gen_qemu_st_i64(val, addr, index, opc); + } + tcg_temp_free(addr); +} + +void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_ld_internal_i32(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_st_internal_i32(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_ld_internal_i64(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_st_internal_i64(s, val, a32, index, finalize_memop(s, opc)); +} + +#define DO_GEN_LD(SUFF, OPC) \ + static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \ + TCGv_i32 a32, int index) \ + { \ + gen_aa32_ld_i32(s, val, a32, index, OPC); \ + } + +#define DO_GEN_ST(SUFF, OPC) \ + static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \ + TCGv_i32 a32, int index) \ + { \ + gen_aa32_st_i32(s, val, a32, index, OPC); \ + } + +static inline void gen_hvc(DisasContext *s, int imm16) +{ + /* The pre HVC helper handles cases when HVC gets trapped + * as an undefined insn by runtime configuration (ie before + * the insn really executes). + */ + gen_update_pc(s, 0); + gen_helper_pre_hvc(cpu_env); + /* Otherwise we will treat this as a real exception which + * happens after execution of the insn. (The distinction matters + * for the PC value reported to the exception handler and also + * for single stepping.) + */ + s->svc_imm = imm16; + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_HVC; +} + +static inline void gen_smc(DisasContext *s) +{ + /* As with HVC, we may take an exception either before or after + * the insn executes. 
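+     * The pre-SMC helper raises the traps that architecturally happen
+     * before the insn executes (e.g. SMC disabled by configuration).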
+ */ + gen_update_pc(s, 0); + gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa32_smc())); + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_SMC; +} + +static void gen_exception_internal_insn(DisasContext *s, int excp) +{ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_exception_internal(excp); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_el_v(int excp, uint32_t syndrome, TCGv_i32 tcg_el) +{ + gen_helper_exception_with_syndrome_el(cpu_env, tcg_constant_i32(excp), + tcg_constant_i32(syndrome), tcg_el); +} + +static void gen_exception_el(int excp, uint32_t syndrome, uint32_t target_el) +{ + gen_exception_el_v(excp, syndrome, tcg_constant_i32(target_el)); +} + +static void gen_exception(int excp, uint32_t syndrome) +{ + gen_helper_exception_with_syndrome(cpu_env, tcg_constant_i32(excp), + tcg_constant_i32(syndrome)); +} + +static void gen_exception_insn_el_v(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn, TCGv_i32 tcg_el) +{ + if (s->aarch64) { + gen_a64_update_pc(s, pc_diff); + } else { + gen_set_condexec(s); + gen_update_pc(s, pc_diff); + } + gen_exception_el_v(excp, syn, tcg_el); + s->base.is_jmp = DISAS_NORETURN; +} + +void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, + uint32_t syn, uint32_t target_el) +{ + gen_exception_insn_el_v(s, pc_diff, excp, syn, + tcg_constant_i32(target_el)); +} + +void gen_exception_insn(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn) +{ + if (s->aarch64) { + gen_a64_update_pc(s, pc_diff); + } else { + gen_set_condexec(s); + gen_update_pc(s, pc_diff); + } + gen_exception(excp, syn); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn) +{ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syn)); + s->base.is_jmp = DISAS_NORETURN; +} + +void unallocated_encoding(DisasContext *s) +{ + /* Unallocated and reserved encodings are uncategorized */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); +} + +/* Force a TB lookup after an instruction that changes the CPU state. */ +void gen_lookup_tb(DisasContext *s) +{ + gen_pc_plus_diff(s, cpu_R[15], curr_insn_len(s)); + s->base.is_jmp = DISAS_EXIT; +} + +static inline void gen_hlt(DisasContext *s, int imm) +{ + /* HLT. This has two purposes. + * Architecturally, it is an external halting debug instruction. + * Since QEMU doesn't implement external debug, we treat this as + * it is required for halting debug disabled: it will UNDEF. + * Secondly, "HLT 0x3C" is a T32 semihosting trap instruction, + * and "HLT 0xF000" is an A32 semihosting syscall. These traps + * must trigger semihosting even for ARMv7 and earlier, where + * HLT was an undefined encoding. + * In system mode, we don't allow userspace access to + * semihosting, to provide some semblance of security + * (and for consistency with our 32-bit semihosting). + */ + if (semihosting_enabled(s->current_el == 0) && + (imm == (s->thumb ? 0x3c : 0xf000))) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + return; + } + + unallocated_encoding(s); +} + +/* + * Return the offset of a "full" NEON Dreg. + */ +long neon_full_reg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); +} + +/* + * Return the offset of a 2**SIZE piece of a NEON register, at index ELE, + * where 0 is the least significant end of the register. 
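+ * For example, ELE = 2 with MO_16 starts 4 bytes into the register
+ * (before the host-big-endian XOR correction below).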
+ */ +long neon_element_offset(int reg, int element, MemOp memop) +{ + int element_size = 1 << (memop & MO_SIZE); + int ofs = element * element_size; +#if HOST_BIG_ENDIAN + /* + * Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + */ + if (element_size < 8) { + ofs ^= 8 - element_size; + } +#endif + return neon_full_reg_offset(reg) + ofs; +} + +/* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */ +long vfp_reg_offset(bool dp, unsigned reg) +{ + if (dp) { + return neon_element_offset(reg, 0, MO_64); + } else { + return neon_element_offset(reg >> 1, reg & 1, MO_32); + } +} + +void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_SB: + tcg_gen_ld8s_i32(dest, cpu_env, off); + break; + case MO_UB: + tcg_gen_ld8u_i32(dest, cpu_env, off); + break; + case MO_SW: + tcg_gen_ld16s_i32(dest, cpu_env, off); + break; + case MO_UW: + tcg_gen_ld16u_i32(dest, cpu_env, off); + break; + case MO_UL: + case MO_SL: + tcg_gen_ld_i32(dest, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_SL: + tcg_gen_ld32s_i64(dest, cpu_env, off); + break; + case MO_UL: + tcg_gen_ld32u_i64(dest, cpu_env, off); + break; + case MO_UQ: + tcg_gen_ld_i64(dest, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_8: + tcg_gen_st8_i32(src, cpu_env, off); + break; + case MO_16: + tcg_gen_st16_i32(src, cpu_env, off); + break; + case MO_32: + tcg_gen_st_i32(src, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_32: + tcg_gen_st32_i64(src, cpu_env, off); + break; + case MO_64: + tcg_gen_st_i64(src, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +#define ARM_CP_RW_BIT (1 << 20) + +static inline void iwmmxt_load_reg(TCGv_i64 var, int reg) +{ + tcg_gen_ld_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); +} + +static inline void iwmmxt_store_reg(TCGv_i64 var, int reg) +{ + tcg_gen_st_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); +} + +static inline TCGv_i32 iwmmxt_load_creg(int reg) +{ + TCGv_i32 var = tcg_temp_new_i32(); + tcg_gen_ld_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); + return var; +} + +static inline void iwmmxt_store_creg(int reg, TCGv_i32 var) +{ + tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); + tcg_temp_free_i32(var); +} + +static inline void gen_op_iwmmxt_movq_wRn_M0(int rn) +{ + iwmmxt_store_reg(cpu_M0, rn); +} + +static inline void gen_op_iwmmxt_movq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_M0, rn); +} + +static inline void gen_op_iwmmxt_orq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_or_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline void gen_op_iwmmxt_andq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_and_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline void gen_op_iwmmxt_xorq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_xor_i64(cpu_M0, cpu_M0, cpu_V1); +} + +#define IWMMXT_OP(name) \ +static 
inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ +{ \ + iwmmxt_load_reg(cpu_V1, rn); \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_M0, cpu_V1); \ +} + +#define IWMMXT_OP_ENV(name) \ +static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ +{ \ + iwmmxt_load_reg(cpu_V1, rn); \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0, cpu_V1); \ +} + +#define IWMMXT_OP_ENV_SIZE(name) \ +IWMMXT_OP_ENV(name##b) \ +IWMMXT_OP_ENV(name##w) \ +IWMMXT_OP_ENV(name##l) + +#define IWMMXT_OP_ENV1(name) \ +static inline void gen_op_iwmmxt_##name##_M0(void) \ +{ \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0); \ +} + +IWMMXT_OP(maddsq) +IWMMXT_OP(madduq) +IWMMXT_OP(sadb) +IWMMXT_OP(sadw) +IWMMXT_OP(mulslw) +IWMMXT_OP(mulshw) +IWMMXT_OP(mululw) +IWMMXT_OP(muluhw) +IWMMXT_OP(macsw) +IWMMXT_OP(macuw) + +IWMMXT_OP_ENV_SIZE(unpackl) +IWMMXT_OP_ENV_SIZE(unpackh) + +IWMMXT_OP_ENV1(unpacklub) +IWMMXT_OP_ENV1(unpackluw) +IWMMXT_OP_ENV1(unpacklul) +IWMMXT_OP_ENV1(unpackhub) +IWMMXT_OP_ENV1(unpackhuw) +IWMMXT_OP_ENV1(unpackhul) +IWMMXT_OP_ENV1(unpacklsb) +IWMMXT_OP_ENV1(unpacklsw) +IWMMXT_OP_ENV1(unpacklsl) +IWMMXT_OP_ENV1(unpackhsb) +IWMMXT_OP_ENV1(unpackhsw) +IWMMXT_OP_ENV1(unpackhsl) + +IWMMXT_OP_ENV_SIZE(cmpeq) +IWMMXT_OP_ENV_SIZE(cmpgtu) +IWMMXT_OP_ENV_SIZE(cmpgts) + +IWMMXT_OP_ENV_SIZE(mins) +IWMMXT_OP_ENV_SIZE(minu) +IWMMXT_OP_ENV_SIZE(maxs) +IWMMXT_OP_ENV_SIZE(maxu) + +IWMMXT_OP_ENV_SIZE(subn) +IWMMXT_OP_ENV_SIZE(addn) +IWMMXT_OP_ENV_SIZE(subu) +IWMMXT_OP_ENV_SIZE(addu) +IWMMXT_OP_ENV_SIZE(subs) +IWMMXT_OP_ENV_SIZE(adds) + +IWMMXT_OP_ENV(avgb0) +IWMMXT_OP_ENV(avgb1) +IWMMXT_OP_ENV(avgw0) +IWMMXT_OP_ENV(avgw1) + +IWMMXT_OP_ENV(packuw) +IWMMXT_OP_ENV(packul) +IWMMXT_OP_ENV(packuq) +IWMMXT_OP_ENV(packsw) +IWMMXT_OP_ENV(packsl) +IWMMXT_OP_ENV(packsq) + +static void gen_op_iwmmxt_set_mup(void) +{ + TCGv_i32 tmp; + tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); + tcg_gen_ori_i32(tmp, tmp, 2); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); +} + +static void gen_op_iwmmxt_set_cup(void) +{ + TCGv_i32 tmp; + tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); + tcg_gen_ori_i32(tmp, tmp, 1); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); +} + +static void gen_op_iwmmxt_setpsr_nz(void) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_iwmmxt_setpsr_nz(tmp, cpu_M0); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCASF]); +} + +static inline void gen_op_iwmmxt_addl_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_ext32u_i64(cpu_V1, cpu_V1); + tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn, + TCGv_i32 dest) +{ + int rd; + uint32_t offset; + TCGv_i32 tmp; + + rd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + + offset = (insn & 0xff) << ((insn >> 7) & 2); + if (insn & (1 << 24)) { + /* Pre indexed */ + if (insn & (1 << 23)) + tcg_gen_addi_i32(tmp, tmp, offset); + else + tcg_gen_addi_i32(tmp, tmp, -offset); + tcg_gen_mov_i32(dest, tmp); + if (insn & (1 << 21)) + store_reg(s, rd, tmp); + else + tcg_temp_free_i32(tmp); + } else if (insn & (1 << 21)) { + /* Post indexed */ + tcg_gen_mov_i32(dest, tmp); + if (insn & (1 << 23)) + tcg_gen_addi_i32(tmp, tmp, offset); + else + tcg_gen_addi_i32(tmp, tmp, -offset); + store_reg(s, rd, tmp); + } else if (!(insn & (1 << 23))) + return 1; + return 0; +} + +static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest) +{ + int rd = (insn >> 0) & 0xf; + TCGv_i32 tmp; + + if (insn & (1 << 8)) { + if (rd < ARM_IWMMXT_wCGR0 || rd > ARM_IWMMXT_wCGR3) { + return 1; + } 
else { + tmp = iwmmxt_load_creg(rd); + } + } else { + tmp = tcg_temp_new_i32(); + iwmmxt_load_reg(cpu_V0, rd); + tcg_gen_extrl_i64_i32(tmp, cpu_V0); + } + tcg_gen_andi_i32(tmp, tmp, mask); + tcg_gen_mov_i32(dest, tmp); + tcg_temp_free_i32(tmp); + return 0; +} + +/* Disassemble an iwMMXt instruction. Returns nonzero if an error occurred + (ie. an undefined instruction). */ +static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn) +{ + int rd, wrd; + int rdhi, rdlo, rd0, rd1, i; + TCGv_i32 addr; + TCGv_i32 tmp, tmp2, tmp3; + + if ((insn & 0x0e000e00) == 0x0c000000) { + if ((insn & 0x0fe00ff0) == 0x0c400000) { + wrd = insn & 0xf; + rdlo = (insn >> 12) & 0xf; + rdhi = (insn >> 16) & 0xf; + if (insn & ARM_CP_RW_BIT) { /* TMRRC */ + iwmmxt_load_reg(cpu_V0, wrd); + tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); + tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); + } else { /* TMCRR */ + tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); + iwmmxt_store_reg(cpu_V0, wrd); + gen_op_iwmmxt_set_mup(); + } + return 0; + } + + wrd = (insn >> 12) & 0xf; + addr = tcg_temp_new_i32(); + if (gen_iwmmxt_address(s, insn, addr)) { + tcg_temp_free_i32(addr); + return 1; + } + if (insn & ARM_CP_RW_BIT) { + if ((insn >> 28) == 0xf) { /* WLDRW wCx */ + tmp = tcg_temp_new_i32(); + gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); + iwmmxt_store_creg(wrd, tmp); + } else { + i = 1; + if (insn & (1 << 8)) { + if (insn & (1 << 22)) { /* WLDRD */ + gen_aa32_ld64(s, cpu_M0, addr, get_mem_index(s)); + i = 0; + } else { /* WLDRW wRd */ + tmp = tcg_temp_new_i32(); + gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); + } + } else { + tmp = tcg_temp_new_i32(); + if (insn & (1 << 22)) { /* WLDRH */ + gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); + } else { /* WLDRB */ + gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); + } + } + if (i) { + tcg_gen_extu_i32_i64(cpu_M0, tmp); + tcg_temp_free_i32(tmp); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + } + } else { + if ((insn >> 28) == 0xf) { /* WSTRW wCx */ + tmp = iwmmxt_load_creg(wrd); + gen_aa32_st32(s, tmp, addr, get_mem_index(s)); + } else { + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = tcg_temp_new_i32(); + if (insn & (1 << 8)) { + if (insn & (1 << 22)) { /* WSTRD */ + gen_aa32_st64(s, cpu_M0, addr, get_mem_index(s)); + } else { /* WSTRW wRd */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st32(s, tmp, addr, get_mem_index(s)); + } + } else { + if (insn & (1 << 22)) { /* WSTRH */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st16(s, tmp, addr, get_mem_index(s)); + } else { /* WSTRB */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st8(s, tmp, addr, get_mem_index(s)); + } + } + } + tcg_temp_free_i32(tmp); + } + tcg_temp_free_i32(addr); + return 0; + } + + if ((insn & 0x0f000000) != 0x0e000000) + return 1; + + switch (((insn >> 12) & 0xf00) | ((insn >> 4) & 0xff)) { + case 0x000: /* WOR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_orq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x011: /* TMCR */ + if (insn & 0xf) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + switch (wrd) { + case ARM_IWMMXT_wCID: + case ARM_IWMMXT_wCASF: + break; + case ARM_IWMMXT_wCon: + gen_op_iwmmxt_set_cup(); + /* Fall through. 
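+             * (Like wCSSF, wCon is write-one-to-clear: the andc below
+             * clears whichever bits are set in the written value.)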
*/ + case ARM_IWMMXT_wCSSF: + tmp = iwmmxt_load_creg(wrd); + tmp2 = load_reg(s, rd); + tcg_gen_andc_i32(tmp, tmp, tmp2); + tcg_temp_free_i32(tmp2); + iwmmxt_store_creg(wrd, tmp); + break; + case ARM_IWMMXT_wCGR0: + case ARM_IWMMXT_wCGR1: + case ARM_IWMMXT_wCGR2: + case ARM_IWMMXT_wCGR3: + gen_op_iwmmxt_set_cup(); + tmp = load_reg(s, rd); + iwmmxt_store_creg(wrd, tmp); + break; + default: + return 1; + } + break; + case 0x100: /* WXOR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_xorq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x111: /* TMRC */ + if (insn & 0xf) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = iwmmxt_load_creg(wrd); + store_reg(s, rd, tmp); + break; + case 0x300: /* WANDN */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tcg_gen_neg_i64(cpu_M0, cpu_M0); + gen_op_iwmmxt_andq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x200: /* WAND */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_andq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x810: case 0xa10: /* WMADD */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) + gen_op_iwmmxt_maddsq_M0_wRn(rd1); + else + gen_op_iwmmxt_madduq_M0_wRn(rd1); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x10e: case 0x50e: case 0x90e: case 0xd0e: /* WUNPCKIL */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_unpacklb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_unpacklw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_unpackll_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x10c: case 0x50c: case 0x90c: case 0xd0c: /* WUNPCKIH */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_unpackhb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_unpackhw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_unpackhl_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x012: case 0x112: case 0x412: case 0x512: /* WSAD */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 22)) + gen_op_iwmmxt_sadw_M0_wRn(rd1); + else + gen_op_iwmmxt_sadb_M0_wRn(rd1); + if (!(insn & (1 << 20))) + gen_op_iwmmxt_addl_M0_wRn(wrd); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x010: case 0x110: case 0x210: case 0x310: /* WMUL */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) { + if (insn & (1 << 20)) + gen_op_iwmmxt_mulshw_M0_wRn(rd1); + else + 
gen_op_iwmmxt_mulslw_M0_wRn(rd1); + } else { + if (insn & (1 << 20)) + gen_op_iwmmxt_muluhw_M0_wRn(rd1); + else + gen_op_iwmmxt_mululw_M0_wRn(rd1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x410: case 0x510: case 0x610: case 0x710: /* WMAC */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) + gen_op_iwmmxt_macsw_M0_wRn(rd1); + else + gen_op_iwmmxt_macuw_M0_wRn(rd1); + if (!(insn & (1 << 20))) { + iwmmxt_load_reg(cpu_V1, wrd); + tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x006: case 0x406: case 0x806: case 0xc06: /* WCMPEQ */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_cmpeqb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_cmpeqw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_cmpeql_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x800: case 0x900: case 0xc00: case 0xd00: /* WAVG2 */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 22)) { + if (insn & (1 << 20)) + gen_op_iwmmxt_avgw1_M0_wRn(rd1); + else + gen_op_iwmmxt_avgw0_M0_wRn(rd1); + } else { + if (insn & (1 << 20)) + gen_op_iwmmxt_avgb1_M0_wRn(rd1); + else + gen_op_iwmmxt_avgb0_M0_wRn(rd1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x802: case 0x902: case 0xa02: case 0xb02: /* WALIGNR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCGR0 + ((insn >> 20) & 3)); + tcg_gen_andi_i32(tmp, tmp, 7); + iwmmxt_load_reg(cpu_V1, rd1); + gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, tmp); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x601: case 0x605: case 0x609: case 0x60d: /* TINSR */ + if (((insn >> 6) & 3) == 3) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + gen_op_iwmmxt_movq_M0_wRn(wrd); + switch ((insn >> 6) & 3) { + case 0: + tmp2 = tcg_constant_i32(0xff); + tmp3 = tcg_constant_i32((insn & 7) << 3); + break; + case 1: + tmp2 = tcg_constant_i32(0xffff); + tmp3 = tcg_constant_i32((insn & 3) << 4); + break; + case 2: + tmp2 = tcg_constant_i32(0xffffffff); + tmp3 = tcg_constant_i32((insn & 1) << 5); + break; + default: + g_assert_not_reached(); + } + gen_helper_iwmmxt_insr(cpu_M0, cpu_M0, tmp, tmp2, tmp3); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x107: case 0x507: case 0x907: case 0xd07: /* TEXTRM */ + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + if (rd == 15 || ((insn >> 22) & 3) == 3) + return 1; + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 0: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 7) << 3); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + if (insn & 8) { + tcg_gen_ext8s_i32(tmp, tmp); + } else { + tcg_gen_andi_i32(tmp, tmp, 0xff); + } + break; + case 1: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 3) << 4); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + if (insn & 8) { + tcg_gen_ext16s_i32(tmp, tmp); + } else { + 
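+                    /* zero-extend: keep only the low halfword */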
tcg_gen_andi_i32(tmp, tmp, 0xffff); + } + break; + case 2: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 1) << 5); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + break; + } + store_reg(s, rd, tmp); + break; + case 0x117: case 0x517: case 0x917: case 0xd17: /* TEXTRC */ + if ((insn & 0x000ff008) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + switch ((insn >> 22) & 3) { + case 0: + tcg_gen_shri_i32(tmp, tmp, ((insn & 7) << 2) + 0); + break; + case 1: + tcg_gen_shri_i32(tmp, tmp, ((insn & 3) << 3) + 4); + break; + case 2: + tcg_gen_shri_i32(tmp, tmp, ((insn & 1) << 4) + 12); + break; + } + tcg_gen_shli_i32(tmp, tmp, 28); + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + break; + case 0x401: case 0x405: case 0x409: case 0x40d: /* TBCST */ + if (((insn >> 6) & 3) == 3) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + switch ((insn >> 6) & 3) { + case 0: + gen_helper_iwmmxt_bcstb(cpu_M0, tmp); + break; + case 1: + gen_helper_iwmmxt_bcstw(cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_bcstl(cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x113: case 0x513: case 0x913: case 0xd13: /* TANDC */ + if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + tmp2 = tcg_temp_new_i32(); + tcg_gen_mov_i32(tmp2, tmp); + switch ((insn >> 22) & 3) { + case 0: + for (i = 0; i < 7; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 4); + tcg_gen_and_i32(tmp, tmp, tmp2); + } + break; + case 1: + for (i = 0; i < 3; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 8); + tcg_gen_and_i32(tmp, tmp, tmp2); + } + break; + case 2: + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_and_i32(tmp, tmp, tmp2); + break; + } + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + break; + case 0x01c: case 0x41c: case 0x81c: case 0xc1c: /* WACC */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_helper_iwmmxt_addcb(cpu_M0, cpu_M0); + break; + case 1: + gen_helper_iwmmxt_addcw(cpu_M0, cpu_M0); + break; + case 2: + gen_helper_iwmmxt_addcl(cpu_M0, cpu_M0); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x115: case 0x515: case 0x915: case 0xd15: /* TORC */ + if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + tmp2 = tcg_temp_new_i32(); + tcg_gen_mov_i32(tmp2, tmp); + switch ((insn >> 22) & 3) { + case 0: + for (i = 0; i < 7; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 4); + tcg_gen_or_i32(tmp, tmp, tmp2); + } + break; + case 1: + for (i = 0; i < 3; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 8); + tcg_gen_or_i32(tmp, tmp, tmp2); + } + break; + case 2: + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp, tmp, tmp2); + break; + } + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + break; + case 0x103: case 0x503: case 0x903: case 0xd03: /* TMOVMSK */ + rd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + if ((insn & 0xf) != 0 || ((insn >> 22) & 3) == 3) + return 1; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 0: + gen_helper_iwmmxt_msbb(tmp, cpu_M0); + break; + case 1: + gen_helper_iwmmxt_msbw(tmp, cpu_M0); + break; + case 2: + gen_helper_iwmmxt_msbl(tmp, cpu_M0); + break; + } + store_reg(s, rd, tmp); + break; + 
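+        /* For the element-wise ops below, insn[23:22] selects the
+           element size and insn[21] picks signed vs unsigned. */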
case 0x106: case 0x306: case 0x506: case 0x706: /* WCMPGT */ + case 0x906: case 0xb06: case 0xd06: case 0xf06: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsb_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsw_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsl_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x00e: case 0x20e: case 0x40e: case 0x60e: /* WUNPCKEL */ + case 0x80e: case 0xa0e: case 0xc0e: case 0xe0e: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsb_M0(); + else + gen_op_iwmmxt_unpacklub_M0(); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsw_M0(); + else + gen_op_iwmmxt_unpackluw_M0(); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsl_M0(); + else + gen_op_iwmmxt_unpacklul_M0(); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x00c: case 0x20c: case 0x40c: case 0x60c: /* WUNPCKEH */ + case 0x80c: case 0xa0c: case 0xc0c: case 0xe0c: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsb_M0(); + else + gen_op_iwmmxt_unpackhub_M0(); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsw_M0(); + else + gen_op_iwmmxt_unpackhuw_M0(); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsl_M0(); + else + gen_op_iwmmxt_unpackhul_M0(); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x204: case 0x604: case 0xa04: case 0xe04: /* WSRL */ + case 0x214: case 0x614: case 0xa14: case 0xe14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_srlw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_srll(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_srlq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x004: case 0x404: case 0x804: case 0xc04: /* WSRA */ + case 0x014: case 0x414: case 0x814: case 0xc14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_sraw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_sral(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_sraq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + 
gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x104: case 0x504: case 0x904: case 0xd04: /* WSLL */ + case 0x114: case 0x514: case 0x914: case 0xd14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_sllw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_slll(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_sllq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x304: case 0x704: case 0xb04: case 0xf04: /* WROR */ + case 0x314: case 0x714: case 0xb14: case 0xf14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 1: + if (gen_iwmmxt_shift(insn, 0xf, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + if (gen_iwmmxt_shift(insn, 0x1f, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorl(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + if (gen_iwmmxt_shift(insn, 0x3f, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x116: case 0x316: case 0x516: case 0x716: /* WMIN */ + case 0x916: case 0xb16: case 0xd16: case 0xf16: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsb_M0_wRn(rd1); + else + gen_op_iwmmxt_minub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsw_M0_wRn(rd1); + else + gen_op_iwmmxt_minuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsl_M0_wRn(rd1); + else + gen_op_iwmmxt_minul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x016: case 0x216: case 0x416: case 0x616: /* WMAX */ + case 0x816: case 0xa16: case 0xc16: case 0xe16: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsb_M0_wRn(rd1); + else + gen_op_iwmmxt_maxub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsw_M0_wRn(rd1); + else + gen_op_iwmmxt_maxuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsl_M0_wRn(rd1); + else + gen_op_iwmmxt_maxul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x002: case 0x102: case 0x202: case 0x302: /* WALIGNI */ + case 0x402: case 0x502: case 0x602: case 0x702: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + iwmmxt_load_reg(cpu_V1, rd1); + gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, + tcg_constant_i32((insn >> 20) & 3)); + 
gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x01a: case 0x11a: case 0x21a: case 0x31a: /* WSUB */ + case 0x41a: case 0x51a: case 0x61a: case 0x71a: + case 0x81a: case 0x91a: case 0xa1a: case 0xb1a: + case 0xc1a: case 0xd1a: case 0xe1a: case 0xf1a: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 20) & 0xf) { + case 0x0: + gen_op_iwmmxt_subnb_M0_wRn(rd1); + break; + case 0x1: + gen_op_iwmmxt_subub_M0_wRn(rd1); + break; + case 0x3: + gen_op_iwmmxt_subsb_M0_wRn(rd1); + break; + case 0x4: + gen_op_iwmmxt_subnw_M0_wRn(rd1); + break; + case 0x5: + gen_op_iwmmxt_subuw_M0_wRn(rd1); + break; + case 0x7: + gen_op_iwmmxt_subsw_M0_wRn(rd1); + break; + case 0x8: + gen_op_iwmmxt_subnl_M0_wRn(rd1); + break; + case 0x9: + gen_op_iwmmxt_subul_M0_wRn(rd1); + break; + case 0xb: + gen_op_iwmmxt_subsl_M0_wRn(rd1); + break; + default: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x01e: case 0x11e: case 0x21e: case 0x31e: /* WSHUFH */ + case 0x41e: case 0x51e: case 0x61e: case 0x71e: + case 0x81e: case 0x91e: case 0xa1e: case 0xb1e: + case 0xc1e: case 0xd1e: case 0xe1e: case 0xf1e: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_constant_i32(((insn >> 16) & 0xf0) | (insn & 0x0f)); + gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x018: case 0x118: case 0x218: case 0x318: /* WADD */ + case 0x418: case 0x518: case 0x618: case 0x718: + case 0x818: case 0x918: case 0xa18: case 0xb18: + case 0xc18: case 0xd18: case 0xe18: case 0xf18: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 20) & 0xf) { + case 0x0: + gen_op_iwmmxt_addnb_M0_wRn(rd1); + break; + case 0x1: + gen_op_iwmmxt_addub_M0_wRn(rd1); + break; + case 0x3: + gen_op_iwmmxt_addsb_M0_wRn(rd1); + break; + case 0x4: + gen_op_iwmmxt_addnw_M0_wRn(rd1); + break; + case 0x5: + gen_op_iwmmxt_adduw_M0_wRn(rd1); + break; + case 0x7: + gen_op_iwmmxt_addsw_M0_wRn(rd1); + break; + case 0x8: + gen_op_iwmmxt_addnl_M0_wRn(rd1); + break; + case 0x9: + gen_op_iwmmxt_addul_M0_wRn(rd1); + break; + case 0xb: + gen_op_iwmmxt_addsl_M0_wRn(rd1); + break; + default: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x008: case 0x108: case 0x208: case 0x308: /* WPACK */ + case 0x408: case 0x508: case 0x608: case 0x708: + case 0x808: case 0x908: case 0xa08: case 0xb08: + case 0xc08: case 0xd08: case 0xe08: case 0xf08: + if (!(insn & (1 << 20)) || ((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsw_M0_wRn(rd1); + else + gen_op_iwmmxt_packuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsl_M0_wRn(rd1); + else + gen_op_iwmmxt_packul_M0_wRn(rd1); + break; + case 3: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsq_M0_wRn(rd1); + else + gen_op_iwmmxt_packuq_M0_wRn(rd1); + break; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x201: case 0x203: case 0x205: case 0x207: + case 0x209: case 0x20b: case 0x20d: case 
0x20f: + case 0x211: case 0x213: case 0x215: case 0x217: + case 0x219: case 0x21b: case 0x21d: case 0x21f: + wrd = (insn >> 5) & 0xf; + rd0 = (insn >> 12) & 0xf; + rd1 = (insn >> 0) & 0xf; + if (rd0 == 0xf || rd1 == 0xf) + return 1; + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = load_reg(s, rd0); + tmp2 = load_reg(s, rd1); + switch ((insn >> 16) & 0xf) { + case 0x0: /* TMIA */ + gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0x8: /* TMIAPH */ + gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0xc: case 0xd: case 0xe: case 0xf: /* TMIAxy */ + if (insn & (1 << 16)) + tcg_gen_shri_i32(tmp, tmp, 16); + if (insn & (1 << 17)) + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); + break; + default: + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + return 1; + } + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + default: + return 1; + } + + return 0; +} + +/* Disassemble an XScale DSP instruction. Returns nonzero if an error occurred + (ie. an undefined instruction). */ +static int disas_dsp_insn(DisasContext *s, uint32_t insn) +{ + int acc, rd0, rd1, rdhi, rdlo; + TCGv_i32 tmp, tmp2; + + if ((insn & 0x0ff00f10) == 0x0e200010) { + /* Multiply with Internal Accumulate Format */ + rd0 = (insn >> 12) & 0xf; + rd1 = insn & 0xf; + acc = (insn >> 5) & 7; + + if (acc != 0) + return 1; + + tmp = load_reg(s, rd0); + tmp2 = load_reg(s, rd1); + switch ((insn >> 16) & 0xf) { + case 0x0: /* MIA */ + gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0x8: /* MIAPH */ + gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0xc: /* MIABB */ + case 0xd: /* MIABT */ + case 0xe: /* MIATB */ + case 0xf: /* MIATT */ + if (insn & (1 << 16)) + tcg_gen_shri_i32(tmp, tmp, 16); + if (insn & (1 << 17)) + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); + break; + default: + return 1; + } + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + + gen_op_iwmmxt_movq_wRn_M0(acc); + return 0; + } + + if ((insn & 0x0fe00ff8) == 0x0c400000) { + /* Internal Accumulator Access Format */ + rdhi = (insn >> 16) & 0xf; + rdlo = (insn >> 12) & 0xf; + acc = insn & 7; + + if (acc != 0) + return 1; + + if (insn & ARM_CP_RW_BIT) { /* MRA */ + iwmmxt_load_reg(cpu_V0, acc); + tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); + tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); + tcg_gen_andi_i32(cpu_R[rdhi], cpu_R[rdhi], (1 << (40 - 32)) - 1); + } else { /* MAR */ + tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); + iwmmxt_store_reg(cpu_V0, acc); + } + return 0; + } + + return 1; +} + +static void gen_goto_ptr(void) +{ + tcg_gen_lookup_and_goto_ptr(); +} + +/* This will end the TB but doesn't guarantee we'll return to + * cpu_loop_exec. Any live exit_requests will be processed as we + * enter the next TB. + */ +static void gen_goto_tb(DisasContext *s, int n, target_long diff) +{ + if (translator_use_goto_tb(&s->base, s->pc_curr + diff)) { + /* + * For pcrel, the pc must always be up-to-date on entry to + * the linked TB, so that it can use simple additions for all + * further adjustments. For !pcrel, the linked TB is compiled + * to know its full virtual address, so we can delay the + * update to pc to the unlinked path. A long chain of links + * can thus avoid many updates to the PC. 
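+ * (Concretely, in the code below: with TARGET_TB_PCREL the emitted
+ * sequence is "update pc; goto_tb n", whereas without it it is
+ * "goto_tb n; update pc", leaving the pc update only on the
+ * unlinked fallthrough path.)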
+ */ + if (TARGET_TB_PCREL) { + gen_update_pc(s, diff); + tcg_gen_goto_tb(n); + } else { + tcg_gen_goto_tb(n); + gen_update_pc(s, diff); + } + tcg_gen_exit_tb(s->base.tb, n); + } else { + gen_update_pc(s, diff); + gen_goto_ptr(); + } + s->base.is_jmp = DISAS_NORETURN; +} + +/* Jump, specifying which TB number to use if we gen_goto_tb() */ +static void gen_jmp_tb(DisasContext *s, target_long diff, int tbno) +{ + if (unlikely(s->ss_active)) { + /* An indirect jump so that we still trigger the debug exception. */ + gen_update_pc(s, diff); + s->base.is_jmp = DISAS_JUMP; + return; + } + switch (s->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + case DISAS_NORETURN: + /* + * The normal case: just go to the destination TB. + * NB: NORETURN happens if we generate code like + * gen_brcondi(l); + * gen_jmp(); + * gen_set_label(l); + * gen_jmp(); + * on the second call to gen_jmp(). + */ + gen_goto_tb(s, tbno, diff); + break; + case DISAS_UPDATE_NOCHAIN: + case DISAS_UPDATE_EXIT: + /* + * We already decided we're leaving the TB for some other reason. + * Avoid using goto_tb so we really do exit back to the main loop + * and don't chain to another TB. + */ + gen_update_pc(s, diff); + gen_goto_ptr(); + s->base.is_jmp = DISAS_NORETURN; + break; + default: + /* + * We shouldn't be emitting code for a jump and also have + * is_jmp set to one of the special cases like DISAS_SWI. + */ + g_assert_not_reached(); + } +} + +static inline void gen_jmp(DisasContext *s, target_long diff) +{ + gen_jmp_tb(s, diff, 0); +} + +static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y) +{ + if (x) + tcg_gen_sari_i32(t0, t0, 16); + else + gen_sxth(t0); + if (y) + tcg_gen_sari_i32(t1, t1, 16); + else + gen_sxth(t1); + tcg_gen_mul_i32(t0, t0, t1); +} + +/* Return the mask of PSR bits set by a MSR instruction. */ +static uint32_t msr_mask(DisasContext *s, int flags, int spsr) +{ + uint32_t mask = 0; + + if (flags & (1 << 0)) { + mask |= 0xff; + } + if (flags & (1 << 1)) { + mask |= 0xff00; + } + if (flags & (1 << 2)) { + mask |= 0xff0000; + } + if (flags & (1 << 3)) { + mask |= 0xff000000; + } + + /* Mask out undefined and reserved bits. */ + mask &= aarch32_cpsr_valid_mask(s->features, s->isar); + + /* Mask out execution state. */ + if (!spsr) { + mask &= ~CPSR_EXEC; + } + + /* Mask out privileged bits. */ + if (IS_USER(s)) { + mask &= CPSR_USER; + } + return mask; +} + +/* Returns nonzero if access to the PSR is not permitted. Marks t0 as dead. */ +static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv_i32 t0) +{ + TCGv_i32 tmp; + if (spsr) { + /* ??? This is also undefined in system mode. */ + if (IS_USER(s)) + return 1; + + tmp = load_cpu_field(spsr); + tcg_gen_andi_i32(tmp, tmp, ~mask); + tcg_gen_andi_i32(t0, t0, mask); + tcg_gen_or_i32(tmp, tmp, t0); + store_cpu_field(tmp, spsr); + } else { + gen_set_cpsr(t0, mask); + } + tcg_temp_free_i32(t0); + gen_lookup_tb(s); + return 0; +} + +/* Returns nonzero if access to the PSR is not permitted. */ +static int gen_set_psr_im(DisasContext *s, uint32_t mask, int spsr, uint32_t val) +{ + TCGv_i32 tmp; + tmp = tcg_temp_new_i32(); + tcg_gen_movi_i32(tmp, val); + return gen_set_psr(s, mask, spsr, tmp); +} + +static bool msr_banked_access_decode(DisasContext *s, int r, int sysm, int rn, + int *tgtmode, int *regno) +{ + /* Decode the r and sysm fields of MSR/MRS banked accesses into + * the target mode and register number, and identify the various + * unpredictable cases. 
+ * MSR (banked) and MRS (banked) are CONSTRAINED UNPREDICTABLE if: + * + executed in user mode + * + using R15 as the src/dest register + * + accessing an unimplemented register + * + accessing a register that's inaccessible at current PL/security state* + * + accessing a register that you could access with a different insn + * We choose to UNDEF in all these cases. + * Since we don't know which of the various AArch32 modes we are in + * we have to defer some checks to runtime. + * Accesses to Monitor mode registers from Secure EL1 (which implies + * that EL3 is AArch64) must trap to EL3. + * + * If the access checks fail this function will emit code to take + * an exception and return false. Otherwise it will return true, + * and set *tgtmode and *regno appropriately. + */ + /* These instructions are present only in ARMv8, or in ARMv7 with the + * Virtualization Extensions. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_EL2)) { + goto undef; + } + + if (IS_USER(s) || rn == 15) { + goto undef; + } + + /* The table in the v8 ARM ARM section F5.2.3 describes the encoding + * of registers into (r, sysm). + */ + if (r) { + /* SPSRs for other modes */ + switch (sysm) { + case 0xe: /* SPSR_fiq */ + *tgtmode = ARM_CPU_MODE_FIQ; + break; + case 0x10: /* SPSR_irq */ + *tgtmode = ARM_CPU_MODE_IRQ; + break; + case 0x12: /* SPSR_svc */ + *tgtmode = ARM_CPU_MODE_SVC; + break; + case 0x14: /* SPSR_abt */ + *tgtmode = ARM_CPU_MODE_ABT; + break; + case 0x16: /* SPSR_und */ + *tgtmode = ARM_CPU_MODE_UND; + break; + case 0x1c: /* SPSR_mon */ + *tgtmode = ARM_CPU_MODE_MON; + break; + case 0x1e: /* SPSR_hyp */ + *tgtmode = ARM_CPU_MODE_HYP; + break; + default: /* unallocated */ + goto undef; + } + /* We arbitrarily assign SPSR a register number of 16. */ + *regno = 16; + } else { + /* general purpose registers for other modes */ + switch (sysm) { + case 0x0 ... 0x6: /* 0b00xxx : r8_usr ... r14_usr */ + *tgtmode = ARM_CPU_MODE_USR; + *regno = sysm + 8; + break; + case 0x8 ... 0xe: /* 0b01xxx : r8_fiq ... r14_fiq */ + *tgtmode = ARM_CPU_MODE_FIQ; + *regno = sysm; + break; + case 0x10 ... 0x11: /* 0b1000x : r14_irq, r13_irq */ + *tgtmode = ARM_CPU_MODE_IRQ; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x12 ... 0x13: /* 0b1001x : r14_svc, r13_svc */ + *tgtmode = ARM_CPU_MODE_SVC; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x14 ... 0x15: /* 0b1010x : r14_abt, r13_abt */ + *tgtmode = ARM_CPU_MODE_ABT; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x16 ... 0x17: /* 0b1011x : r14_und, r13_und */ + *tgtmode = ARM_CPU_MODE_UND; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x1c ... 0x1d: /* 0b1110x : r14_mon, r13_mon */ + *tgtmode = ARM_CPU_MODE_MON; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x1e ... 0x1f: /* 0b1111x : elr_hyp, r13_hyp */ + *tgtmode = ARM_CPU_MODE_HYP; + /* Arbitrarily pick 17 for ELR_Hyp (which is not a banked LR!) */ + *regno = sysm & 1 ? 13 : 17; + break; + default: /* unallocated */ + goto undef; + } + } + + /* Catch the 'accessing inaccessible register' cases we can detect + * at translate time. + */ + switch (*tgtmode) { + case ARM_CPU_MODE_MON: + if (!arm_dc_feature(s, ARM_FEATURE_EL3) || s->ns) { + goto undef; + } + if (s->current_el == 1) { + /* If we're in Secure EL1 (which implies that EL3 is AArch64) + * then accesses to Mon registers trap to Secure EL2, if it exists, + * otherwise EL3. 
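+ * (The code below computes that target EL branchlessly: a signed
+ * single-bit extract of SCR_EL3.EEL2 yields 0 or -1, and adding 3
+ * gives EL3 or EL2 respectively.)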
+ */ + TCGv_i32 tcg_el; + + if (arm_dc_feature(s, ARM_FEATURE_AARCH64) && + dc_isar_feature(aa64_sel2, s)) { + /* Target EL is EL<3 minus SCR_EL3.EEL2> */ + tcg_el = load_cpu_field(cp15.scr_el3); + tcg_gen_sextract_i32(tcg_el, tcg_el, ctz32(SCR_EEL2), 1); + tcg_gen_addi_i32(tcg_el, tcg_el, 3); + } else { + tcg_el = tcg_constant_i32(3); + } + + gen_exception_insn_el_v(s, 0, EXCP_UDEF, + syn_uncategorized(), tcg_el); + tcg_temp_free_i32(tcg_el); + return false; + } + break; + case ARM_CPU_MODE_HYP: + /* + * SPSR_hyp and r13_hyp can only be accessed from Monitor mode + * (and so we can forbid accesses from EL2 or below). elr_hyp + * can be accessed also from Hyp mode, so forbid accesses from + * EL0 or EL1. + */ + if (!arm_dc_feature(s, ARM_FEATURE_EL2) || s->current_el < 2 || + (s->current_el < 3 && *regno != 17)) { + goto undef; + } + break; + default: + break; + } + + return true; + +undef: + /* If we get here then some access check did not pass */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); + return false; +} + +static void gen_msr_banked(DisasContext *s, int r, int sysm, int rn) +{ + TCGv_i32 tcg_reg; + int tgtmode = 0, regno = 0; + + if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { + return; + } + + /* Sync state because msr_banked() can raise exceptions */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_reg = load_reg(s, rn); + gen_helper_msr_banked(cpu_env, tcg_reg, + tcg_constant_i32(tgtmode), + tcg_constant_i32(regno)); + tcg_temp_free_i32(tcg_reg); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn) +{ + TCGv_i32 tcg_reg; + int tgtmode = 0, regno = 0; + + if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { + return; + } + + /* Sync state because mrs_banked() can raise exceptions */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_reg = tcg_temp_new_i32(); + gen_helper_mrs_banked(tcg_reg, cpu_env, + tcg_constant_i32(tgtmode), + tcg_constant_i32(regno)); + store_reg(s, rn, tcg_reg); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +/* Store value to PC as for an exception return (ie don't + * mask bits). The subsequent call to gen_helper_cpsr_write_eret() + * will do the masking based on the new value of the Thumb bit. + */ +static void store_pc_exc_ret(DisasContext *s, TCGv_i32 pc) +{ + tcg_gen_mov_i32(cpu_R[15], pc); + tcg_temp_free_i32(pc); +} + +/* Generate a v6 exception return. Marks both values as dead. */ +static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr) +{ + store_pc_exc_ret(s, pc); + /* The cpsr_write_eret helper will mask the low bits of PC + * appropriately depending on the new Thumb bit, so it must + * be called after storing the new PC. + */ + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + gen_helper_cpsr_write_eret(cpu_env, cpsr); + tcg_temp_free_i32(cpsr); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; +} + +/* Generate an old-style exception return. Marks pc as dead. 
*/ +static void gen_exception_return(DisasContext *s, TCGv_i32 pc) +{ + gen_rfe(s, pc, load_cpu_field(spsr)); +} + +static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz, + gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr qc_ptr = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr, + opr_sz, max_sz, 0, fn); + tcg_temp_free_ptr(qc_ptr); +} + +void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32 + }; + tcg_debug_assert(vece >= 1 && vece <= 2); + gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); +} + +void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32 + }; + tcg_debug_assert(vece >= 1 && vece <= 2); + gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); +} + +#define GEN_CMP0(NAME, COND) \ + static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \ + { \ + tcg_gen_setcondi_i32(COND, d, a, 0); \ + tcg_gen_neg_i32(d, d); \ + } \ + static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \ + { \ + tcg_gen_setcondi_i64(COND, d, a, 0); \ + tcg_gen_neg_i64(d, d); \ + } \ + static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \ + { \ + TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \ + tcg_gen_cmp_vec(COND, vece, d, a, zero); \ + } \ + void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \ + uint32_t opr_sz, uint32_t max_sz) \ + { \ + const GVecGen2 op[4] = { \ + { .fno = gen_helper_gvec_##NAME##0_b, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_8 }, \ + { .fno = gen_helper_gvec_##NAME##0_h, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_16 }, \ + { .fni4 = gen_##NAME##0_i32, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_32 }, \ + { .fni8 = gen_##NAME##0_i64, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .prefer_i64 = TCG_TARGET_REG_BITS == 64, \ + .vece = MO_64 }, \ + }; \ + tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \ + } + +static const TCGOpcode vecop_list_cmp[] = { + INDEX_op_cmp_vec, 0 +}; + +GEN_CMP0(ceq, TCG_COND_EQ) +GEN_CMP0(cle, TCG_COND_LE) +GEN_CMP0(cge, TCG_COND_GE) +GEN_CMP0(clt, TCG_COND_LT) +GEN_CMP0(cgt, TCG_COND_GT) + +#undef GEN_CMP0 + +static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_sari_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_sari_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_sari_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode 
vecop_list[] = { + INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_ssra8_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_ssra16_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_ssra32_i32, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_ssra64_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. + */ + shift = MIN(shift, (8 << vece) - 1); + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); +} + +static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_shri_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_usra8_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8, }, + { .fni8 = gen_usra16_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16, }, + { .fni4 = gen_usra32_i32, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32, }, + { .fni8 = gen_usra64_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64, }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Unsigned results in all zeros as input to accumulate: nop. + */ + if (shift < (8 << vece)) { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } else { + /* Nop, but we do need to clear the tail. */ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } +} + +/* + * Shift one less than the requested amount, and the low bit is + * the rounding bit. For the 8 and 16-bit operations, because we + * mask the low bit, we can perform a normal integer shift instead + * of a vector shift.
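+ * E.g. a rounding shift right by 2 of x == 14 computes
+ * (14 >> 2) + ((14 >> 1) & 1) == 3 + 1 == 4, i.e. 14/4 == 3.5
+ * rounded to nearest with ties upward.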
+ */ +static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); + tcg_gen_vec_sar8i_i64(d, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); + tcg_gen_vec_sar16i_i64(d, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t; + + /* Handle shift by the input size for the benefit of trans_SRSHR_ri */ + if (sh == 32) { + tcg_gen_movi_i32(d, 0); + return; + } + t = tcg_temp_new_i32(); + tcg_gen_extract_i32(t, a, sh - 1, 1); + tcg_gen_sari_i32(d, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, a, sh - 1, 1); + tcg_gen_sari_i64(d, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec ones = tcg_temp_new_vec_matching(d); + + tcg_gen_shri_vec(vece, t, a, sh - 1); + tcg_gen_dupi_vec(vece, ones, 1); + tcg_gen_and_vec(vece, t, t, ones); + tcg_gen_sari_vec(vece, d, a, sh); + tcg_gen_add_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(ones); +} + +void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_srshr8_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_srshr16_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_srshr32_i32, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_srshr64_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + if (shift == (8 << vece)) { + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. With rounding, this produces + * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. + * I.e. always zero. 
+ */ + tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr8_i64(t, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr16_i64(t, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + gen_srshr32_i32(t, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr64_i64(t, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + gen_srshr_vec(vece, t, a, sh); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_srsra8_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_srsra16_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_srsra32_i32, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_srsra64_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. With rounding, this produces + * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. + * I.e. always zero. With accumulation, this leaves D unchanged. + */ + if (shift == (8 << vece)) { + /* Nop, but we do need to clear the tail. 
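+ * (A mov with identical source and destination is not entirely
+ * dead here: like every gvec write it also zeroes the bytes
+ * between opr_sz and max_sz.)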
*/ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); + tcg_gen_vec_shr8i_i64(d, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); + tcg_gen_vec_shr16i_i64(d, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t; + + /* Handle shift by the input size for the benefit of trans_URSHR_ri */ + if (sh == 32) { + tcg_gen_extract_i32(d, a, sh - 1, 1); + return; + } + t = tcg_temp_new_i32(); + tcg_gen_extract_i32(t, a, sh - 1, 1); + tcg_gen_shri_i32(d, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, a, sh - 1, 1); + tcg_gen_shri_i64(d, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec ones = tcg_temp_new_vec_matching(d); + + tcg_gen_shri_vec(vece, t, a, shift - 1); + tcg_gen_dupi_vec(vece, ones, 1); + tcg_gen_and_vec(vece, t, t, ones); + tcg_gen_shri_vec(vece, d, a, shift); + tcg_gen_add_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(ones); +} + +void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_urshr8_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_urshr16_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_urshr32_i32, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_urshr64_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + if (shift == (8 << vece)) { + /* + * Shifts larger than the element size are architecturally valid. + * Unsigned results in zero. With rounding, this produces a + * copy of the most significant bit. 
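+ * (For sh == esize, x >> esize is 0, so the result reduces to the
+ * rounding bit (x >> (esize - 1)) & 1; the single unsigned gvec
+ * shift by shift - 1 below yields exactly that bit.)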
+ */ + tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 8) { + tcg_gen_vec_shr8i_i64(t, a, 7); + } else { + gen_urshr8_i64(t, a, sh); + } + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 16) { + tcg_gen_vec_shr16i_i64(t, a, 15); + } else { + gen_urshr16_i64(t, a, sh); + } + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + if (sh == 32) { + tcg_gen_shri_i32(t, a, 31); + } else { + gen_urshr32_i32(t, a, sh); + } + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 64) { + tcg_gen_shri_i64(t, a, 63); + } else { + gen_urshr64_i64(t, a, sh); + } + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + if (sh == (8 << vece)) { + tcg_gen_shri_vec(vece, t, a, sh - 1); + } else { + gen_urshr_vec(vece, t, a, sh); + } + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_ursra8_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_ursra16_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_ursra32_i32, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_ursra64_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); +} + +static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); +} + +static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_deposit_i64(d, d, 
a, 0, 64 - shift); +} + +static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh)); + tcg_gen_shri_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + +void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; + const GVecGen2i ops[4] = { + { .fni8 = gen_shr8_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_shr16_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_shr32_ins_i32, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_shr64_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* Shift of esize leaves destination unchanged. */ + if (shift < (8 << vece)) { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } else { + /* Nop, but we do need to clear the tail. */ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } +} + +static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); +} + +static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_deposit_i64(d, d, a, shift, 64 - shift); +} + +static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_shli_vec(vece, t, a, sh); + tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh)); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + +void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; + const GVecGen2i ops[4] = { + { .fni8 = gen_shl8_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_shl16_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = 
gen_shl32_ins_i32, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_shl64_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [0..esize-1]. */ + tcg_debug_assert(shift >= 0); + tcg_debug_assert(shift < (8 << vece)); + + if (shift == 0) { + tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_add_u8(d, d, a); +} + +static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_sub_u8(d, d, a); +} + +static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_add_u16(d, d, a); +} + +static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_sub_u16(d, d, a); +} + +static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_add_i32(d, d, a); +} + +static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_sub_i32(d, d, a); +} + +static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_add_i64(d, d, a); +} + +static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_sub_i64(d, d, a); +} + +static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_sub_vec(vece, d, d, a); +} + +/* Note that while NEON does not support VMLA and VMLS as 64-bit ops, + * these tables are shared with AArch64 which does support them. 
+ */ +void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_mul_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_mla8_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_mla16_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_mla32_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_mla64_i64, + .fniv = gen_mla_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_mul_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_mls8_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_mls16_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_mls32_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_mls64_i64, + .fniv = gen_mls_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +/* CMTST : test is "if (X & Y != 0)". */ +static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_and_i32(d, a, b); + tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i32(d, d); +} + +void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_and_i64(d, a, b); + tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i64(d, d); +} + +static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_and_vec(vece, d, a, b); + tcg_gen_dupi_vec(vece, a, 0); + tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a); +} + +void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_helper_neon_tst_u8, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_helper_neon_tst_u16, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_cmtst_i32, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_cmtst_i64, + .fniv = gen_cmtst_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) +{ + TCGv_i32 lval = tcg_temp_new_i32(); + TCGv_i32 rval = tcg_temp_new_i32(); + TCGv_i32 lsh = tcg_temp_new_i32(); + TCGv_i32 rsh = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 max = tcg_constant_i32(32); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. 
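+ * (E.g. a shift operand of -8 arrives as 0xf8 in the low byte:
+ * lsh sign-extends to -8 and rsh becomes 8, so only the second
+ * movcond sees an in-range count and selects src >> 8; a count
+ * of 40 fails both range checks and yields zero.)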
+ */ + tcg_gen_ext8s_i32(lsh, shift); + tcg_gen_neg_i32(rsh, lsh); + tcg_gen_shl_i32(lval, src, lsh); + tcg_gen_shr_i32(rval, src, rsh); + tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero); + tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst); + + tcg_temp_free_i32(lval); + tcg_temp_free_i32(rval); + tcg_temp_free_i32(lsh); + tcg_temp_free_i32(rsh); +} + +void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) +{ + TCGv_i64 lval = tcg_temp_new_i64(); + TCGv_i64 rval = tcg_temp_new_i64(); + TCGv_i64 lsh = tcg_temp_new_i64(); + TCGv_i64 rsh = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + TCGv_i64 max = tcg_constant_i64(64); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_ext8s_i64(lsh, shift); + tcg_gen_neg_i64(rsh, lsh); + tcg_gen_shl_i64(lval, src, lsh); + tcg_gen_shr_i64(rval, src, rsh); + tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero); + tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst); + + tcg_temp_free_i64(lval); + tcg_temp_free_i64(rval); + tcg_temp_free_i64(lsh); + tcg_temp_free_i64(rsh); +} + +static void gen_ushl_vec(unsigned vece, TCGv_vec dst, + TCGv_vec src, TCGv_vec shift) +{ + TCGv_vec lval = tcg_temp_new_vec_matching(dst); + TCGv_vec rval = tcg_temp_new_vec_matching(dst); + TCGv_vec lsh = tcg_temp_new_vec_matching(dst); + TCGv_vec rsh = tcg_temp_new_vec_matching(dst); + TCGv_vec msk, max; + + tcg_gen_neg_vec(vece, rsh, shift); + if (vece == MO_8) { + tcg_gen_mov_vec(lsh, shift); + } else { + msk = tcg_temp_new_vec_matching(dst); + tcg_gen_dupi_vec(vece, msk, 0xff); + tcg_gen_and_vec(vece, lsh, shift, msk); + tcg_gen_and_vec(vece, rsh, rsh, msk); + tcg_temp_free_vec(msk); + } + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_shlv_vec(vece, lval, src, lsh); + tcg_gen_shrv_vec(vece, rval, src, rsh); + + max = tcg_temp_new_vec_matching(dst); + tcg_gen_dupi_vec(vece, max, 8 << vece); + + /* + * The choice of LT (signed) and GEU (unsigned) are biased toward + * the instructions of the x86_64 host. For MO_8, the whole byte + * is significant so we must use an unsigned compare; otherwise we + * have already masked to a byte and so a signed compare works. + * Other tcg hosts have a full set of comparisons and do not care. 
+ */ + if (vece == MO_8) { + tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max); + tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max); + tcg_gen_andc_vec(vece, lval, lval, lsh); + tcg_gen_andc_vec(vece, rval, rval, rsh); + } else { + tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max); + tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max); + tcg_gen_and_vec(vece, lval, lval, lsh); + tcg_gen_and_vec(vece, rval, rval, rsh); + } + tcg_gen_or_vec(vece, dst, lval, rval); + + tcg_temp_free_vec(max); + tcg_temp_free_vec(lval); + tcg_temp_free_vec(rval); + tcg_temp_free_vec(lsh); + tcg_temp_free_vec(rsh); +} + +void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_neg_vec, INDEX_op_shlv_vec, + INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_ushl_vec, + .fno = gen_helper_gvec_ushl_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_ushl_vec, + .fno = gen_helper_gvec_ushl_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_ushl_i32, + .fniv = gen_ushl_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_ushl_i64, + .fniv = gen_ushl_vec, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) +{ + TCGv_i32 lval = tcg_temp_new_i32(); + TCGv_i32 rval = tcg_temp_new_i32(); + TCGv_i32 lsh = tcg_temp_new_i32(); + TCGv_i32 rsh = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 max = tcg_constant_i32(31); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_ext8s_i32(lsh, shift); + tcg_gen_neg_i32(rsh, lsh); + tcg_gen_shl_i32(lval, src, lsh); + tcg_gen_umin_i32(rsh, rsh, max); + tcg_gen_sar_i32(rval, src, rsh); + tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero); + tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval); + + tcg_temp_free_i32(lval); + tcg_temp_free_i32(rval); + tcg_temp_free_i32(lsh); + tcg_temp_free_i32(rsh); +} + +void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) +{ + TCGv_i64 lval = tcg_temp_new_i64(); + TCGv_i64 rval = tcg_temp_new_i64(); + TCGv_i64 lsh = tcg_temp_new_i64(); + TCGv_i64 rsh = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + TCGv_i64 max = tcg_constant_i64(63); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. 
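+ * (One asymmetry with the unsigned case: an over-wide arithmetic
+ * right shift must still replicate the sign bit rather than give
+ * zero, so rsh is clamped to 63 with umin below instead of being
+ * discarded by a movcond.)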
+ */ + tcg_gen_ext8s_i64(lsh, shift); + tcg_gen_neg_i64(rsh, lsh); + tcg_gen_shl_i64(lval, src, lsh); + tcg_gen_umin_i64(rsh, rsh, max); + tcg_gen_sar_i64(rval, src, rsh); + tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero); + tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval); + + tcg_temp_free_i64(lval); + tcg_temp_free_i64(rval); + tcg_temp_free_i64(lsh); + tcg_temp_free_i64(rsh); +} + +static void gen_sshl_vec(unsigned vece, TCGv_vec dst, + TCGv_vec src, TCGv_vec shift) +{ + TCGv_vec lval = tcg_temp_new_vec_matching(dst); + TCGv_vec rval = tcg_temp_new_vec_matching(dst); + TCGv_vec lsh = tcg_temp_new_vec_matching(dst); + TCGv_vec rsh = tcg_temp_new_vec_matching(dst); + TCGv_vec tmp = tcg_temp_new_vec_matching(dst); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_neg_vec(vece, rsh, shift); + if (vece == MO_8) { + tcg_gen_mov_vec(lsh, shift); + } else { + tcg_gen_dupi_vec(vece, tmp, 0xff); + tcg_gen_and_vec(vece, lsh, shift, tmp); + tcg_gen_and_vec(vece, rsh, rsh, tmp); + } + + /* Bound rsh so out of bound right shift gets -1. */ + tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1); + tcg_gen_umin_vec(vece, rsh, rsh, tmp); + tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp); + + tcg_gen_shlv_vec(vece, lval, src, lsh); + tcg_gen_sarv_vec(vece, rval, src, rsh); + + /* Select in-bound left shift. */ + tcg_gen_andc_vec(vece, lval, lval, tmp); + + /* Select between left and right shift. */ + if (vece == MO_8) { + tcg_gen_dupi_vec(vece, tmp, 0); + tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval); + } else { + tcg_gen_dupi_vec(vece, tmp, 0x80); + tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval); + } + + tcg_temp_free_vec(lval); + tcg_temp_free_vec(rval); + tcg_temp_free_vec(lsh); + tcg_temp_free_vec(rsh); + tcg_temp_free_vec(tmp); +} + +void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec, + INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_sshl_vec, + .fno = gen_helper_gvec_sshl_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_sshl_vec, + .fno = gen_helper_gvec_sshl_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_sshl_i32, + .fniv = gen_sshl_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_sshl_i64, + .fniv = gen_sshl_vec, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_add_vec(vece, x, a, b); + tcg_gen_usadd_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_b, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_h, + 
.write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_s, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_d, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_add_vec(vece, x, a, b); + tcg_gen_ssadd_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_b, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_sub_vec(vece, x, a, b); + tcg_gen_ussub_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_b, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_sub_vec(vece, x, a, b); + tcg_gen_sssub_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_b, + .opt_opc = vecop_list, + 
.write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_sub_i32(t, a, b); + tcg_gen_sub_i32(d, b, a); + tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t); + tcg_temp_free_i32(t); +} + +static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_sub_i64(t, a, b); + tcg_gen_sub_i64(d, b, a); + tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t); + tcg_temp_free_i64(t); +} + +static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + tcg_gen_smin_vec(vece, t, a, b); + tcg_gen_smax_vec(vece, d, a, b); + tcg_gen_sub_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_sabd_i32, + .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_sabd_i64, + .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_sub_i32(t, a, b); + tcg_gen_sub_i32(d, b, a); + tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t); + tcg_temp_free_i32(t); +} + +static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_sub_i64(t, a, b); + tcg_gen_sub_i64(d, b, a); + tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t); + tcg_temp_free_i64(t); +} + +static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + tcg_gen_umin_vec(vece, t, a, b); + tcg_gen_umax_vec(vece, d, a, b); + tcg_gen_sub_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_uabd_i32, + .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_uabd_i64, + .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_d, + .prefer_i64 = 
TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + gen_sabd_i32(t, a, b); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + gen_sabd_i64(t, a, b); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + gen_sabd_vec(vece, t, a, b); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_add_vec, + INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_saba_i32, + .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_saba_i64, + .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + gen_uabd_i32(t, a, b); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + gen_uabd_i64(t, a, b); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + gen_uabd_vec(vece, t, a, b); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_add_vec, + INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_uaba_i32, + .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_uaba_i64, + .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void do_coproc_insn(DisasContext *s, int cpnum, int is64, + int opc1, int crn, int crm, int opc2, + bool isread, int rt, int rt2) +{ + uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); + TCGv_ptr tcg_ri = NULL; + bool need_exit_tb; + 
uint32_t syndrome; + + /* + * Note that since we are an implementation which takes an + * exception on a trapped conditional instruction only if the + * instruction passes its condition code check, we can take + * advantage of the clause in the ARM ARM that allows us to set + * the COND field in the instruction to 0xE in all cases. + * We could fish the actual condition out of the insn (ARM) + * or the condexec bits (Thumb) but it isn't necessary. + */ + switch (cpnum) { + case 14: + if (is64) { + syndrome = syn_cp14_rrt_trap(1, 0xe, opc1, crm, rt, rt2, + isread, false); + } else { + syndrome = syn_cp14_rt_trap(1, 0xe, opc1, opc2, crn, crm, + rt, isread, false); + } + break; + case 15: + if (is64) { + syndrome = syn_cp15_rrt_trap(1, 0xe, opc1, crm, rt, rt2, + isread, false); + } else { + syndrome = syn_cp15_rt_trap(1, 0xe, opc1, opc2, crn, crm, + rt, isread, false); + } + break; + default: + /* + * ARMv8 defines that only coprocessors 14 and 15 exist, + * so this can only happen if this is an ARMv7 or earlier CPU, + * in which case the syndrome information won't actually be + * guest visible. + */ + assert(!arm_dc_feature(s, ARM_FEATURE_V8)); + syndrome = syn_uncategorized(); + break; + } + + if (s->hstr_active && cpnum == 15 && s->current_el == 1) { + /* + * At EL1, check for a HSTR_EL2 trap, which must take precedence + * over the UNDEF for "no such register" or the UNDEF for "access + * permissions forbid this EL1 access". HSTR_EL2 traps from EL0 + * only happen if the cpreg doesn't UNDEF at EL0, so we do those in + * access_check_cp_reg(), after the checks for whether the access + * is configurably trapped to EL1. + */ + uint32_t maskbit = is64 ? crm : crn; + + if (maskbit != 4 && maskbit != 14) { + /* T4 and T14 are RES0 so never cause traps */ + TCGv_i32 t; + DisasLabel over = gen_disas_label(s); + + t = load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2)); + tcg_gen_andi_i32(t, t, 1u << maskbit); + tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); + tcg_temp_free_i32(t); + + gen_exception_insn(s, 0, EXCP_UDEF, syndrome); + set_disas_label(s, over); + } + } + + if (!ri) { + /* + * Unknown register; this might be a guest error or a QEMU + * unimplemented feature. + */ + if (is64) { + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " + "64 bit system register cp:%d opc1: %d crm:%d " + "(%s)\n", + isread ? "read" : "write", cpnum, opc1, crm, + s->ns ? "non-secure" : "secure"); + } else { + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " + "system register cp:%d opc1:%d crn:%d crm:%d " + "opc2:%d (%s)\n", + isread ? "read" : "write", cpnum, opc1, crn, + crm, opc2, s->ns ? "non-secure" : "secure"); + } + unallocated_encoding(s); + return; + } + + /* Check access permissions */ + if (!cp_access_ok(s->current_el, ri, isread)) { + unallocated_encoding(s); + return; + } + + if ((s->hstr_active && s->current_el == 0) || ri->accessfn || + (ri->fgt && s->fgt_active) || + (arm_dc_feature(s, ARM_FEATURE_XSCALE) && cpnum < 14)) { + /* + * Emit code to perform further access permissions checks at + * runtime; this may result in an exception. + * Note that on XScale all cp0..c13 registers do an access check + * call in order to handle c15_cpar.
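+ * We synchronize the condexec bits and the PC first so that any
+ * exception raised by the helper is taken with the correct
+ * return state.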
+ */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_ri = tcg_temp_new_ptr(); + gen_helper_access_check_cp_reg(tcg_ri, cpu_env, + tcg_constant_i32(key), + tcg_constant_i32(syndrome), + tcg_constant_i32(isread)); + } else if (ri->type & ARM_CP_RAISES_EXC) { + /* + * The readfn or writefn might raise an exception; + * synchronize the CPU state in case it does. + */ + gen_set_condexec(s); + gen_update_pc(s, 0); + } + + /* Handle special cases first */ + switch (ri->type & ARM_CP_SPECIAL_MASK) { + case 0: + break; + case ARM_CP_NOP: + goto exit; + case ARM_CP_WFI: + if (isread) { + unallocated_encoding(s); + } else { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFI; + } + goto exit; + default: + g_assert_not_reached(); + } + + if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { + gen_io_start(); + } + + if (isread) { + /* Read */ + if (is64) { + TCGv_i64 tmp64; + TCGv_i32 tmp; + if (ri->type & ARM_CP_CONST) { + tmp64 = tcg_constant_i64(ri->resetvalue); + } else if (ri->readfn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + tmp64 = tcg_temp_new_i64(); + gen_helper_get_cp_reg64(tmp64, cpu_env, tcg_ri); + } else { + tmp64 = tcg_temp_new_i64(); + tcg_gen_ld_i64(tmp64, cpu_env, ri->fieldoffset); + } + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, tmp64); + store_reg(s, rt, tmp); + tmp = tcg_temp_new_i32(); + tcg_gen_extrh_i64_i32(tmp, tmp64); + tcg_temp_free_i64(tmp64); + store_reg(s, rt2, tmp); + } else { + TCGv_i32 tmp; + if (ri->type & ARM_CP_CONST) { + tmp = tcg_constant_i32(ri->resetvalue); + } else if (ri->readfn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + tmp = tcg_temp_new_i32(); + gen_helper_get_cp_reg(tmp, cpu_env, tcg_ri); + } else { + tmp = load_cpu_offset(ri->fieldoffset); + } + if (rt == 15) { + /* Destination register of r15 for 32 bit loads sets + * the condition codes from the high 4 bits of the value + */ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, rt, tmp); + } + } + } else { + /* Write */ + if (ri->type & ARM_CP_CONST) { + /* If not forbidden by access permissions, treat as WI */ + goto exit; + } + + if (is64) { + TCGv_i32 tmplo, tmphi; + TCGv_i64 tmp64 = tcg_temp_new_i64(); + tmplo = load_reg(s, rt); + tmphi = load_reg(s, rt2); + tcg_gen_concat_i32_i64(tmp64, tmplo, tmphi); + tcg_temp_free_i32(tmplo); + tcg_temp_free_i32(tmphi); + if (ri->writefn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_set_cp_reg64(cpu_env, tcg_ri, tmp64); + } else { + tcg_gen_st_i64(tmp64, cpu_env, ri->fieldoffset); + } + tcg_temp_free_i64(tmp64); + } else { + TCGv_i32 tmp = load_reg(s, rt); + if (ri->writefn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_set_cp_reg(cpu_env, tcg_ri, tmp); + tcg_temp_free_i32(tmp); + } else { + store_cpu_offset(tmp, ri->fieldoffset, 4); + } + } + } + + /* I/O operations must end the TB here (whether read or write) */ + need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && + (ri->type & ARM_CP_IO)); + + if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { + /* + * A write to any coprocessor register that ends a TB + * must rebuild the hflags for the next TB. + */ + gen_rebuild_hflags(s, ri->type & ARM_CP_NEWEL); + /* + * We default to ending the TB on a coprocessor register write, + * but allow this to be suppressed by the register definition + * (usually only necessary to work around guest bugs). 
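+ * Ending the TB means the next instruction is looked up with the
+ * rebuilt hflags in effect.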
+ */ + need_exit_tb = true; + } + if (need_exit_tb) { + gen_lookup_tb(s); + } + + exit: + if (tcg_ri) { + tcg_temp_free_ptr(tcg_ri); + } +} + +/* Decode XScale DSP or iWMMXt insn (in the copro space, cp=0 or 1) */ +static void disas_xscale_insn(DisasContext *s, uint32_t insn) +{ + int cpnum = (insn >> 8) & 0xf; + + if (extract32(s->c15_cpar, cpnum, 1) == 0) { + unallocated_encoding(s); + } else if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { + if (disas_iwmmxt_insn(s, insn)) { + unallocated_encoding(s); + } + } else if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { + if (disas_dsp_insn(s, insn)) { + unallocated_encoding(s); + } + } +} + +/* Store a 64-bit value to a register pair. Clobbers val. */ +static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val) +{ + TCGv_i32 tmp; + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, val); + store_reg(s, rlow, tmp); + tmp = tcg_temp_new_i32(); + tcg_gen_extrh_i64_i32(tmp, val); + store_reg(s, rhigh, tmp); +} + +/* load and add a 64-bit value from a register pair. */ +static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh) +{ + TCGv_i64 tmp; + TCGv_i32 tmpl; + TCGv_i32 tmph; + + /* Load 64-bit value rd:rn. */ + tmpl = load_reg(s, rlow); + tmph = load_reg(s, rhigh); + tmp = tcg_temp_new_i64(); + tcg_gen_concat_i32_i64(tmp, tmpl, tmph); + tcg_temp_free_i32(tmpl); + tcg_temp_free_i32(tmph); + tcg_gen_add_i64(val, val, tmp); + tcg_temp_free_i64(tmp); +} + +/* Set N and Z flags from hi|lo. */ +static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) +{ + tcg_gen_mov_i32(cpu_NF, hi); + tcg_gen_or_i32(cpu_ZF, lo, hi); +} + +/* Load/Store exclusive instructions are implemented by remembering + the value/address loaded, and seeing if these are the same + when the store is performed. This should be sufficient to implement + the architecturally mandated semantics, and avoids having to monitor + regular stores. The compare vs the remembered value is done during + the cmpxchg operation, but we must compare the addresses manually. */ +static void gen_load_exclusive(DisasContext *s, int rt, int rt2, + TCGv_i32 addr, int size) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + MemOp opc = size | MO_ALIGN | s->be_data; + + s->is_ldex = true; + + if (size == 3) { + TCGv_i32 tmp2 = tcg_temp_new_i32(); + TCGv_i64 t64 = tcg_temp_new_i64(); + + /* + * For AArch32, architecturally the 32-bit word at the lowest + * address is always Rt and the one at addr+4 is Rt2, even if + * the CPU is big-endian. That means we don't want to do a + * gen_aa32_ld_i64(), which checks SCTLR_B as if for an + * architecturally 64-bit access, but instead do a 64-bit access + * using MO_BE if appropriate and then split the two halves. 
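+ * For big-endian data the high half of the 64-bit result is the
+ * word at the lower address, so below it is assigned to Rt and
+ * the low half to Rt2; for little-endian data it is the reverse.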
+ */ + TCGv taddr = gen_aa32_addr(s, addr, opc); + + tcg_gen_qemu_ld_i64(t64, taddr, get_mem_index(s), opc); + tcg_temp_free(taddr); + tcg_gen_mov_i64(cpu_exclusive_val, t64); + if (s->be_data == MO_BE) { + tcg_gen_extr_i64_i32(tmp2, tmp, t64); + } else { + tcg_gen_extr_i64_i32(tmp, tmp2, t64); + } + tcg_temp_free_i64(t64); + + store_reg(s, rt2, tmp2); + } else { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc); + tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp); + } + + store_reg(s, rt, tmp); + tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr); +} + +static void gen_clrex(DisasContext *s) +{ + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, + TCGv_i32 addr, int size) +{ + TCGv_i32 t0, t1, t2; + TCGv_i64 extaddr; + TCGv taddr; + TCGLabel *done_label; + TCGLabel *fail_label; + MemOp opc = size | MO_ALIGN | s->be_data; + + /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { + [addr] = {Rt}; + {Rd} = 0; + } else { + {Rd} = 1; + } */ + fail_label = gen_new_label(); + done_label = gen_new_label(); + extaddr = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(extaddr, addr); + tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); + tcg_temp_free_i64(extaddr); + + taddr = gen_aa32_addr(s, addr, opc); + t0 = tcg_temp_new_i32(); + t1 = load_reg(s, rt); + if (size == 3) { + TCGv_i64 o64 = tcg_temp_new_i64(); + TCGv_i64 n64 = tcg_temp_new_i64(); + + t2 = load_reg(s, rt2); + + /* + * For AArch32, architecturally the 32-bit word at the lowest + * address is always Rt and the one at addr+4 is Rt2, even if + * the CPU is big-endian. Since we're going to treat this as a + * single 64-bit BE store, we need to put the two halves in the + * opposite order for BE to LE, so that they end up in the right + * places. We don't want gen_aa32_st_i64, because that checks + * SCTLR_B as if for an architectural 64-bit access. + */ + if (s->be_data == MO_BE) { + tcg_gen_concat_i32_i64(n64, t2, t1); + } else { + tcg_gen_concat_i32_i64(n64, t1, t2); + } + tcg_temp_free_i32(t2); + + tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64, + get_mem_index(s), opc); + tcg_temp_free_i64(n64); + + tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val); + tcg_gen_extrl_i64_i32(t0, o64); + + tcg_temp_free_i64(o64); + } else { + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val); + tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc); + tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2); + tcg_temp_free_i32(t2); + } + tcg_temp_free_i32(t1); + tcg_temp_free(taddr); + tcg_gen_mov_i32(cpu_R[rd], t0); + tcg_temp_free_i32(t0); + tcg_gen_br(done_label); + + gen_set_label(fail_label); + tcg_gen_movi_i32(cpu_R[rd], 1); + gen_set_label(done_label); + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +/* gen_srs: + * @env: CPUARMState + * @s: DisasContext + * @mode: mode field from insn (which stack to store to) + * @amode: addressing mode (DA/IA/DB/IB), encoded as per P,U bits in ARM insn + * @writeback: true if writeback bit set + * + * Generate code for the SRS (Store Return State) insn. 
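+ * SRS stores the LR and SPSR of the current mode to the stack of
+ * the mode specified in the instruction.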
+ */ +static void gen_srs(DisasContext *s, + uint32_t mode, uint32_t amode, bool writeback) +{ + int32_t offset; + TCGv_i32 addr, tmp; + bool undef = false; + + /* SRS is: + * - trapped to EL3 if EL3 is AArch64 and we are at Secure EL1 + * and specified mode is monitor mode + * - UNDEFINED in Hyp mode + * - UNPREDICTABLE in User or System mode + * - UNPREDICTABLE if the specified mode is: + * -- not implemented + * -- not a valid mode number + * -- a mode that's at a higher exception level + * -- Monitor, if we are Non-secure + * For the UNPREDICTABLE cases we choose to UNDEF. + */ + if (s->current_el == 1 && !s->ns && mode == ARM_CPU_MODE_MON) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_uncategorized(), 3); + return; + } + + if (s->current_el == 0 || s->current_el == 2) { + undef = true; + } + + switch (mode) { + case ARM_CPU_MODE_USR: + case ARM_CPU_MODE_FIQ: + case ARM_CPU_MODE_IRQ: + case ARM_CPU_MODE_SVC: + case ARM_CPU_MODE_ABT: + case ARM_CPU_MODE_UND: + case ARM_CPU_MODE_SYS: + break; + case ARM_CPU_MODE_HYP: + if (s->current_el == 1 || !arm_dc_feature(s, ARM_FEATURE_EL2)) { + undef = true; + } + break; + case ARM_CPU_MODE_MON: + /* No need to check specifically for "are we non-secure" because + * we've already made EL0 UNDEF and handled the trap for S-EL1; + * so if this isn't EL3 then we must be non-secure. + */ + if (s->current_el != 3) { + undef = true; + } + break; + default: + undef = true; + } + + if (undef) { + unallocated_encoding(s); + return; + } + + addr = tcg_temp_new_i32(); + /* get_r13_banked() will raise an exception if called from System mode */ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_get_r13_banked(addr, cpu_env, tcg_constant_i32(mode)); + switch (amode) { + case 0: /* DA */ + offset = -4; + break; + case 1: /* IA */ + offset = 0; + break; + case 2: /* DB */ + offset = -8; + break; + case 3: /* IB */ + offset = 4; + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_i32(addr, addr, offset); + tmp = load_reg(s, 14); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + tmp = load_cpu_field(spsr); + tcg_gen_addi_i32(addr, addr, 4); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + if (writeback) { + switch (amode) { + case 0: + offset = -8; + break; + case 1: + offset = 4; + break; + case 2: + offset = -4; + break; + case 3: + offset = 0; + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_i32(addr, addr, offset); + gen_helper_set_r13_banked(cpu_env, tcg_constant_i32(mode), addr); + } + tcg_temp_free_i32(addr); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +/* Skip this instruction if the ARM condition is false */ +static void arm_skip_unless(DisasContext *s, uint32_t cond) +{ + arm_gen_condlabel(s); + arm_gen_test_cc(cond ^ 1, s->condlabel.label); +} + + +/* + * Constant expanders used by T16/T32 decode + */ + +/* Return only the rotation part of T32ExpandImm. */ +static int t32_expandimm_rot(DisasContext *s, int x) +{ + return x & 0xc00 ? extract32(x, 7, 5) : 0; +} + +/* Return the unrotated immediate from T32ExpandImm. */ +static int t32_expandimm_imm(DisasContext *s, int x) +{ + int imm = extract32(x, 0, 8); + + switch (extract32(x, 8, 4)) { + case 0: /* XY */ + /* Nothing to do. */ + break; + case 1: /* 00XY00XY */ + imm *= 0x00010001; + break; + case 2: /* XY00XY00 */ + imm *= 0x01000100; + break; + case 3: /* XYXYXYXY */ + imm *= 0x01010101; + break; + default: + /* Rotated constant. 
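+ * Per T32ExpandImm, the expanded value is 0x80 | x[6:0]
+ * rotated right by x[11:7]; the rotation itself is applied
+ * separately by t32_expandimm_rot().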
*/ + imm |= 0x80; + break; + } + return imm; +} + +static int t32_branch24(DisasContext *s, int x) +{ + /* Convert J1:J2 at x[22:21] to I2:I1, which involves I=J^~S. */ + x ^= !(x < 0) * (3 << 21); + /* Append the final zero. */ + return x << 1; +} + +static int t16_setflags(DisasContext *s) +{ + return s->condexec_mask == 0; +} + +static int t16_push_list(DisasContext *s, int x) +{ + return (x & 0xff) | (x & 0x100) << (14 - 8); +} + +static int t16_pop_list(DisasContext *s, int x) +{ + return (x & 0xff) | (x & 0x100) << (15 - 8); +} + +/* + * Include the generated decoders. + */ + +#include "decode-a32.c.inc" +#include "decode-a32-uncond.c.inc" +#include "decode-t32.c.inc" +#include "decode-t16.c.inc" + +static bool valid_cp(DisasContext *s, int cp) +{ + /* + * Return true if this coprocessor field indicates something + * that's really a possible coprocessor. + * For v7 and earlier, coprocessors 8..15 were reserved for Arm use, + * and of those only cp14 and cp15 were used for registers. + * cp10 and cp11 were used for VFP and Neon, whose decode is + * dealt with elsewhere. With the advent of fp16, cp9 is also + * now part of VFP. + * For v8A and later, the encoding has been tightened so that + * only cp14 and cp15 are valid, and other values aren't considered + * to be in the coprocessor-instruction space at all. v8M still + * permits coprocessors 0..7. + * For XScale, we must not decode the XScale cp0, cp1 space as + * a standard coprocessor insn, because we want to fall through to + * the legacy disas_xscale_insn() decoder after decodetree is done. + */ + if (arm_dc_feature(s, ARM_FEATURE_XSCALE) && (cp == 0 || cp == 1)) { + return false; + } + + if (arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_M)) { + return cp >= 14; + } + return cp < 8 || cp >= 14; +} + +static bool trans_MCR(DisasContext *s, arg_MCR *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, + false, a->rt, 0); + return true; +} + +static bool trans_MRC(DisasContext *s, arg_MRC *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, + true, a->rt, 0); + return true; +} + +static bool trans_MCRR(DisasContext *s, arg_MCRR *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, + false, a->rt, a->rt2); + return true; +} + +static bool trans_MRRC(DisasContext *s, arg_MRRC *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, + true, a->rt, a->rt2); + return true; +} + +/* Helpers to swap operands for reverse-subtract. */ +static void gen_rsb(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_sub_i32(dst, b, a); +} + +static void gen_rsb_CC(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) +{ + gen_sub_CC(dst, b, a); +} + +static void gen_rsc(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) +{ + gen_sub_carry(dest, b, a); +} + +static void gen_rsc_CC(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) +{ + gen_sbc_CC(dest, b, a); +} + +/* + * Helpers for the data processing routines. + * + * After the computation store the results back. + * This may be suppressed altogether (STREG_NONE), require a runtime + * check against the stack limits (STREG_SP_CHECK), or generate an + * exception return. Oh, or store into a register. + * + * Always return true, indicating success for a trans_* function. 
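+ * STREG_SP_CHECK exists for v8M stack-limit checking: writes to SP
+ * go through store_sp_checked() so that, when stack checking is
+ * enabled, the new value can be compared against the stack limit.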
+ */ +typedef enum { + STREG_NONE, + STREG_NORMAL, + STREG_SP_CHECK, + STREG_EXC_RET, +} StoreRegKind; + +static bool store_reg_kind(DisasContext *s, int rd, + TCGv_i32 val, StoreRegKind kind) +{ + switch (kind) { + case STREG_NONE: + tcg_temp_free_i32(val); + return true; + case STREG_NORMAL: + /* See ALUWritePC: Interworking only from a32 mode. */ + if (s->thumb) { + store_reg(s, rd, val); + } else { + store_reg_bx(s, rd, val); + } + return true; + case STREG_SP_CHECK: + store_sp_checked(s, val); + return true; + case STREG_EXC_RET: + gen_exception_return(s, val); + return true; + } + g_assert_not_reached(); +} + +/* + * Data Processing (register) + * + * Operate, with set flags, one register source, + * one immediate shifted register source, and a destination. + */ +static bool op_s_rrr_shi(DisasContext *s, arg_s_rrr_shi *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp2 = load_reg(s, a->rm); + gen_arm_shift_im(tmp2, a->shty, a->shim, logic_cc); + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxr_shi(DisasContext *s, arg_s_rrr_shi *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp; + + tmp = load_reg(s, a->rm); + gen_arm_shift_im(tmp, a->shty, a->shim, logic_cc); + + gen(tmp, tmp); + if (logic_cc) { + gen_logic_CC(tmp); + } + return store_reg_kind(s, a->rd, tmp, kind); +} + +/* + * Data-processing (register-shifted register) + * + * Operate, with set flags, one register source, + * one register shifted register source, and a destination. + */ +static bool op_s_rrr_shr(DisasContext *s, arg_s_rrr_shr *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp1 = load_reg(s, a->rs); + tmp2 = load_reg(s, a->rm); + gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxr_shr(DisasContext *s, arg_s_rrr_shr *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp1 = load_reg(s, a->rs); + tmp2 = load_reg(s, a->rm); + gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); + + gen(tmp2, tmp2); + if (logic_cc) { + gen_logic_CC(tmp2); + } + return store_reg_kind(s, a->rd, tmp2, kind); +} + +/* + * Data-processing (immediate) + * + * Operate, with set flags, one register source, + * one rotated immediate, and a destination. + * + * Note that logic_cc && a->rot setting CF based on the msb of the + * immediate is the reason why we must pass in the unrotated form + * of the immediate. 
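+ * For example, an immediate of 0x80 with a->rot = 8 expands to
+ * 0x80000000, so a flag-setting logic op must copy bit 31 (here 1)
+ * into CF.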
+ */ +static bool op_s_rri_rot(DisasContext *s, arg_s_rri_rot *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1; + uint32_t imm; + + imm = ror32(a->imm, a->rot); + if (logic_cc && a->rot) { + tcg_gen_movi_i32(cpu_CF, imm >> 31); + } + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tcg_constant_i32(imm)); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxi_rot(DisasContext *s, arg_s_rri_rot *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp; + uint32_t imm; + + imm = ror32(a->imm, a->rot); + if (logic_cc && a->rot) { + tcg_gen_movi_i32(cpu_CF, imm >> 31); + } + + tmp = tcg_temp_new_i32(); + gen(tmp, tcg_constant_i32(imm)); + + if (logic_cc) { + gen_logic_CC(tmp); + } + return store_reg_kind(s, a->rd, tmp, kind); +} + +#define DO_ANY3(NAME, OP, L, K) \ + static bool trans_##NAME##_rrri(DisasContext *s, arg_s_rrr_shi *a) \ + { StoreRegKind k = (K); return op_s_rrr_shi(s, a, OP, L, k); } \ + static bool trans_##NAME##_rrrr(DisasContext *s, arg_s_rrr_shr *a) \ + { StoreRegKind k = (K); return op_s_rrr_shr(s, a, OP, L, k); } \ + static bool trans_##NAME##_rri(DisasContext *s, arg_s_rri_rot *a) \ + { StoreRegKind k = (K); return op_s_rri_rot(s, a, OP, L, k); } + +#define DO_ANY2(NAME, OP, L, K) \ + static bool trans_##NAME##_rxri(DisasContext *s, arg_s_rrr_shi *a) \ + { StoreRegKind k = (K); return op_s_rxr_shi(s, a, OP, L, k); } \ + static bool trans_##NAME##_rxrr(DisasContext *s, arg_s_rrr_shr *a) \ + { StoreRegKind k = (K); return op_s_rxr_shr(s, a, OP, L, k); } \ + static bool trans_##NAME##_rxi(DisasContext *s, arg_s_rri_rot *a) \ + { StoreRegKind k = (K); return op_s_rxi_rot(s, a, OP, L, k); } + +#define DO_CMP2(NAME, OP, L) \ + static bool trans_##NAME##_xrri(DisasContext *s, arg_s_rrr_shi *a) \ + { return op_s_rrr_shi(s, a, OP, L, STREG_NONE); } \ + static bool trans_##NAME##_xrrr(DisasContext *s, arg_s_rrr_shr *a) \ + { return op_s_rrr_shr(s, a, OP, L, STREG_NONE); } \ + static bool trans_##NAME##_xri(DisasContext *s, arg_s_rri_rot *a) \ + { return op_s_rri_rot(s, a, OP, L, STREG_NONE); } + +DO_ANY3(AND, tcg_gen_and_i32, a->s, STREG_NORMAL) +DO_ANY3(EOR, tcg_gen_xor_i32, a->s, STREG_NORMAL) +DO_ANY3(ORR, tcg_gen_or_i32, a->s, STREG_NORMAL) +DO_ANY3(BIC, tcg_gen_andc_i32, a->s, STREG_NORMAL) + +DO_ANY3(RSB, a->s ? gen_rsb_CC : gen_rsb, false, STREG_NORMAL) +DO_ANY3(ADC, a->s ? gen_adc_CC : gen_add_carry, false, STREG_NORMAL) +DO_ANY3(SBC, a->s ? gen_sbc_CC : gen_sub_carry, false, STREG_NORMAL) +DO_ANY3(RSC, a->s ? gen_rsc_CC : gen_rsc, false, STREG_NORMAL) + +DO_CMP2(TST, tcg_gen_and_i32, true) +DO_CMP2(TEQ, tcg_gen_xor_i32, true) +DO_CMP2(CMN, gen_add_CC, false) +DO_CMP2(CMP, gen_sub_CC, false) + +DO_ANY3(ADD, a->s ? gen_add_CC : tcg_gen_add_i32, false, + a->rd == 13 && a->rn == 13 ? STREG_SP_CHECK : STREG_NORMAL) + +/* + * Note for the computation of StoreRegKind we return out of the + * middle of the functions that are expanded by DO_ANY3, and that + * we modify a->s via that parameter before it is used by OP. + */ +DO_ANY3(SUB, a->s ? gen_sub_CC : tcg_gen_sub_i32, false, + ({ + StoreRegKind ret = STREG_NORMAL; + if (a->rd == 15 && a->s) { + /* + * See ALUExceptionReturn: + * In User mode, UNPREDICTABLE; we choose UNDEF. + * In Hyp mode, UNDEFINED. + */ + if (IS_USER(s) || s->current_el == 2) { + unallocated_encoding(s); + return true; + } + /* There is no writeback of nzcv to PSTATE. 
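+ * The SPSR-to-CPSR copy is performed by the exception
+ * return itself, so we drop the S bit before OP sees it.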
*/ + a->s = 0; + ret = STREG_EXC_RET; + } else if (a->rd == 13 && a->rn == 13) { + ret = STREG_SP_CHECK; + } + ret; + })) + +DO_ANY2(MOV, tcg_gen_mov_i32, a->s, + ({ + StoreRegKind ret = STREG_NORMAL; + if (a->rd == 15 && a->s) { + /* + * See ALUExceptionReturn: + * In User mode, UNPREDICTABLE; we choose UNDEF. + * In Hyp mode, UNDEFINED. + */ + if (IS_USER(s) || s->current_el == 2) { + unallocated_encoding(s); + return true; + } + /* There is no writeback of nzcv to PSTATE. */ + a->s = 0; + ret = STREG_EXC_RET; + } else if (a->rd == 13) { + ret = STREG_SP_CHECK; + } + ret; + })) + +DO_ANY2(MVN, tcg_gen_not_i32, a->s, STREG_NORMAL) + +/* + * ORN is only available with T32, so there is no register-shifted-register + * form of the insn. Using the DO_ANY3 macro would create an unused function. + */ +static bool trans_ORN_rrri(DisasContext *s, arg_s_rrr_shi *a) +{ + return op_s_rrr_shi(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); +} + +static bool trans_ORN_rri(DisasContext *s, arg_s_rri_rot *a) +{ + return op_s_rri_rot(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); +} + +#undef DO_ANY3 +#undef DO_ANY2 +#undef DO_CMP2 + +static bool trans_ADR(DisasContext *s, arg_ri *a) +{ + store_reg_bx(s, a->rd, add_reg_for_lit(s, 15, a->imm)); + return true; +} + +static bool trans_MOVW(DisasContext *s, arg_MOVW *a) +{ + if (!ENABLE_ARCH_6T2) { + return false; + } + + store_reg(s, a->rd, tcg_constant_i32(a->imm)); + return true; +} + +static bool trans_MOVT(DisasContext *s, arg_MOVW *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_6T2) { + return false; + } + + tmp = load_reg(s, a->rd); + tcg_gen_ext16u_i32(tmp, tmp); + tcg_gen_ori_i32(tmp, tmp, a->imm << 16); + store_reg(s, a->rd, tmp); + return true; +} + +/* + * v8.1M MVE wide-shifts + */ +static bool do_mve_shl_ri(DisasContext *s, arg_mve_shl_ri *a, + WideShiftImmFn *fn) +{ + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (a->rdahi == 15) { + /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rdahi == 13) { + /* RdaHi == 13 is UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + if (a->shim == 0) { + a->shim = 32; + } + + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + + fn(rda, rda, a->shim); + + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + + return true; +} + +static bool trans_ASRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_sari_i64); +} + +static bool trans_LSLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_shli_i64); +} + +static bool trans_LSRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_shri_i64); +} + +static void gen_mve_sqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) +{ + gen_helper_mve_sqshll(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_SQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_mve_sqshll); +} + +static void gen_mve_uqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) +{ + gen_helper_mve_uqshll(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_UQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + 
return do_mve_shl_ri(s, a, gen_mve_uqshll); +} + +static bool trans_SRSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_srshr64_i64); +} + +static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_urshr64_i64); +} + +static bool do_mve_shl_rr(DisasContext *s, arg_mve_shl_rr *a, WideShiftFn *fn) +{ + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (a->rdahi == 15) { + /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rdahi == 13 || a->rm == 13 || a->rm == 15 || + a->rm == a->rdahi || a->rm == a->rdalo) { + /* These rdahi/rdalo/rm cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + + /* The helper takes care of the sign-extension of the low 8 bits of Rm */ + fn(rda, cpu_env, rda, cpu_R[a->rm]); + + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + + return true; +} + +static bool trans_LSLL_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_ushll); +} + +static bool trans_ASRL_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sshrl); +} + +static bool trans_UQRSHLL64_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll); +} + +static bool trans_SQRSHRL64_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl); +} + +static bool trans_UQRSHLL48_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll48); +} + +static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48); +} + +static bool do_mve_sh_ri(DisasContext *s, arg_mve_sh_ri *a, ShiftImmFn *fn) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rda == 13 || a->rda == 15) { + /* These rda cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + if (a->shim == 0) { + a->shim = 32; + } + fn(cpu_R[a->rda], cpu_R[a->rda], a->shim); + + return true; +} + +static bool trans_URSHR_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_urshr32_i32); +} + +static bool trans_SRSHR_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_srshr32_i32); +} + +static void gen_mve_sqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) +{ + gen_helper_mve_sqshl(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_SQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_mve_sqshl); +} + +static void gen_mve_uqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) +{ + gen_helper_mve_uqshl(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_mve_uqshl); +} + +static bool do_mve_sh_rr(DisasContext *s, arg_mve_sh_rr *a, ShiftFn *fn) +{ + if 
(!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rda == 13 || a->rda == 15 || a->rm == 13 || a->rm == 15 || + a->rm == a->rda) { + /* These rda/rm cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + /* The helper takes care of the sign-extension of the low 8 bits of Rm */ + fn(cpu_R[a->rda], cpu_env, cpu_R[a->rda], cpu_R[a->rm]); + return true; +} + +static bool trans_SQRSHR_rr(DisasContext *s, arg_mve_sh_rr *a) +{ + return do_mve_sh_rr(s, a, gen_helper_mve_sqrshr); +} + +static bool trans_UQRSHL_rr(DisasContext *s, arg_mve_sh_rr *a) +{ + return do_mve_sh_rr(s, a, gen_helper_mve_uqrshl); +} + +/* + * Multiply and multiply accumulate + */ + +static bool op_mla(DisasContext *s, arg_s_rrrr *a, bool add) +{ + TCGv_i32 t1, t2; + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_mul_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + if (add) { + t2 = load_reg(s, a->ra); + tcg_gen_add_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + } + if (a->s) { + gen_logic_CC(t1); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_MUL(DisasContext *s, arg_MUL *a) +{ + return op_mla(s, a, false); +} + +static bool trans_MLA(DisasContext *s, arg_MLA *a) +{ + return op_mla(s, a, true); +} + +static bool trans_MLS(DisasContext *s, arg_MLS *a) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6T2) { + return false; + } + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_mul_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + t2 = load_reg(s, a->ra); + tcg_gen_sub_i32(t1, t2, t1); + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool op_mlal(DisasContext *s, arg_s_rrrr *a, bool uns, bool add) +{ + TCGv_i32 t0, t1, t2, t3; + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + if (uns) { + tcg_gen_mulu2_i32(t0, t1, t0, t1); + } else { + tcg_gen_muls2_i32(t0, t1, t0, t1); + } + if (add) { + t2 = load_reg(s, a->ra); + t3 = load_reg(s, a->rd); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, t3); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t3); + } + if (a->s) { + gen_logicq_cc(t0, t1); + } + store_reg(s, a->ra, t0); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_UMULL(DisasContext *s, arg_UMULL *a) +{ + return op_mlal(s, a, true, false); +} + +static bool trans_SMULL(DisasContext *s, arg_SMULL *a) +{ + return op_mlal(s, a, false, false); +} + +static bool trans_UMLAL(DisasContext *s, arg_UMLAL *a) +{ + return op_mlal(s, a, true, true); +} + +static bool trans_SMLAL(DisasContext *s, arg_SMLAL *a) +{ + return op_mlal(s, a, false, true); +} + +static bool trans_UMAAL(DisasContext *s, arg_UMAAL *a) +{ + TCGv_i32 t0, t1, t2, zero; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + tcg_gen_mulu2_i32(t0, t1, t0, t1); + zero = tcg_constant_i32(0); + t2 = load_reg(s, a->ra); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); + tcg_temp_free_i32(t2); + t2 = load_reg(s, a->rd); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); + tcg_temp_free_i32(t2); + store_reg(s, a->ra, t0); + store_reg(s, a->rd, t1); + return true; +} + +/* + * Saturating addition and subtraction + */ + +static bool op_qaddsub(DisasContext *s, arg_rrr *a, bool add, bool doub) +{ + TCGv_i32 t0, t1; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + if (doub) { + gen_helper_add_saturate(t1, cpu_env, t1, t1); + } + if (add) { + gen_helper_add_saturate(t0, cpu_env, t0, t1); + } else { + gen_helper_sub_saturate(t0, cpu_env, t0, t1); + } + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +#define DO_QADDSUB(NAME, ADD, DOUB) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_qaddsub(s, a, ADD, DOUB); \ +} + +DO_QADDSUB(QADD, true, false) +DO_QADDSUB(QSUB, false, false) +DO_QADDSUB(QDADD, true, true) +DO_QADDSUB(QDSUB, false, true) + +#undef DO_QADDSUB + +/* + * Halfword multiply and multiply accumulate + */ + +static bool op_smlaxxx(DisasContext *s, arg_rrrr *a, + int add_long, bool nt, bool mt) +{ + TCGv_i32 t0, t1, tl, th; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + gen_mulxy(t0, t1, nt, mt); + tcg_temp_free_i32(t1); + + switch (add_long) { + case 0: + store_reg(s, a->rd, t0); + break; + case 1: + t1 = load_reg(s, a->ra); + gen_helper_add_setq(t0, cpu_env, t0, t1); + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + break; + case 2: + tl = load_reg(s, a->ra); + th = load_reg(s, a->rd); + /* Sign-extend the 32-bit product to 64 bits. */ + t1 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t1, t0, 31); + tcg_gen_add2_i32(tl, th, tl, th, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + store_reg(s, a->ra, tl); + store_reg(s, a->rd, th); + break; + default: + g_assert_not_reached(); + } + return true; +} + +#define DO_SMLAX(NAME, add, nt, mt) \ +static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ +{ \ + return op_smlaxxx(s, a, add, nt, mt); \ +} + +DO_SMLAX(SMULBB, 0, 0, 0) +DO_SMLAX(SMULBT, 0, 0, 1) +DO_SMLAX(SMULTB, 0, 1, 0) +DO_SMLAX(SMULTT, 0, 1, 1) + +DO_SMLAX(SMLABB, 1, 0, 0) +DO_SMLAX(SMLABT, 1, 0, 1) +DO_SMLAX(SMLATB, 1, 1, 0) +DO_SMLAX(SMLATT, 1, 1, 1) + +DO_SMLAX(SMLALBB, 2, 0, 0) +DO_SMLAX(SMLALBT, 2, 0, 1) +DO_SMLAX(SMLALTB, 2, 1, 0) +DO_SMLAX(SMLALTT, 2, 1, 1) + +#undef DO_SMLAX + +static bool op_smlawx(DisasContext *s, arg_rrrr *a, bool add, bool mt) +{ + TCGv_i32 t0, t1; + + if (!ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + /* + * Since the nominal result is product<47:16>, shift the 16-bit + * input up by 16 bits, so that the result is at product<63:32>. + */ + if (mt) { + tcg_gen_andi_i32(t1, t1, 0xffff0000); + } else { + tcg_gen_shli_i32(t1, t1, 16); + } + tcg_gen_muls2_i32(t0, t1, t0, t1); + tcg_temp_free_i32(t0); + if (add) { + t0 = load_reg(s, a->ra); + gen_helper_add_setq(t1, cpu_env, t1, t0); + tcg_temp_free_i32(t0); + } + store_reg(s, a->rd, t1); + return true; +} + +#define DO_SMLAWX(NAME, add, mt) \ +static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ +{ \ + return op_smlawx(s, a, add, mt); \ +} + +DO_SMLAWX(SMULWB, 0, 0) +DO_SMLAWX(SMULWT, 0, 1) +DO_SMLAWX(SMLAWB, 1, 0) +DO_SMLAWX(SMLAWT, 1, 1) + +#undef DO_SMLAWX + +/* + * MSR (immediate) and hints + */ + +static bool trans_YIELD(DisasContext *s, arg_YIELD *a) +{ + /* + * When running single-threaded TCG code, use the helper to ensure that + * the next round-robin scheduled vCPU gets a crack. When running in + * MTTCG we don't generate jumps to the helper as it won't affect the + * scheduling of other vCPUs. 
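+ * In the MTTCG case YIELD therefore generates no code and
+ * behaves as a NOP.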
+ */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_YIELD; + } + return true; +} + +static bool trans_WFE(DisasContext *s, arg_WFE *a) +{ + /* + * When running single-threaded TCG code, use the helper to ensure that + * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we + * just skip this instruction. Currently the SEV/SEVL instructions, + * which are *one* of many ways to wake the CPU from WFE, are not + * implemented so we can't sleep like WFI does. + */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFE; + } + return true; +} + +static bool trans_WFI(DisasContext *s, arg_WFI *a) +{ + /* For WFI, halt the vCPU until an IRQ. */ + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFI; + return true; +} + +static bool trans_ESB(DisasContext *s, arg_ESB *a) +{ + /* + * For M-profile, minimal-RAS ESB can be a NOP. + * Without RAS, we must implement this as NOP. + */ + if (!arm_dc_feature(s, ARM_FEATURE_M) && dc_isar_feature(aa32_ras, s)) { + /* + * QEMU does not have a source of physical SErrors, + * so we are only concerned with virtual SErrors. + * The pseudocode in the ARM for this case is + * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then + * AArch32.vESBOperation(); + * Most of the condition can be evaluated at translation time. + * Test for EL2 present, and defer test for SEL2 to runtime. + */ + if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) { + gen_helper_vesb(cpu_env); + } + } + return true; +} + +static bool trans_NOP(DisasContext *s, arg_NOP *a) +{ + return true; +} + +static bool trans_MSR_imm(DisasContext *s, arg_MSR_imm *a) +{ + uint32_t val = ror32(a->imm, a->rot * 2); + uint32_t mask = msr_mask(s, a->mask, a->r); + + if (gen_set_psr_im(s, mask, a->r, val)) { + unallocated_encoding(s); + } + return true; +} + +/* + * Cyclic Redundancy Check + */ + +static bool op_crc32(DisasContext *s, arg_rrr *a, bool c, MemOp sz) +{ + TCGv_i32 t1, t2, t3; + + if (!dc_isar_feature(aa32_crc32, s)) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + switch (sz) { + case MO_8: + gen_uxtb(t2); + break; + case MO_16: + gen_uxth(t2); + break; + case MO_32: + break; + default: + g_assert_not_reached(); + } + t3 = tcg_constant_i32(1 << sz); + if (c) { + gen_helper_crc32c(t1, t1, t2, t3); + } else { + gen_helper_crc32(t1, t1, t2, t3); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +#define DO_CRC32(NAME, c, sz) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ + { return op_crc32(s, a, c, sz); } + +DO_CRC32(CRC32B, false, MO_8) +DO_CRC32(CRC32H, false, MO_16) +DO_CRC32(CRC32W, false, MO_32) +DO_CRC32(CRC32CB, true, MO_8) +DO_CRC32(CRC32CH, true, MO_16) +DO_CRC32(CRC32CW, true, MO_32) + +#undef DO_CRC32 + +/* + * Miscellaneous instructions + */ + +static bool trans_MRS_bank(DisasContext *s, arg_MRS_bank *a) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_mrs_banked(s, a->r, a->sysm, a->rd); + return true; +} + +static bool trans_MSR_bank(DisasContext *s, arg_MSR_bank *a) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_msr_banked(s, a->r, a->sysm, a->rn); + return true; +} + +static bool trans_MRS_reg(DisasContext *s, arg_MRS_reg *a) +{ + TCGv_i32 tmp; + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (a->r) { + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + tmp = load_cpu_field(spsr); + } 
else { + tmp = tcg_temp_new_i32(); + gen_helper_cpsr_read(tmp, cpu_env); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_MSR_reg(DisasContext *s, arg_MSR_reg *a) +{ + TCGv_i32 tmp; + uint32_t mask = msr_mask(s, a->mask, a->r); + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tmp = load_reg(s, a->rn); + if (gen_set_psr(s, mask, a->r, tmp)) { + unallocated_encoding(s); + } + return true; +} + +static bool trans_MRS_v7m(DisasContext *s, arg_MRS_v7m *a) +{ + TCGv_i32 tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tmp = tcg_temp_new_i32(); + gen_helper_v7m_mrs(tmp, cpu_env, tcg_constant_i32(a->sysm)); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_MSR_v7m(DisasContext *s, arg_MSR_v7m *a) +{ + TCGv_i32 addr, reg; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + addr = tcg_constant_i32((a->mask << 10) | a->sysm); + reg = load_reg(s, a->rn); + gen_helper_v7m_msr(cpu_env, addr, reg); + tcg_temp_free_i32(reg); + /* If we wrote to CONTROL, the EL might have changed */ + gen_rebuild_hflags(s, true); + gen_lookup_tb(s); + return true; +} + +static bool trans_BX(DisasContext *s, arg_BX *a) +{ + if (!ENABLE_ARCH_4T) { + return false; + } + gen_bx_excret(s, load_reg(s, a->rm)); + return true; +} + +static bool trans_BXJ(DisasContext *s, arg_BXJ *a) +{ + if (!ENABLE_ARCH_5J || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + /* + * v7A allows BXJ to be trapped via HSTR.TJDBX. We don't waste a + * TBFLAGS bit on a basically-never-happens case, so call a helper + * function to check for the trap and raise the exception if needed + * (passing it the register number for the syndrome value). + * v8A doesn't have this HSTR bit. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8) && + arm_dc_feature(s, ARM_FEATURE_EL2) && + s->current_el < 2 && s->ns) { + gen_helper_check_bxj_trap(cpu_env, tcg_constant_i32(a->rm)); + } + /* Trivial implementation equivalent to bx. */ + gen_bx(s, load_reg(s, a->rm)); + return true; +} + +static bool trans_BLX_r(DisasContext *s, arg_BLX_r *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_5) { + return false; + } + tmp = load_reg(s, a->rm); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + gen_bx(s, tmp); + return true; +} + +/* + * BXNS/BLXNS: only exist for v8M with the security extensions, + * and always UNDEF if NonSecure. We don't implement these in + * the user-only mode either (in theory you can use them from + * Secure User mode but they are too tied in to system emulation). 
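+ * In a user-only build the IS_USER_ONLY test below therefore
+ * sends them straight to the UNDEF path.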
+ */ +static bool trans_BXNS(DisasContext *s, arg_BXNS *a) +{ + if (!s->v8m_secure || IS_USER_ONLY) { + unallocated_encoding(s); + } else { + gen_bxns(s, a->rm); + } + return true; +} + +static bool trans_BLXNS(DisasContext *s, arg_BLXNS *a) +{ + if (!s->v8m_secure || IS_USER_ONLY) { + unallocated_encoding(s); + } else { + gen_blxns(s, a->rm); + } + return true; +} + +static bool trans_CLZ(DisasContext *s, arg_CLZ *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_5) { + return false; + } + tmp = load_reg(s, a->rm); + tcg_gen_clzi_i32(tmp, tmp, 32); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_ERET(DisasContext *s, arg_ERET *a) +{ + TCGv_i32 tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_V7VE)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + if (s->current_el == 2) { + /* ERET from Hyp uses ELR_Hyp, not LR */ + tmp = load_cpu_field(elr_el[2]); + } else { + tmp = load_reg(s, 14); + } + gen_exception_return(s, tmp); + return true; +} + +static bool trans_HLT(DisasContext *s, arg_HLT *a) +{ + gen_hlt(s, a->imm); + return true; +} + +static bool trans_BKPT(DisasContext *s, arg_BKPT *a) +{ + if (!ENABLE_ARCH_5) { + return false; + } + /* BKPT is OK with ECI set and leaves it untouched */ + s->eci_handled = true; + if (arm_dc_feature(s, ARM_FEATURE_M) && + semihosting_enabled(s->current_el == 0) && + (a->imm == 0xab)) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + } else { + gen_exception_bkpt_insn(s, syn_aa32_bkpt(a->imm, false)); + } + return true; +} + +static bool trans_HVC(DisasContext *s, arg_HVC *a) +{ + if (!ENABLE_ARCH_7 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + } else { + gen_hvc(s, a->imm); + } + return true; +} + +static bool trans_SMC(DisasContext *s, arg_SMC *a) +{ + if (!ENABLE_ARCH_6K || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + } else { + gen_smc(s); + } + return true; +} + +static bool trans_SG(DisasContext *s, arg_SG *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + /* + * SG (v8M only) + * The bulk of the behaviour for this instruction is implemented + * in v7m_handle_execute_nsc(), which deals with the insn when + * it is executed by a CPU in non-secure state from memory + * which is Secure & NonSecure-Callable. + * Here we only need to handle the remaining cases: + * * in NS memory (including the "security extension not + * implemented" case) : NOP + * * in S memory but CPU already secure (clear IT bits) + * We know that the attribute for the memory this insn is + * in must match the current CPU state, because otherwise + * get_phys_addr_pmsav8 would have generated an exception. + */ + if (s->v8m_secure) { + /* Like the IT insn, we don't need to generate any code */ + s->condexec_cond = 0; + s->condexec_mask = 0; + } + return true; +} + +static bool trans_TT(DisasContext *s, arg_TT *a) +{ + TCGv_i32 addr, tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + if (a->rd == 13 || a->rd == 15 || a->rn == 15) { + /* We UNDEF for these UNPREDICTABLE cases */ + unallocated_encoding(s); + return true; + } + if (a->A && !s->v8m_secure) { + /* This case is UNDEFINED. 
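+ * TT with the A bit set (TTA/TTAT) queries the alternate security
+ * state, which Non-secure code must not be able to do.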
*/ + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = tcg_temp_new_i32(); + gen_helper_v7m_tt(tmp, cpu_env, addr, tcg_constant_i32((a->A << 1) | a->T)); + tcg_temp_free_i32(addr); + store_reg(s, a->rd, tmp); + return true; +} + +/* + * Load/store register index + */ + +static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w) +{ + ISSInfo ret; + + /* ISS not valid if writeback */ + if (p && !w) { + ret = rd; + if (curr_insn_len(s) == 2) { + ret |= ISSIs16Bit; + } + } else { + ret = ISSInvalid; + } + return ret; +} + +static TCGv_i32 op_addr_rr_pre(DisasContext *s, arg_ldst_rr *a) +{ + TCGv_i32 addr = load_reg(s, a->rn); + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (a->p) { + TCGv_i32 ofs = load_reg(s, a->rm); + gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); + if (a->u) { + tcg_gen_add_i32(addr, addr, ofs); + } else { + tcg_gen_sub_i32(addr, addr, ofs); + } + tcg_temp_free_i32(ofs); + } + return addr; +} + +static void op_addr_rr_post(DisasContext *s, arg_ldst_rr *a, + TCGv_i32 addr, int address_offset) +{ + if (!a->p) { + TCGv_i32 ofs = load_reg(s, a->rm); + gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); + if (a->u) { + tcg_gen_add_i32(addr, addr, ofs); + } else { + tcg_gen_sub_i32(addr, addr, ofs); + } + tcg_temp_free_i32(ofs); + } else if (!a->w) { + tcg_temp_free_i32(addr); + return; + } + tcg_gen_addi_i32(addr, addr, address_offset); + store_reg(s, a->rn, addr); +} + +static bool op_load_rr(DisasContext *s, arg_ldst_rr *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); + TCGv_i32 addr, tmp; + + addr = op_addr_rr_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + + /* + * Perform base writeback before the loaded value to + * ensure correct behavior with overlapping index registers. + */ + op_addr_rr_post(s, a, addr, 0); + store_reg_from_load(s, a->rt, tmp); + return true; +} + +static bool op_store_rr(DisasContext *s, arg_ldst_rr *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; + TCGv_i32 addr, tmp; + + /* + * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it + * is either UNPREDICTABLE or has defined behaviour + */ + if (s->thumb && a->rn == 15) { + return false; + } + + addr = op_addr_rr_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + tcg_temp_free_i32(tmp); + + op_addr_rr_post(s, a, addr, 0); + return true; +} + +static bool trans_LDRD_rr(DisasContext *s, arg_ldst_rr *a) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_5TE) { + return false; + } + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + addr = op_addr_rr_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt, tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt + 1, tmp); + + /* LDRD w/ base writeback is undefined if the registers overlap. 
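+ * The -4 passed to the post-op undoes the increment between the
+ * two loads, so writeback sees the address the addressing mode
+ * actually computed.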
*/ + op_addr_rr_post(s, a, addr, -4); + return true; +} + +static bool trans_STRD_rr(DisasContext *s, arg_ldst_rr *a) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_5TE) { + return false; + } + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + addr = op_addr_rr_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = load_reg(s, a->rt + 1); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + op_addr_rr_post(s, a, addr, -4); + return true; +} + +/* + * Load/store immediate index + */ + +static TCGv_i32 op_addr_ri_pre(DisasContext *s, arg_ldst_ri *a) +{ + int ofs = a->imm; + + if (!a->u) { + ofs = -ofs; + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Stackcheck. Here we know 'addr' is the current SP; + * U is set if we're moving SP up, else down. It is + * UNKNOWN whether the limit check triggers when SP starts + * below the limit and ends up above it; we chose to do so. + */ + if (!a->u) { + TCGv_i32 newsp = tcg_temp_new_i32(); + tcg_gen_addi_i32(newsp, cpu_R[13], ofs); + gen_helper_v8m_stackcheck(cpu_env, newsp); + tcg_temp_free_i32(newsp); + } else { + gen_helper_v8m_stackcheck(cpu_env, cpu_R[13]); + } + } + + return add_reg_for_lit(s, a->rn, a->p ? ofs : 0); +} + +static void op_addr_ri_post(DisasContext *s, arg_ldst_ri *a, + TCGv_i32 addr, int address_offset) +{ + if (!a->p) { + if (a->u) { + address_offset += a->imm; + } else { + address_offset -= a->imm; + } + } else if (!a->w) { + tcg_temp_free_i32(addr); + return; + } + tcg_gen_addi_i32(addr, addr, address_offset); + store_reg(s, a->rn, addr); +} + +static bool op_load_ri(DisasContext *s, arg_ldst_ri *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + + /* + * Perform base writeback before the loaded value to + * ensure correct behavior with overlapping index registers. + */ + op_addr_ri_post(s, a, addr, 0); + store_reg_from_load(s, a->rt, tmp); + return true; +} + +static bool op_store_ri(DisasContext *s, arg_ldst_ri *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; + TCGv_i32 addr, tmp; + + /* + * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it + * is either UNPREDICTABLE or has defined behaviour + */ + if (s->thumb && a->rn == 15) { + return false; + } + + addr = op_addr_ri_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + tcg_temp_free_i32(tmp); + + op_addr_ri_post(s, a, addr, 0); + return true; +} + +static bool op_ldrd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt, tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, rt2, tmp); + + /* LDRD w/ base writeback is undefined if the registers overlap. 
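+ * As with LDRD (register), the -4 undoes the mid-transfer
+ * increment before any writeback.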
*/ + op_addr_ri_post(s, a, addr, -4); + return true; +} + +static bool trans_LDRD_ri_a32(DisasContext *s, arg_ldst_ri *a) +{ + if (!ENABLE_ARCH_5TE || (a->rt & 1)) { + return false; + } + return op_ldrd_ri(s, a, a->rt + 1); +} + +static bool trans_LDRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) +{ + arg_ldst_ri b = { + .u = a->u, .w = a->w, .p = a->p, + .rn = a->rn, .rt = a->rt, .imm = a->imm + }; + return op_ldrd_ri(s, &b, a->rt2); +} + +static bool op_strd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = load_reg(s, rt2); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + op_addr_ri_post(s, a, addr, -4); + return true; +} + +static bool trans_STRD_ri_a32(DisasContext *s, arg_ldst_ri *a) +{ + if (!ENABLE_ARCH_5TE || (a->rt & 1)) { + return false; + } + return op_strd_ri(s, a, a->rt + 1); +} + +static bool trans_STRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) +{ + arg_ldst_ri b = { + .u = a->u, .w = a->w, .p = a->p, + .rn = a->rn, .rt = a->rt, .imm = a->imm + }; + return op_strd_ri(s, &b, a->rt2); +} + +#define DO_LDST(NAME, WHICH, MEMOP) \ +static bool trans_##NAME##_ri(DisasContext *s, arg_ldst_ri *a) \ +{ \ + return op_##WHICH##_ri(s, a, MEMOP, get_mem_index(s)); \ +} \ +static bool trans_##NAME##T_ri(DisasContext *s, arg_ldst_ri *a) \ +{ \ + return op_##WHICH##_ri(s, a, MEMOP, get_a32_user_mem_index(s)); \ +} \ +static bool trans_##NAME##_rr(DisasContext *s, arg_ldst_rr *a) \ +{ \ + return op_##WHICH##_rr(s, a, MEMOP, get_mem_index(s)); \ +} \ +static bool trans_##NAME##T_rr(DisasContext *s, arg_ldst_rr *a) \ +{ \ + return op_##WHICH##_rr(s, a, MEMOP, get_a32_user_mem_index(s)); \ +} + +DO_LDST(LDR, load, MO_UL) +DO_LDST(LDRB, load, MO_UB) +DO_LDST(LDRH, load, MO_UW) +DO_LDST(LDRSB, load, MO_SB) +DO_LDST(LDRSH, load, MO_SW) + +DO_LDST(STR, store, MO_UL) +DO_LDST(STRB, store, MO_UB) +DO_LDST(STRH, store, MO_UW) + +#undef DO_LDST + +/* + * Synchronization primitives + */ + +static bool op_swp(DisasContext *s, arg_SWP *a, MemOp opc) +{ + TCGv_i32 addr, tmp; + TCGv taddr; + + opc |= s->be_data; + addr = load_reg(s, a->rn); + taddr = gen_aa32_addr(s, addr, opc); + tcg_temp_free_i32(addr); + + tmp = load_reg(s, a->rt2); + tcg_gen_atomic_xchg_i32(tmp, taddr, tmp, get_mem_index(s), opc); + tcg_temp_free(taddr); + + store_reg(s, a->rt, tmp); + return true; +} + +static bool trans_SWP(DisasContext *s, arg_SWP *a) +{ + return op_swp(s, a, MO_UL | MO_ALIGN); +} + +static bool trans_SWPB(DisasContext *s, arg_SWP *a) +{ + return op_swp(s, a, MO_UB); +} + +/* + * Load/Store Exclusive and Load-Acquire/Store-Release + */ + +static bool op_strex(DisasContext *s, arg_STREX *a, MemOp mop, bool rel) +{ + TCGv_i32 addr; + /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ + bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); + + /* We UNDEF for these UNPREDICTABLE cases. 
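+ * These cover PC in any operand position, Rd overlapping Rn, Rt
+ * or (for doubleword) Rt2, and (for pre-v8A Thumb) SP as Rd, Rt
+ * or Rt2.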
*/ + if (a->rd == 15 || a->rn == 15 || a->rt == 15 + || a->rd == a->rn || a->rd == a->rt + || (!v8a && s->thumb && (a->rd == 13 || a->rt == 13)) + || (mop == MO_64 + && (a->rt2 == 15 + || a->rd == a->rt2 + || (!v8a && s->thumb && a->rt2 == 13)))) { + unallocated_encoding(s); + return true; + } + + if (rel) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + + addr = tcg_temp_local_new_i32(); + load_reg_var(s, addr, a->rn); + tcg_gen_addi_i32(addr, addr, a->imm); + + gen_store_exclusive(s, a->rd, a->rt, a->rt2, addr, mop); + tcg_temp_free_i32(addr); + return true; +} + +static bool trans_STREX(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_strex(s, a, MO_32, false); +} + +static bool trans_STREXD_a32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_6K) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_strex(s, a, MO_64, false); +} + +static bool trans_STREXD_t32(DisasContext *s, arg_STREX *a) +{ + return op_strex(s, a, MO_64, false); +} + +static bool trans_STREXB(DisasContext *s, arg_STREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_strex(s, a, MO_8, false); +} + +static bool trans_STREXH(DisasContext *s, arg_STREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_strex(s, a, MO_16, false); +} + +static bool trans_STLEX(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_32, true); +} + +static bool trans_STLEXD_a32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_strex(s, a, MO_64, true); +} + +static bool trans_STLEXD_t32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_64, true); +} + +static bool trans_STLEXB(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_8, true); +} + +static bool trans_STLEXH(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_16, true); +} + +static bool op_stl(DisasContext *s, arg_STL *a, MemOp mop) +{ + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rn == 15 || a->rt == 15) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = load_reg(s, a->rt); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); + disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel | ISSIsWrite); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + return true; +} + +static bool trans_STL(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UL); +} + +static bool trans_STLB(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UB); +} + +static bool trans_STLH(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UW); +} + +static bool op_ldrex(DisasContext *s, arg_LDREX *a, MemOp mop, bool acq) +{ + TCGv_i32 addr; + /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ + bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); + + /* We UNDEF for these UNPREDICTABLE cases. 
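+ * These cover PC as Rn, Rt or Rt2, Rt == Rt2 for the doubleword
+ * forms, and (for pre-v8A Thumb) SP as Rt or Rt2.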
*/ + if (a->rn == 15 || a->rt == 15 + || (!v8a && s->thumb && a->rt == 13) + || (mop == MO_64 + && (a->rt2 == 15 || a->rt == a->rt2 + || (!v8a && s->thumb && a->rt2 == 13)))) { + unallocated_encoding(s); + return true; + } + + addr = tcg_temp_local_new_i32(); + load_reg_var(s, addr, a->rn); + tcg_gen_addi_i32(addr, addr, a->imm); + + gen_load_exclusive(s, a->rt, a->rt2, addr, mop); + tcg_temp_free_i32(addr); + + if (acq) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return true; +} + +static bool trans_LDREX(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_ldrex(s, a, MO_32, false); +} + +static bool trans_LDREXD_a32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_6K) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_ldrex(s, a, MO_64, false); +} + +static bool trans_LDREXD_t32(DisasContext *s, arg_LDREX *a) +{ + return op_ldrex(s, a, MO_64, false); +} + +static bool trans_LDREXB(DisasContext *s, arg_LDREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_ldrex(s, a, MO_8, false); +} + +static bool trans_LDREXH(DisasContext *s, arg_LDREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_ldrex(s, a, MO_16, false); +} + +static bool trans_LDAEX(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_32, true); +} + +static bool trans_LDAEXD_a32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_ldrex(s, a, MO_64, true); +} + +static bool trans_LDAEXD_t32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_64, true); +} + +static bool trans_LDAEXB(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_8, true); +} + +static bool trans_LDAEXH(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_16, true); +} + +static bool op_lda(DisasContext *s, arg_LDA *a, MemOp mop) +{ + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. 
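+ * For LDA* that is just PC as the base or destination register.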
*/ + if (a->rn == 15 || a->rt == 15) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); + disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel); + tcg_temp_free_i32(addr); + + store_reg(s, a->rt, tmp); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + return true; +} + +static bool trans_LDA(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UL); +} + +static bool trans_LDAB(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UB); +} + +static bool trans_LDAH(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UW); +} + +/* + * Media instructions + */ + +static bool trans_USADA8(DisasContext *s, arg_USADA8 *a) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + gen_helper_usad8(t1, t1, t2); + tcg_temp_free_i32(t2); + if (a->ra != 15) { + t2 = load_reg(s, a->ra); + tcg_gen_add_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool op_bfx(DisasContext *s, arg_UBFX *a, bool u) +{ + TCGv_i32 tmp; + int width = a->widthm1 + 1; + int shift = a->lsb; + + if (!ENABLE_ARCH_6T2) { + return false; + } + if (shift + width > 32) { + /* UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + tmp = load_reg(s, a->rn); + if (u) { + tcg_gen_extract_i32(tmp, tmp, shift, width); + } else { + tcg_gen_sextract_i32(tmp, tmp, shift, width); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SBFX(DisasContext *s, arg_SBFX *a) +{ + return op_bfx(s, a, false); +} + +static bool trans_UBFX(DisasContext *s, arg_UBFX *a) +{ + return op_bfx(s, a, true); +} + +static bool trans_BFCI(DisasContext *s, arg_BFCI *a) +{ + TCGv_i32 tmp; + int msb = a->msb, lsb = a->lsb; + int width; + + if (!ENABLE_ARCH_6T2) { + return false; + } + if (msb < lsb) { + /* UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + width = msb + 1 - lsb; + if (a->rn == 15) { + /* BFC */ + tmp = tcg_const_i32(0); + } else { + /* BFI */ + tmp = load_reg(s, a->rn); + } + if (width != 32) { + TCGv_i32 tmp2 = load_reg(s, a->rd); + tcg_gen_deposit_i32(tmp, tmp2, tmp, lsb, width); + tcg_temp_free_i32(tmp2); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_UDF(DisasContext *s, arg_UDF *a) +{ + unallocated_encoding(s); + return true; +} + +/* + * Parallel addition and subtraction + */ + +static bool op_par_addsub(DisasContext *s, arg_rrr *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0, t1; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + + gen(t0, t0, t1); + + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +static bool op_par_addsub_ge(DisasContext *s, arg_rrr *a, + void (*gen)(TCGv_i32, TCGv_i32, + TCGv_i32, TCGv_ptr)) +{ + TCGv_i32 t0, t1; + TCGv_ptr ge; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + + ge = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ge, cpu_env, offsetof(CPUARMState, GE)); + gen(t0, t0, t1, ge); + + tcg_temp_free_ptr(ge); + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +#define DO_PAR_ADDSUB(NAME, helper) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_par_addsub(s, a, helper); \ +} + +#define DO_PAR_ADDSUB_GE(NAME, helper) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_par_addsub_ge(s, a, helper); \ +} + +DO_PAR_ADDSUB_GE(SADD16, gen_helper_sadd16) +DO_PAR_ADDSUB_GE(SASX, gen_helper_saddsubx) +DO_PAR_ADDSUB_GE(SSAX, gen_helper_ssubaddx) +DO_PAR_ADDSUB_GE(SSUB16, gen_helper_ssub16) +DO_PAR_ADDSUB_GE(SADD8, gen_helper_sadd8) +DO_PAR_ADDSUB_GE(SSUB8, gen_helper_ssub8) + +DO_PAR_ADDSUB_GE(UADD16, gen_helper_uadd16) +DO_PAR_ADDSUB_GE(UASX, gen_helper_uaddsubx) +DO_PAR_ADDSUB_GE(USAX, gen_helper_usubaddx) +DO_PAR_ADDSUB_GE(USUB16, gen_helper_usub16) +DO_PAR_ADDSUB_GE(UADD8, gen_helper_uadd8) +DO_PAR_ADDSUB_GE(USUB8, gen_helper_usub8) + +DO_PAR_ADDSUB(QADD16, gen_helper_qadd16) +DO_PAR_ADDSUB(QASX, gen_helper_qaddsubx) +DO_PAR_ADDSUB(QSAX, gen_helper_qsubaddx) +DO_PAR_ADDSUB(QSUB16, gen_helper_qsub16) +DO_PAR_ADDSUB(QADD8, gen_helper_qadd8) +DO_PAR_ADDSUB(QSUB8, gen_helper_qsub8) + +DO_PAR_ADDSUB(UQADD16, gen_helper_uqadd16) +DO_PAR_ADDSUB(UQASX, gen_helper_uqaddsubx) +DO_PAR_ADDSUB(UQSAX, gen_helper_uqsubaddx) +DO_PAR_ADDSUB(UQSUB16, gen_helper_uqsub16) +DO_PAR_ADDSUB(UQADD8, gen_helper_uqadd8) +DO_PAR_ADDSUB(UQSUB8, gen_helper_uqsub8) + +DO_PAR_ADDSUB(SHADD16, gen_helper_shadd16) +DO_PAR_ADDSUB(SHASX, gen_helper_shaddsubx) +DO_PAR_ADDSUB(SHSAX, gen_helper_shsubaddx) +DO_PAR_ADDSUB(SHSUB16, gen_helper_shsub16) +DO_PAR_ADDSUB(SHADD8, gen_helper_shadd8) +DO_PAR_ADDSUB(SHSUB8, gen_helper_shsub8) + +DO_PAR_ADDSUB(UHADD16, gen_helper_uhadd16) +DO_PAR_ADDSUB(UHASX, gen_helper_uhaddsubx) +DO_PAR_ADDSUB(UHSAX, gen_helper_uhsubaddx) +DO_PAR_ADDSUB(UHSUB16, gen_helper_uhsub16) +DO_PAR_ADDSUB(UHADD8, gen_helper_uhadd8) +DO_PAR_ADDSUB(UHSUB8, gen_helper_uhsub8) + +#undef DO_PAR_ADDSUB +#undef DO_PAR_ADDSUB_GE + +/* + * Packing, unpacking, saturation, and reversal + */ + +static bool trans_PKH(DisasContext *s, arg_PKH *a) +{ + TCGv_i32 tn, tm; + int shift = a->imm; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + tn = load_reg(s, a->rn); + tm = load_reg(s, a->rm); + if (a->tb) { + /* PKHTB */ + if (shift == 0) { + shift = 31; + } + tcg_gen_sari_i32(tm, tm, shift); + tcg_gen_deposit_i32(tn, tn, tm, 0, 16); + } else { + /* PKHBT */ + tcg_gen_shli_i32(tm, tm, shift); + tcg_gen_deposit_i32(tn, tm, tn, 0, 16); + } + tcg_temp_free_i32(tm); + store_reg(s, a->rd, tn); + return true; +} + +static bool op_sat(DisasContext *s, arg_sat *a, + void (*gen)(TCGv_i32, TCGv_env, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + int shift = a->imm; + + if (!ENABLE_ARCH_6) { + return false; + } + + tmp = load_reg(s, a->rn); + if (a->sh) { + tcg_gen_sari_i32(tmp, tmp, shift ? 
shift : 31); + } else { + tcg_gen_shli_i32(tmp, tmp, shift); + } + + gen(tmp, cpu_env, tmp, tcg_constant_i32(a->satimm)); + + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SSAT(DisasContext *s, arg_sat *a) +{ + return op_sat(s, a, gen_helper_ssat); +} + +static bool trans_USAT(DisasContext *s, arg_sat *a) +{ + return op_sat(s, a, gen_helper_usat); +} + +static bool trans_SSAT16(DisasContext *s, arg_sat *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_sat(s, a, gen_helper_ssat16); +} + +static bool trans_USAT16(DisasContext *s, arg_sat *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_sat(s, a, gen_helper_usat16); +} + +static bool op_xta(DisasContext *s, arg_rrr_rot *a, + void (*gen_extract)(TCGv_i32, TCGv_i32), + void (*gen_add)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_6) { + return false; + } + + tmp = load_reg(s, a->rm); + /* + * TODO: In many cases we could do a shift instead of a rotate. + * Combined with a simple extend, that becomes an extract. + */ + tcg_gen_rotri_i32(tmp, tmp, a->rot * 8); + gen_extract(tmp, tmp); + + if (a->rn != 15) { + TCGv_i32 tmp2 = load_reg(s, a->rn); + gen_add(tmp, tmp, tmp2); + tcg_temp_free_i32(tmp2); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SXTAB(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext8s_i32, tcg_gen_add_i32); +} + +static bool trans_SXTAH(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext16s_i32, tcg_gen_add_i32); +} + +static bool trans_SXTAB16(DisasContext *s, arg_rrr_rot *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_xta(s, a, gen_helper_sxtb16, gen_add16); +} + +static bool trans_UXTAB(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext8u_i32, tcg_gen_add_i32); +} + +static bool trans_UXTAH(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext16u_i32, tcg_gen_add_i32); +} + +static bool trans_UXTAB16(DisasContext *s, arg_rrr_rot *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_xta(s, a, gen_helper_uxtb16, gen_add16); +} + +static bool trans_SEL(DisasContext *s, arg_rrr *a) +{ + TCGv_i32 t1, t2, t3; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + t3 = tcg_temp_new_i32(); + tcg_gen_ld_i32(t3, cpu_env, offsetof(CPUARMState, GE)); + gen_helper_sel_flags(t1, t3, t1, t2); + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool op_rr(DisasContext *s, arg_rr *a, + void (*gen)(TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + + tmp = load_reg(s, a->rm); + gen(tmp, tmp); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_REV(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, tcg_gen_bswap32_i32); +} + +static bool trans_REV16(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, gen_rev16); +} + +static bool trans_REVSH(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, gen_revsh); +} + +static bool trans_RBIT(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6T2) { + return false; + } + return op_rr(s, a, gen_helper_rbit); +} + +/* + * Signed multiply, signed and unsigned divide + */ + +static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (m_swap) { + gen_swap_half(t2, t2); + } + gen_smul_dual(t1, t2); + + if (sub) { + /* + * This subtraction cannot overflow, so we can do a simple + * 32-bit subtraction and then a possible 32-bit saturating + * addition of Ra. + */ + tcg_gen_sub_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + + if (a->ra != 15) { + t2 = load_reg(s, a->ra); + gen_helper_add_setq(t1, cpu_env, t1, t2); + tcg_temp_free_i32(t2); + } + } else if (a->ra == 15) { + /* Single saturation-checking addition */ + gen_helper_add_setq(t1, cpu_env, t1, t2); + tcg_temp_free_i32(t2); + } else { + /* + * We need to add the products and Ra together and then + * determine whether the final result overflowed. Doing + * this as two separate add-and-check-overflow steps incorrectly + * sets Q for cases like (-32768 * -32768) + (-32768 * -32768) + -1. + * Do all the arithmetic at 64-bits and then check for overflow. + */ + TCGv_i64 p64, q64; + TCGv_i32 t3, qf, one; + + p64 = tcg_temp_new_i64(); + q64 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(p64, t1); + tcg_gen_ext_i32_i64(q64, t2); + tcg_gen_add_i64(p64, p64, q64); + load_reg_var(s, t2, a->ra); + tcg_gen_ext_i32_i64(q64, t2); + tcg_gen_add_i64(p64, p64, q64); + tcg_temp_free_i64(q64); + + tcg_gen_extr_i64_i32(t1, t2, p64); + tcg_temp_free_i64(p64); + /* + * t1 is the low half of the result which goes into Rd. + * We have overflow and must set Q if the high half (t2) + * is different from the sign-extension of t1. 
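+ * The movcond below implements exactly that: it sets QF to 1 when
+ * t2 differs from t1's sign bit replicated across 32 bits, and
+ * leaves QF unchanged otherwise.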
+ */ + t3 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t3, t1, 31); + qf = load_cpu_field(QF); + one = tcg_constant_i32(1); + tcg_gen_movcond_i32(TCG_COND_NE, qf, t2, t3, one, qf); + store_cpu_field(qf, QF); + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SMLAD(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, false, false); +} + +static bool trans_SMLADX(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, true, false); +} + +static bool trans_SMLSD(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, false, true); +} + +static bool trans_SMLSDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, true, true); +} + +static bool op_smlald(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) +{ + TCGv_i32 t1, t2; + TCGv_i64 l1, l2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (m_swap) { + gen_swap_half(t2, t2); + } + gen_smul_dual(t1, t2); + + l1 = tcg_temp_new_i64(); + l2 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(l1, t1); + tcg_gen_ext_i32_i64(l2, t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t2); + + if (sub) { + tcg_gen_sub_i64(l1, l1, l2); + } else { + tcg_gen_add_i64(l1, l1, l2); + } + tcg_temp_free_i64(l2); + + gen_addq(s, l1, a->ra, a->rd); + gen_storeq_reg(s, a->ra, a->rd, l1); + tcg_temp_free_i64(l1); + return true; +} + +static bool trans_SMLALD(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, false, false); +} + +static bool trans_SMLALDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, true, false); +} + +static bool trans_SMLSLD(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, false, true); +} + +static bool trans_SMLSLDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, true, true); +} + +static bool op_smmla(DisasContext *s, arg_rrrr *a, bool round, bool sub) +{ + TCGv_i32 t1, t2; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_muls2_i32(t2, t1, t1, t2); + + if (a->ra != 15) { + TCGv_i32 t3 = load_reg(s, a->ra); + if (sub) { + /* + * For SMMLS, we need a 64-bit subtract. Borrow caused by + * a non-zero multiplicand lowpart, and the correct result + * lowpart for rounding. + */ + tcg_gen_sub2_i32(t2, t1, tcg_constant_i32(0), t3, t2, t1); + } else { + tcg_gen_add_i32(t1, t1, t3); + } + tcg_temp_free_i32(t3); + } + if (round) { + /* + * Adding 0x80000000 to the 64-bit quantity means that we have + * carry in to the high word when the low word has the msb set. + */ + tcg_gen_shri_i32(t2, t2, 31); + tcg_gen_add_i32(t1, t1, t2); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SMMLA(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, false, false); +} + +static bool trans_SMMLAR(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, true, false); +} + +static bool trans_SMMLS(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, false, true); +} + +static bool trans_SMMLSR(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, true, true); +} + +static bool op_div(DisasContext *s, arg_rrr *a, bool u) +{ + TCGv_i32 t1, t2; + + if (s->thumb + ? 
!dc_isar_feature(aa32_thumb_div, s) + : !dc_isar_feature(aa32_arm_div, s)) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (u) { + gen_helper_udiv(t1, cpu_env, t1, t2); + } else { + gen_helper_sdiv(t1, cpu_env, t1, t2); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SDIV(DisasContext *s, arg_rrr *a) +{ + return op_div(s, a, false); +} + +static bool trans_UDIV(DisasContext *s, arg_rrr *a) +{ + return op_div(s, a, true); +} + +/* + * Block data transfer + */ + +static TCGv_i32 op_addr_block_pre(DisasContext *s, arg_ldst_block *a, int n) +{ + TCGv_i32 addr = load_reg(s, a->rn); + + if (a->b) { + if (a->i) { + /* pre increment */ + tcg_gen_addi_i32(addr, addr, 4); + } else { + /* pre decrement */ + tcg_gen_addi_i32(addr, addr, -(n * 4)); + } + } else if (!a->i && n != 1) { + /* post decrement */ + tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * If the writeback is incrementing SP rather than + * decrementing it, and the initial SP is below the + * stack limit but the final written-back SP would + * be above, then we must not perform any memory + * accesses, but it is IMPDEF whether we generate + * an exception. We choose to do so in this case. + * At this point 'addr' is the lowest address, so + * either the original SP (if incrementing) or our + * final SP (if decrementing), so that's what we check. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + return addr; +} + +static void op_addr_block_post(DisasContext *s, arg_ldst_block *a, + TCGv_i32 addr, int n) +{ + if (a->w) { + /* write back */ + if (!a->b) { + if (a->i) { + /* post increment */ + tcg_gen_addi_i32(addr, addr, 4); + } else { + /* post decrement */ + tcg_gen_addi_i32(addr, addr, -(n * 4)); + } + } else if (!a->i && n != 1) { + /* pre decrement */ + tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n) +{ + int i, j, n, list, mem_idx; + bool user = a->u; + TCGv_i32 addr, tmp; + + if (user) { + /* STM (user) */ + if (IS_USER(s)) { + /* Only usable in supervisor mode. */ + unallocated_encoding(s); + return true; + } + } + + list = a->list; + n = ctpop16(list); + if (n < min_n || a->rn == 15) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + addr = op_addr_block_pre(s, a, n); + mem_idx = get_mem_index(s); + + for (i = j = 0; i < 16; i++) { + if (!(list & (1 << i))) { + continue; + } + + if (user && i != 15) { + tmp = tcg_temp_new_i32(); + gen_helper_get_user_reg(tmp, cpu_env, tcg_constant_i32(i)); + } else { + tmp = load_reg(s, i); + } + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + /* No need to add after the last transfer. */ + if (++j != n) { + tcg_gen_addi_i32(addr, addr, 4); + } + } + + op_addr_block_post(s, a, addr, n); + clear_eci_state(s); + return true; +} + +static bool trans_STM(DisasContext *s, arg_ldst_block *a) +{ + /* BitCount(list) < 1 is UNPREDICTABLE */ + return op_stm(s, a, 1); +} + +static bool trans_STM_t32(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback register in register list is UNPREDICTABLE for T32. 
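+ * As for LDM below, we take the option of making it UNDEF.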
*/ + if (a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 2 is UNPREDICTABLE */ + return op_stm(s, a, 2); +} + +static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) +{ + int i, j, n, list, mem_idx; + bool loaded_base; + bool user = a->u; + bool exc_return = false; + TCGv_i32 addr, tmp, loaded_var; + + if (user) { + /* LDM (user), LDM (exception return) */ + if (IS_USER(s)) { + /* Only usable in supervisor mode. */ + unallocated_encoding(s); + return true; + } + if (extract32(a->list, 15, 1)) { + exc_return = true; + user = false; + } else { + /* LDM (user) does not allow writeback. */ + if (a->w) { + unallocated_encoding(s); + return true; + } + } + } + + list = a->list; + n = ctpop16(list); + if (n < min_n || a->rn == 15) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + addr = op_addr_block_pre(s, a, n); + mem_idx = get_mem_index(s); + loaded_base = false; + loaded_var = NULL; + + for (i = j = 0; i < 16; i++) { + if (!(list & (1 << i))) { + continue; + } + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + if (user) { + gen_helper_set_user_reg(cpu_env, tcg_constant_i32(i), tmp); + tcg_temp_free_i32(tmp); + } else if (i == a->rn) { + loaded_var = tmp; + loaded_base = true; + } else if (i == 15 && exc_return) { + store_pc_exc_ret(s, tmp); + } else { + store_reg_from_load(s, i, tmp); + } + + /* No need to add after the last transfer. */ + if (++j != n) { + tcg_gen_addi_i32(addr, addr, 4); + } + } + + op_addr_block_post(s, a, addr, n); + + if (loaded_base) { + /* Note that we reject base == pc above. */ + store_reg(s, a->rn, loaded_var); + } + + if (exc_return) { + /* Restore CPSR from SPSR. */ + tmp = load_cpu_field(spsr); + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + gen_helper_cpsr_write_eret(cpu_env, tmp); + tcg_temp_free_i32(tmp); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; + } + clear_eci_state(s); + return true; +} + +static bool trans_LDM_a32(DisasContext *s, arg_ldst_block *a) +{ + /* + * Writeback register in register list is UNPREDICTABLE + * for ArchVersion() >= 7. Prior to v7, A32 would write + * an UNKNOWN value to the base register. + */ + if (ENABLE_ARCH_7 && a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 1 is UNPREDICTABLE */ + return do_ldm(s, a, 1); +} + +static bool trans_LDM_t32(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback register in register list is UNPREDICTABLE for T32. */ + if (a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 2 is UNPREDICTABLE */ + return do_ldm(s, a, 2); +} + +static bool trans_LDM_t16(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback is conditional on the base register not being loaded. 
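+ * T16 LDM has no W bit, so synthesize a->w: the insn always
+ * writes back unless Rn is also in the register list.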
*/ + a->w = !(a->list & (1 << a->rn)); + /* BitCount(list) < 1 is UNPREDICTABLE */ + return do_ldm(s, a, 1); +} + +static bool trans_CLRM(DisasContext *s, arg_CLRM *a) +{ + int i; + TCGv_i32 zero; + + if (!dc_isar_feature(aa32_m_sec_state, s)) { + return false; + } + + if (extract32(a->list, 13, 1)) { + return false; + } + + if (!a->list) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + + s->eci_handled = true; + + zero = tcg_constant_i32(0); + for (i = 0; i < 15; i++) { + if (extract32(a->list, i, 1)) { + /* Clear R[i] */ + tcg_gen_mov_i32(cpu_R[i], zero); + } + } + if (extract32(a->list, 15, 1)) { + /* + * Clear APSR (by calling the MSR helper with the same argument + * as for "MSR APSR_nzcvqg, Rn": mask = 0b1100, SYSM=0) + */ + gen_helper_v7m_msr(cpu_env, tcg_constant_i32(0xc00), zero); + } + clear_eci_state(s); + return true; +} + +/* + * Branch, branch with link + */ + +static bool trans_B(DisasContext *s, arg_i *a) +{ + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_B_cond_thumb(DisasContext *s, arg_ci *a) +{ + /* This has cond from encoding, required to be outside IT block. */ + if (a->cond >= 0xe) { + return false; + } + if (s->condexec_mask) { + unallocated_encoding(s); + return true; + } + arm_skip_unless(s, a->cond); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_BL(DisasContext *s, arg_i *a) +{ + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_BLX_i(DisasContext *s, arg_BLX_i *a) +{ + /* + * BLX <imm> would be useless on M-profile; the encoding space + * is used for other insns from v8.1M onward, and UNDEFs before that. + */ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + + /* For A32, ARM_FEATURE_V5 is checked near the start of the uncond block. */ + if (s->thumb && (a->imm & 2)) { + return false; + } + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + store_cpu_field_constant(!s->thumb, thumb); + /* This jump is computed from an aligned PC: subtract off the low bits. */ + gen_jmp(s, jmp_diff(s, a->imm - (s->pc_curr & 3))); + return true; +} + +static bool trans_BL_BLX_prefix(DisasContext *s, arg_BL_BLX_prefix *a) +{ + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + gen_pc_plus_diff(s, cpu_R[14], jmp_diff(s, a->imm << 12)); + return true; +} + +static bool trans_BL_suffix(DisasContext *s, arg_BL_suffix *a) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + tcg_gen_addi_i32(tmp, cpu_R[14], (a->imm << 1) | 1); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); + gen_bx(s, tmp); + return true; +} + +static bool trans_BLX_suffix(DisasContext *s, arg_BLX_suffix *a) +{ + TCGv_i32 tmp; + + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + if (!ENABLE_ARCH_5) { + return false; + } + tmp = tcg_temp_new_i32(); + tcg_gen_addi_i32(tmp, cpu_R[14], a->imm << 1); + tcg_gen_andi_i32(tmp, tmp, 0xfffffffc); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); + gen_bx(s, tmp); + return true; +} + +static bool trans_BF(DisasContext *s, arg_BF *a) +{ + /* + * M-profile branch future insns. The architecture permits an + * implementation to implement these as NOPs (equivalent to + * discarding the LO_BRANCH_INFO cache immediately), and we + * take that IMPDEF option because for QEMU a "real" implementation + * would be complicated and wouldn't execute any faster. 
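+ * All that remains below is the mandatory decode checking.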
+ */ + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->boff == 0) { + /* SEE "Related encodings" (loop insns) */ + return false; + } + /* Handle as NOP */ + return true; +} + +static bool trans_DLS(DisasContext *s, arg_DLS *a) +{ + /* M-profile low-overhead loop start */ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->rn == 13 || a->rn == 15) { + /* + * For DLSTP rn == 15 is a related encoding (LCTP); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ + return false; + } + + if (a->size != 4) { + /* DLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + return true; + } + } + + /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */ + tmp = load_reg(s, a->rn); + store_reg(s, 14, tmp); + if (a->size != 4) { + /* DLSTP: set FPSCR.LTPSIZE */ + store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } + return true; +} + +static bool trans_WLS(DisasContext *s, arg_WLS *a) +{ + /* M-profile low-overhead while-loop start */ + TCGv_i32 tmp; + DisasLabel nextlabel; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->rn == 13 || a->rn == 15) { + /* + * For WLSTP rn == 15 is a related encoding (LE); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ + return false; + } + if (s->condexec_mask) { + /* + * WLS in an IT block is CONSTRAINED UNPREDICTABLE; + * we choose to UNDEF, because otherwise our use of + * gen_goto_tb(1) would clash with the use of TB exit 1 + * in the dc->condjmp condition-failed codepath in + * arm_tr_tb_stop() and we'd get an assertion. + */ + return false; + } + if (a->size != 4) { + /* WLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + /* + * We need to check that the FPU is enabled here, but mustn't + * call vfp_access_check() to do that because we don't want to + * do the lazy state preservation in the "loop count is zero" case. + * Do the check-and-raise-exception by hand. + */ + if (s->fp_excp_el) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + } + + nextlabel = gen_disas_label(s); + tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel.label); + tmp = load_reg(s, a->rn); + store_reg(s, 14, tmp); + if (a->size != 4) { + /* + * WLSTP: set FPSCR.LTPSIZE. This requires that we do the + * lazy state preservation, new FP context creation, etc, + * that vfp_access_check() does. We know that the actual + * access check will succeed (ie it won't generate code that + * throws an exception) because we did that check by hand earlier. + */ + bool ok = vfp_access_check(s); + assert(ok); + store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); + /* + * LTPSIZE updated, but MVE_NO_PRED will always be the same thing (0) + * when we take this upcoming exit from this TB, so gen_jmp_tb() is OK. + */ + } + gen_jmp_tb(s, curr_insn_len(s), 1); + + set_disas_label(s, nextlabel); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_LE(DisasContext *s, arg_LE *a) +{ + /* + * M-profile low-overhead loop end. 
The architecture permits an + * implementation to discard the LO_BRANCH_INFO cache at any time, + * and we take the IMPDEF option to never set it in the first place + * (equivalent to always discarding it immediately), because for QEMU + * a "real" implementation would be complicated and wouldn't execute + * any faster. + */ + TCGv_i32 tmp; + DisasLabel loopend; + bool fpu_active; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->f && a->tp) { + return false; + } + if (s->condexec_mask) { + /* + * LE in an IT block is CONSTRAINED UNPREDICTABLE; + * we choose to UNDEF, because otherwise our use of + * gen_goto_tb(1) would clash with the use of TB exit 1 + * in the dc->condjmp condition-failed codepath in + * arm_tr_tb_stop() and we'd get an assertion. + */ + return false; + } + if (a->tp) { + /* LETP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + s->eci_handled = true; + return true; + } + } + + /* LE/LETP is OK with ECI set and leaves it untouched */ + s->eci_handled = true; + + /* + * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE + * UsageFault exception for the LE insn in that case. Note that we + * are not directly checking FPSCR.LTPSIZE but instead check the + * pseudocode LTPSIZE() function, which returns 4 if the FPU is + * not currently active (ie ActiveFPState() returns false). We + * can identify not-active purely from our TB state flags, as the + * FPU is active only if: + * the FPU is enabled + * AND lazy state preservation is not active + * AND we do not need a new fp context (this is the ASPEN/FPCA check) + * + * Usually we don't need to care about this distinction between + * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check() + * will either take an exception or clear the conditions that make + * the FPU not active. But LE is an unusual case of a non-FP insn + * that looks at LTPSIZE. + */ + fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed; + + if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) { + /* Need to do a runtime check for LTPSIZE != 4 */ + DisasLabel skipexc = gen_disas_label(s); + tmp = load_cpu_field(v7m.ltpsize); + tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc.label); + tcg_temp_free_i32(tmp); + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + set_disas_label(s, skipexc); + } + + if (a->f) { + /* Loop-forever: just jump back to the loop start */ + gen_jmp(s, jmp_diff(s, -a->imm)); + return true; + } + + /* + * Not loop-forever. If LR <= loop-decrement-value this is the last loop. + * For LE, we know at this point that LTPSIZE must be 4 and the + * loop decrement value is 1. For LETP we need to calculate the decrement + * value from LTPSIZE. + */ + loopend = gen_disas_label(s); + if (!a->tp) { + tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend.label); + tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1); + } else { + /* + * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local + * so that decr stays live after the brcondi. 
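+ * (An ordinary temp would not keep its value across the branch;
+ * a local temp does.)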
+ */ + TCGv_i32 decr = tcg_temp_local_new_i32(); + TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize); + tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize); + tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr); + tcg_temp_free_i32(ltpsize); + + tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend.label); + + tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr); + tcg_temp_free_i32(decr); + } + /* Jump back to the loop start */ + gen_jmp(s, jmp_diff(s, -a->imm)); + + set_disas_label(s, loopend); + if (a->tp) { + /* Exits from tail-pred loops must reset LTPSIZE to 4 */ + store_cpu_field(tcg_constant_i32(4), v7m.ltpsize); + } + /* End TB, continuing to following insn */ + gen_jmp_tb(s, curr_insn_len(s), 1); + return true; +} + +static bool trans_LCTP(DisasContext *s, arg_LCTP *a) +{ + /* + * M-profile Loop Clear with Tail Predication. Since our implementation + * doesn't cache branch information, all we need to do is reset + * FPSCR.LTPSIZE to 4. + */ + + if (!dc_isar_feature(aa32_lob, s) || + !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + store_cpu_field_constant(4, v7m.ltpsize); + return true; +} + +static bool trans_VCTP(DisasContext *s, arg_VCTP *a) +{ + /* + * M-profile Create Vector Tail Predicate. This insn is itself + * predicated and is subject to beatwise execution. + */ + TCGv_i32 rn_shifted, masklen; + + if (!dc_isar_feature(aa32_mve, s) || a->rn == 13 || a->rn == 15) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * We pre-calculate the mask length here to avoid having + * to have multiple helpers specialized for size. + * We pass the helper "rn <= (1 << (4 - size)) ? (rn << size) : 16". + */ + rn_shifted = tcg_temp_new_i32(); + masklen = load_reg(s, a->rn); + tcg_gen_shli_i32(rn_shifted, masklen, a->size); + tcg_gen_movcond_i32(TCG_COND_LEU, masklen, + masklen, tcg_constant_i32(1 << (4 - a->size)), + rn_shifted, tcg_constant_i32(16)); + gen_helper_mve_vctp(cpu_env, masklen); + tcg_temp_free_i32(masklen); + tcg_temp_free_i32(rn_shifted); + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half) +{ + TCGv_i32 addr, tmp; + + tmp = load_reg(s, a->rm); + if (half) { + tcg_gen_add_i32(tmp, tmp, tmp); + } + addr = load_reg(s, a->rn); + tcg_gen_add_i32(addr, addr, tmp); + + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), half ? MO_UW : MO_UB); + + tcg_gen_add_i32(tmp, tmp, tmp); + gen_pc_plus_diff(s, addr, jmp_diff(s, 0)); + tcg_gen_add_i32(tmp, tmp, addr); + tcg_temp_free_i32(addr); + store_reg(s, 15, tmp); + return true; +} + +static bool trans_TBB(DisasContext *s, arg_tbranch *a) +{ + return op_tbranch(s, a, false); +} + +static bool trans_TBH(DisasContext *s, arg_tbranch *a) +{ + return op_tbranch(s, a, true); +} + +static bool trans_CBZ(DisasContext *s, arg_CBZ *a) +{ + TCGv_i32 tmp = load_reg(s, a->rn); + + arm_gen_condlabel(s); + tcg_gen_brcondi_i32(a->nz ? TCG_COND_EQ : TCG_COND_NE, + tmp, 0, s->condlabel.label); + tcg_temp_free_i32(tmp); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +/* + * Supervisor call - both T32 & A32 come here so we need to check + * which mode we are in when checking for semihosting. + */ + +static bool trans_SVC(DisasContext *s, arg_SVC *a) +{ + const uint32_t semihost_imm = s->thumb ? 
0xab : 0x123456; + + if (!arm_dc_feature(s, ARM_FEATURE_M) && + semihosting_enabled(s->current_el == 0) && + (a->imm == semihost_imm)) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + } else { + if (s->fgt_svc) { + uint32_t syndrome = syn_aa32_svc(a->imm, s->thumb); + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); + } else { + gen_update_pc(s, curr_insn_len(s)); + s->svc_imm = a->imm; + s->base.is_jmp = DISAS_SWI; + } + } + return true; +} + +/* + * Unconditional system instructions + */ + +static bool trans_RFE(DisasContext *s, arg_RFE *a) +{ + static const int8_t pre_offset[4] = { + /* DA */ -4, /* IA */ 0, /* DB */ -8, /* IB */ 4 + }; + static const int8_t post_offset[4] = { + /* DA */ -8, /* IA */ 4, /* DB */ -4, /* IB */ 0 + }; + TCGv_i32 addr, t1, t2; + + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tcg_gen_addi_i32(addr, addr, pre_offset[a->pu]); + + /* Load PC into tmp and CPSR into tmp2. */ + t1 = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, t1, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_gen_addi_i32(addr, addr, 4); + t2 = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, t2, addr, get_mem_index(s), MO_UL | MO_ALIGN); + + if (a->w) { + /* Base writeback. */ + tcg_gen_addi_i32(addr, addr, post_offset[a->pu]); + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + gen_rfe(s, t1, t2); + return true; +} + +static bool trans_SRS(DisasContext *s, arg_SRS *a) +{ + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_srs(s, a->mode, a->pu, a->w); + return true; +} + +static bool trans_CPS(DisasContext *s, arg_CPS *a) +{ + uint32_t mask, val; + + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + /* Implemented as NOP in user mode. */ + return true; + } + /* TODO: There are quite a lot of UNPREDICTABLE argument combinations. */ + + mask = val = 0; + if (a->imod & 2) { + if (a->A) { + mask |= CPSR_A; + } + if (a->I) { + mask |= CPSR_I; + } + if (a->F) { + mask |= CPSR_F; + } + if (a->imod & 1) { + val |= mask; + } + } + if (a->M) { + mask |= CPSR_M; + val |= a->mode; + } + if (mask) { + gen_set_psr_im(s, mask, 0, val); + } + return true; +} + +static bool trans_CPS_v7m(DisasContext *s, arg_CPS_v7m *a) +{ + TCGv_i32 tmp, addr; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + /* Implemented as NOP in user mode. */ + return true; + } + + tmp = tcg_constant_i32(a->im); + /* FAULTMASK */ + if (a->F) { + addr = tcg_constant_i32(19); + gen_helper_v7m_msr(cpu_env, addr, tmp); + } + /* PRIMASK */ + if (a->I) { + addr = tcg_constant_i32(16); + gen_helper_v7m_msr(cpu_env, addr, tmp); + } + gen_rebuild_hflags(s, false); + gen_lookup_tb(s); + return true; +} + +/* + * Clear-Exclusive, Barriers + */ + +static bool trans_CLREX(DisasContext *s, arg_CLREX *a) +{ + if (s->thumb + ? 
!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M) + : !ENABLE_ARCH_6K) { + return false; + } + gen_clrex(s); + return true; +} + +static bool trans_DSB(DisasContext *s, arg_DSB *a) +{ + if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); + return true; +} + +static bool trans_DMB(DisasContext *s, arg_DMB *a) +{ + return trans_DSB(s, NULL); +} + +static bool trans_ISB(DisasContext *s, arg_ISB *a) +{ + if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + /* + * We need to break the TB after this insn to execute + * self-modifying code correctly and also to take + * any pending interrupts immediately. + */ + s->base.is_jmp = DISAS_TOO_MANY; + return true; +} + +static bool trans_SB(DisasContext *s, arg_SB *a) +{ + if (!dc_isar_feature(aa32_sb, s)) { + return false; + } + /* + * TODO: There is no speculation barrier opcode + * for TCG; MB and end the TB instead. + */ + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); + s->base.is_jmp = DISAS_TOO_MANY; + return true; +} + +static bool trans_SETEND(DisasContext *s, arg_SETEND *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + if (a->E != (s->be_data == MO_BE)) { + gen_helper_setend(cpu_env); + s->base.is_jmp = DISAS_UPDATE_EXIT; + } + return true; +} + +/* + * Preload instructions + * All are nops, contingent on the appropriate arch level. + */ + +static bool trans_PLD(DisasContext *s, arg_PLD *a) +{ + return ENABLE_ARCH_5TE; +} + +static bool trans_PLDW(DisasContext *s, arg_PLD *a) +{ + return arm_dc_feature(s, ARM_FEATURE_V7MP); +} + +static bool trans_PLI(DisasContext *s, arg_PLD *a) +{ + return ENABLE_ARCH_7; +} + +/* + * If-then + */ + +static bool trans_IT(DisasContext *s, arg_IT *a) +{ + int cond_mask = a->cond_mask; + + /* + * No actual code generated for this insn, just setup state. + * + * Combinations of firstcond and mask which set up an 0b1111 + * condition are UNPREDICTABLE; we take the CONSTRAINED + * UNPREDICTABLE choice to treat 0b1111 the same as 0b1110, + * i.e. both meaning "execute always". + */ + s->condexec_cond = (cond_mask >> 4) & 0xe; + s->condexec_mask = cond_mask & 0x1f; + return true; +} + +/* v8.1M CSEL/CSINC/CSNEG/CSINV */ +static bool trans_CSEL(DisasContext *s, arg_CSEL *a) +{ + TCGv_i32 rn, rm, zero; + DisasCompare c; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + + if (a->rm == 13) { + /* SEE "Related encodings" (MVE shifts) */ + return false; + } + + if (a->rd == 13 || a->rd == 15 || a->rn == 13 || a->fcond >= 14) { + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + return false; + } + + /* In this insn input reg fields of 0b1111 mean "zero", not "PC" */ + zero = tcg_constant_i32(0); + if (a->rn == 15) { + rn = zero; + } else { + rn = load_reg(s, a->rn); + } + if (a->rm == 15) { + rm = zero; + } else { + rm = load_reg(s, a->rm); + } + + switch (a->op) { + case 0: /* CSEL */ + break; + case 1: /* CSINC */ + tcg_gen_addi_i32(rm, rm, 1); + break; + case 2: /* CSINV */ + tcg_gen_not_i32(rm, rm); + break; + case 3: /* CSNEG */ + tcg_gen_neg_i32(rm, rm); + break; + default: + g_assert_not_reached(); + } + + arm_test_cc(&c, a->fcond); + tcg_gen_movcond_i32(c.cond, rn, c.value, zero, rn, rm); + arm_free_cc(&c); + + store_reg(s, a->rd, rn); + tcg_temp_free_i32(rm); + + return true; +} + +/* + * Legacy decoder. 
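+ * Everything in A32 mode still enters here: the decodetree-generated
+ * decoders are tried first, and only the XScale/iWMMXt coprocessor
+ * encodings and the final UNDEF fallback remain hand-written below.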
+ */ + +static void disas_arm_insn(DisasContext *s, unsigned int insn) +{ + unsigned int cond = insn >> 28; + + /* M variants do not implement ARM mode; this must raise the INVSTATE + * UsageFault exception. + */ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + return; + } + + if (s->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (cond == 0xf) { + /* In ARMv3 and v4 the NV condition is UNPREDICTABLE; we + * choose to UNDEF. In ARMv5 and above the space is used + * for miscellaneous unconditional instructions. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V5)) { + unallocated_encoding(s); + return; + } + + /* Unconditional instructions. */ + /* TODO: Perhaps merge these into one decodetree output file. */ + if (disas_a32_uncond(s, insn) || + disas_vfp_uncond(s, insn) || + disas_neon_dp(s, insn) || + disas_neon_ls(s, insn) || + disas_neon_shared(s, insn)) { + return; + } + /* fall back to legacy decoder */ + + if ((insn & 0x0e000f00) == 0x0c000100) { + if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { + /* iWMMXt register transfer. */ + if (extract32(s->c15_cpar, 1, 1)) { + if (!disas_iwmmxt_insn(s, insn)) { + return; + } + } + } + } + goto illegal_op; + } + if (cond != 0xe) { + /* if not always execute, we generate a conditional jump to + next instruction */ + arm_skip_unless(s, cond); + } + + /* TODO: Perhaps merge these into one decodetree output file. */ + if (disas_a32(s, insn) || + disas_vfp(s, insn)) { + return; + } + /* fall back to legacy decoder */ + /* TODO: convert xscale/iwmmxt decoder to decodetree ?? */ + if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { + if (((insn & 0x0c000e00) == 0x0c000000) + && ((insn & 0x03000000) != 0x03000000)) { + /* Coprocessor insn, coprocessor 0 or 1 */ + disas_xscale_insn(s, insn); + return; + } + } + +illegal_op: + unallocated_encoding(s); +} + +static bool thumb_insn_is_16bit(DisasContext *s, uint32_t pc, uint32_t insn) +{ + /* + * Return true if this is a 16 bit instruction. We must be precise + * about this (matching the decode). + */ + if ((insn >> 11) < 0x1d) { + /* Definitely a 16-bit instruction */ + return true; + } + + /* Top five bits 0b11101 / 0b11110 / 0b11111 : this is the + * first half of a 32-bit Thumb insn. Thumb-1 cores might + * end up actually treating this as two 16-bit insns, though, + * if it's half of a bl/blx pair that might span a page boundary. + */ + if (arm_dc_feature(s, ARM_FEATURE_THUMB2) || + arm_dc_feature(s, ARM_FEATURE_M)) { + /* Thumb2 cores (including all M profile ones) always treat + * 32-bit insns as 32-bit. + */ + return false; + } + + if ((insn >> 11) == 0x1e && pc - s->page_start < TARGET_PAGE_SIZE - 3) { + /* 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix, and the suffix + * is not on the next page; we merge this into a 32-bit + * insn. + */ + return false; + } + /* 0b1110_1xxx_xxxx_xxxx : BLX suffix (or UNDEF); + * 0b1111_1xxx_xxxx_xxxx : BL suffix; + * 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix on the end of a page + * -- handle as single 16 bit insn + */ + return true; +} + +/* Translate a 32-bit thumb instruction. */ +static void disas_thumb2_insn(DisasContext *s, uint32_t insn) +{ + /* + * ARMv6-M supports a limited subset of Thumb2 instructions. + * Other Thumb1 architectures allow only 32-bit + * combined BL/BLX prefix and suffix. 
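+     * The armv6m_insn[]/armv6m_mask[] tables below list exactly those
+     * permitted 32-bit encodings (MSR, DSB/DMB/ISB, MRS and BL).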
+ */ + if (arm_dc_feature(s, ARM_FEATURE_M) && + !arm_dc_feature(s, ARM_FEATURE_V7)) { + int i; + bool found = false; + static const uint32_t armv6m_insn[] = {0xf3808000 /* msr */, + 0xf3b08040 /* dsb */, + 0xf3b08050 /* dmb */, + 0xf3b08060 /* isb */, + 0xf3e08000 /* mrs */, + 0xf000d000 /* bl */}; + static const uint32_t armv6m_mask[] = {0xffe0d000, + 0xfff0d0f0, + 0xfff0d0f0, + 0xfff0d0f0, + 0xffe0d000, + 0xf800d000}; + + for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) { + if ((insn & armv6m_mask[i]) == armv6m_insn[i]) { + found = true; + break; + } + } + if (!found) { + goto illegal_op; + } + } else if ((insn & 0xf800e800) != 0xf000e800) { + if (!arm_dc_feature(s, ARM_FEATURE_THUMB2)) { + unallocated_encoding(s); + return; + } + } + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + /* + * NOCP takes precedence over any UNDEF for (almost) the + * entire wide range of coprocessor-space encodings, so check + * for it first before proceeding to actually decode eg VFP + * insns. This decode also handles the few insns which are + * in copro space but do not have NOCP checks (eg VLLDM, VLSTM). + */ + if (disas_m_nocp(s, insn)) { + return; + } + } + + if ((insn & 0xef000000) == 0xef000000) { + /* + * T32 encodings 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq + * transform into + * A32 encodings 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq + */ + uint32_t a32_insn = (insn & 0xe2ffffff) | + ((insn & (1 << 28)) >> 4) | (1 << 28); + + if (disas_neon_dp(s, a32_insn)) { + return; + } + } + + if ((insn & 0xff100000) == 0xf9000000) { + /* + * T32 encodings 0b1111_1001_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq + * transform into + * A32 encodings 0b1111_0100_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq + */ + uint32_t a32_insn = (insn & 0x00ffffff) | 0xf4000000; + + if (disas_neon_ls(s, a32_insn)) { + return; + } + } + + /* + * TODO: Perhaps merge these into one decodetree output file. + * Note disas_vfp is written for a32 with cond field in the + * top nibble. The t32 encoding requires 0xe in the top nibble. + */ + if (disas_t32(s, insn) || + disas_vfp_uncond(s, insn) || + disas_neon_shared(s, insn) || + disas_mve(s, insn) || + ((insn >> 28) == 0xe && disas_vfp(s, insn))) { + return; + } + +illegal_op: + unallocated_encoding(s); +} + +static void disas_thumb_insn(DisasContext *s, uint32_t insn) +{ + if (!disas_t16(s, insn)) { + unallocated_encoding(s); + } +} + +static bool insn_crosses_page(CPUARMState *env, DisasContext *s) +{ + /* Return true if the insn at dc->base.pc_next might cross a page boundary. + * (False positives are OK, false negatives are not.) + * We know this is a Thumb insn, and our caller ensures we are + * only called if dc->base.pc_next is less than 4 bytes from the page + * boundary, so we cross the page if the first 16 bits indicate + * that this is a 32 bit insn. + */ + uint16_t insn = arm_lduw_code(env, &s->base, s->base.pc_next, s->sctlr_b); + + return !thumb_insn_is_16bit(s, s->base.pc_next, insn); +} + +static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cs->env_ptr; + ARMCPU *cpu = env_archcpu(env); + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); + uint32_t condexec, core_mmu_idx; + + dc->isar = &cpu->isar; + dc->condjmp = 0; + dc->pc_save = dc->base.pc_first; + dc->aarch64 = false; + dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB); + dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; + condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC); + /* + * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this + * is always the IT bits. On M-profile, some of the reserved encodings + * of IT are used instead to indicate either ICI or ECI, which + * indicate partial progress of a restartable insn that was interrupted + * partway through by an exception: + * * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits + * * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits + * In all cases CONDEXEC == 0 means "not in IT block or restartable + * insn, behave normally". + */ + dc->eci = dc->condexec_mask = dc->condexec_cond = 0; + dc->eci_handled = false; + if (condexec & 0xf) { + dc->condexec_mask = (condexec & 0xf) << 1; + dc->condexec_cond = condexec >> 4; + } else { + if (arm_feature(env, ARM_FEATURE_M)) { + dc->eci = condexec >> 4; + } + } + + core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); + dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx); + dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); +#if !defined(CONFIG_USER_ONLY) + dc->user = (dc->current_el == 0); +#endif + dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); + dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); + dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); + dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); + dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); + + if (arm_feature(env, ARM_FEATURE_M)) { + dc->vfp_enabled = 1; + dc->be_data = MO_TE; + dc->v7m_handler_mode = EX_TBFLAG_M32(tb_flags, HANDLER); + dc->v8m_secure = EX_TBFLAG_M32(tb_flags, SECURE); + dc->v8m_stackcheck = EX_TBFLAG_M32(tb_flags, STACKCHECK); + dc->v8m_fpccr_s_wrong = EX_TBFLAG_M32(tb_flags, FPCCR_S_WRONG); + dc->v7m_new_fp_ctxt_needed = + EX_TBFLAG_M32(tb_flags, NEW_FP_CTXT_NEEDED); + dc->v7m_lspact = EX_TBFLAG_M32(tb_flags, LSPACT); + dc->mve_no_pred = EX_TBFLAG_M32(tb_flags, MVE_NO_PRED); + } else { + dc->sctlr_b = EX_TBFLAG_A32(tb_flags, SCTLR__B); + dc->hstr_active = EX_TBFLAG_A32(tb_flags, HSTR_ACTIVE); + dc->ns = EX_TBFLAG_A32(tb_flags, NS); + dc->vfp_enabled = EX_TBFLAG_A32(tb_flags, VFPEN); + if (arm_feature(env, ARM_FEATURE_XSCALE)) { + dc->c15_cpar = EX_TBFLAG_A32(tb_flags, XSCALE_CPAR); + } else { + dc->vec_len = EX_TBFLAG_A32(tb_flags, VECLEN); + dc->vec_stride = EX_TBFLAG_A32(tb_flags, VECSTRIDE); + } + dc->sme_trap_nonstreaming = + EX_TBFLAG_A32(tb_flags, SME_TRAP_NONSTREAMING); + } + dc->cp_regs = cpu->cp_regs; + dc->features = env->features; + + /* Single step state. The code-generation logic here is: + * SS_ACTIVE == 0: + * generate code with no special handling for single-stepping (except + * that anything that can make us go to SS_ACTIVE == 1 must end the TB; + * this happens anyway because those changes are all system register or + * PSTATE writes). + * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) + * emit code for one insn + * emit code to clear PSTATE.SS + * emit code to generate software step exception for completed step + * end TB (as usual for having generated an exception) + * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) + * emit code to generate a software step exception + * end the TB + */ + dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); + dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); + dc->is_ldex = false; + + dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK; + + /* If architectural single step active, limit to 1. */ + if (dc->ss_active) { + dc->base.max_insns = 1; + } + + /* ARM is a fixed-length ISA. 
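+       (Each insn is exactly 4 bytes, so "-(pc_first | TARGET_PAGE_MASK) / 4"
+       below is precisely the number of insns remaining on the current page.)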
Bound the number of insns to execute + to those left on the page. */ + if (!dc->thumb) { + int bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; + dc->base.max_insns = MIN(dc->base.max_insns, bound); + } + + cpu_V0 = tcg_temp_new_i64(); + cpu_V1 = tcg_temp_new_i64(); + cpu_M0 = tcg_temp_new_i64(); +} + +static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + /* A note on handling of the condexec (IT) bits: + * + * We want to avoid the overhead of having to write the updated condexec + * bits back to the CPUARMState for every instruction in an IT block. So: + * (1) if the condexec bits are not already zero then we write + * zero back into the CPUARMState now. This avoids complications trying + * to do it at the end of the block. (For example if we don't do this + * it's hard to identify whether we can safely skip writing condexec + * at the end of the TB, which we definitely want to do for the case + * where a TB doesn't do anything with the IT state at all.) + * (2) if we are going to leave the TB then we call gen_set_condexec() + * which will write the correct value into CPUARMState if zero is wrong. + * This is done both for leaving the TB at the end, and for leaving + * it because of an exception we know will happen, which is done in + * gen_exception_insn(). The latter is necessary because we need to + * leave the TB with the PC/IT state just prior to execution of the + * instruction which caused the exception. + * (3) if we leave the TB unexpectedly (eg a data abort on a load) + * then the CPUARMState will be wrong and we need to reset it. + * This is handled in the same way as restoration of the + * PC in these situations; we save the value of the condexec bits + * for each PC via tcg_gen_insn_start(), and restore_state_to_opc() + * then uses this to restore them after an exception. + * + * Note that there are no instructions which can read the condexec + * bits, and none which can write non-static values to them, so + * we don't need to care about whether CPUARMState is correct in the + * middle of a TB. + */ + + /* Reset the conditional execution bits immediately. This avoids + complications trying to do it at the end of the block. */ + if (dc->condexec_mask || dc->condexec_cond) { + store_cpu_field_constant(0, condexec_bits); + } +} + +static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + /* + * The ECI/ICI bits share PSR bits with the IT bits, so we + * need to reconstitute the bits from the split-out DisasContext + * fields here. + */ + uint32_t condexec_bits; + target_ulong pc_arg = dc->base.pc_next; + + if (TARGET_TB_PCREL) { + pc_arg &= ~TARGET_PAGE_MASK; + } + if (dc->eci) { + condexec_bits = dc->eci << 4; + } else { + condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1); + } + tcg_gen_insn_start(pc_arg, condexec_bits, 0); + dc->insn_start = tcg_last_op(); +} + +static bool arm_check_kernelpage(DisasContext *dc) +{ +#ifdef CONFIG_USER_ONLY + /* Intercept jump to the magic kernel page. */ + if (dc->base.pc_next >= 0xffff0000) { + /* We always get here via a jump, so know we are not in a + conditional execution block. */ + gen_exception_internal(EXCP_KERNEL_TRAP); + dc->base.is_jmp = DISAS_NORETURN; + return true; + } +#endif + return false; +} + +static bool arm_check_ss_active(DisasContext *dc) +{ + if (dc->ss_active && !dc->pstate_ss) { + /* Singlestep state is Active-pending. 
+ * If we're in this state at the start of a TB then either + * a) we just took an exception to an EL which is being debugged + * and this is the first insn in the exception handler + * b) debug exceptions were masked and we just unmasked them + * without changing EL (eg by clearing PSTATE.D) + * In either case we're going to take a swstep exception in the + * "did not step an insn" case, and so the syndrome ISV and EX + * bits should be zero. + */ + assert(dc->base.num_insns == 1); + gen_swstep_exception(dc, 0, 0); + dc->base.is_jmp = DISAS_NORETURN; + return true; + } + + return false; +} + +static void arm_post_translate_insn(DisasContext *dc) +{ + if (dc->condjmp && dc->base.is_jmp == DISAS_NEXT) { + if (dc->pc_save != dc->condlabel.pc_save) { + gen_update_pc(dc, dc->condlabel.pc_save - dc->pc_save); + } + gen_set_label(dc->condlabel.label); + dc->condjmp = 0; + } + translator_loop_temp_check(&dc->base); +} + +static void arm_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint32_t pc = dc->base.pc_next; + unsigned int insn; + + /* Singlestep exceptions have the highest priority. */ + if (arm_check_ss_active(dc)) { + dc->base.pc_next = pc + 4; + return; + } + + if (pc & 3) { + /* + * PC alignment fault. This has priority over the instruction abort + * that we would receive from a translation fault via arm_ldl_code + * (or the execution of the kernelpage entrypoint). This should only + * be possible after an indirect branch, at the start of the TB. + */ + assert(dc->base.num_insns == 1); + gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); + dc->base.is_jmp = DISAS_NORETURN; + dc->base.pc_next = QEMU_ALIGN_UP(pc, 4); + return; + } + + if (arm_check_kernelpage(dc)) { + dc->base.pc_next = pc + 4; + return; + } + + dc->pc_curr = pc; + insn = arm_ldl_code(env, &dc->base, pc, dc->sctlr_b); + dc->insn = insn; + dc->base.pc_next = pc + 4; + disas_arm_insn(dc, insn); + + arm_post_translate_insn(dc); + + /* ARM is a fixed-length ISA. We performed the cross-page check + in init_disas_context by adjusting max_insns. */ +} + +static bool thumb_insn_is_unconditional(DisasContext *s, uint32_t insn) +{ + /* Return true if this Thumb insn is always unconditional, + * even inside an IT block. This is true of only a very few + * instructions: BKPT, HLT, and SG. + * + * A larger class of instructions are UNPREDICTABLE if used + * inside an IT block; we do not need to detect those here, because + * what we do by default (perform the cc check and update the IT + * bits state machine) is a permitted CONSTRAINED UNPREDICTABLE + * choice for those situations. + * + * insn is either a 16-bit or a 32-bit instruction; the two are + * distinguishable because for the 16-bit case the top 16 bits + * are zeroes, and that isn't a valid 32-bit encoding. + */ + if ((insn & 0xffffff00) == 0xbe00) { + /* BKPT */ + return true; + } + + if ((insn & 0xffffffc0) == 0xba80 && arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_M)) { + /* HLT: v8A only. This is unconditional even when it is going to + * UNDEF; see the v8A ARM ARM DDI0487B.a H3.3. + * For v7 cores this was a plain old undefined encoding and so + * honours its cc check. 
(We might be using the encoding as + * a semihosting trap, but we don't change the cc check behaviour + * on that account, because a debugger connected to a real v7A + * core and emulating semihosting traps by catching the UNDEF + * exception would also only see cases where the cc check passed. + * No guest code should be trying to do a HLT semihosting trap + * in an IT block anyway. + */ + return true; + } + + if (insn == 0xe97fe97f && arm_dc_feature(s, ARM_FEATURE_V8) && + arm_dc_feature(s, ARM_FEATURE_M)) { + /* SG: v8M only */ + return true; + } + + return false; +} + +static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint32_t pc = dc->base.pc_next; + uint32_t insn; + bool is_16bit; + /* TCG op to rewind to if this turns out to be an invalid ECI state */ + TCGOp *insn_eci_rewind = NULL; + target_ulong insn_eci_pc_save = -1; + + /* Misaligned thumb PC is architecturally impossible. */ + assert((dc->base.pc_next & 1) == 0); + + if (arm_check_ss_active(dc) || arm_check_kernelpage(dc)) { + dc->base.pc_next = pc + 2; + return; + } + + dc->pc_curr = pc; + insn = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); + is_16bit = thumb_insn_is_16bit(dc, dc->base.pc_next, insn); + pc += 2; + if (!is_16bit) { + uint32_t insn2 = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); + insn = insn << 16 | insn2; + pc += 2; + } + dc->base.pc_next = pc; + dc->insn = insn; + + if (dc->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(dc, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (dc->eci) { + /* + * For M-profile continuable instructions, ECI/ICI handling + * falls into these cases: + * - interrupt-continuable instructions + * These are the various load/store multiple insns (both + * integer and fp). The ICI bits indicate the register + * where the load/store can resume. We make the IMPDEF + * choice to always do "instruction restart", ie ignore + * the ICI value and always execute the ldm/stm from the + * start. So all we need to do is zero PSR.ICI if the + * insn executes. + * - MVE instructions subject to beat-wise execution + * Here the ECI bits indicate which beats have already been + * executed, and we must honour this. Each insn of this + * type will handle it correctly. We will update PSR.ECI + * in the helper function for the insn (some ECI values + * mean that the following insn also has been partially + * executed). + * - Special cases which don't advance ECI + * The insns LE, LETP and BKPT leave the ECI/ICI state + * bits untouched. + * - all other insns (the common case) + * Non-zero ECI/ICI means an INVSTATE UsageFault. + * We place a rewind-marker here. Insns in the previous + * three categories will set a flag in the DisasContext. + * If the flag isn't set after we call disas_thumb_insn() + * or disas_thumb2_insn() then we know we have a "some other + * insn" case. We will rewind to the marker (ie throwing away + * all the generated code) and instead emit "take exception". + */ + insn_eci_rewind = tcg_last_op(); + insn_eci_pc_save = dc->pc_save; + } + + if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) { + uint32_t cond = dc->condexec_cond; + + /* + * Conditionally skip the insn. Note that both 0xe and 0xf mean + * "always"; 0xf is not "never". 
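+         * For example, "ITTE EQ" executes the first two insns with
+         * cond == 0x0 (EQ) and the third with cond == 0x1 (NE), all of
+         * which are below 0x0e and so take the conditional skip here.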
+ */ + if (cond < 0x0e) { + arm_skip_unless(dc, cond); + } + } + + if (is_16bit) { + disas_thumb_insn(dc, insn); + } else { + disas_thumb2_insn(dc, insn); + } + + /* Advance the Thumb condexec condition. */ + if (dc->condexec_mask) { + dc->condexec_cond = ((dc->condexec_cond & 0xe) | + ((dc->condexec_mask >> 4) & 1)); + dc->condexec_mask = (dc->condexec_mask << 1) & 0x1f; + if (dc->condexec_mask == 0) { + dc->condexec_cond = 0; + } + } + + if (dc->eci && !dc->eci_handled) { + /* + * Insn wasn't valid for ECI/ICI at all: undo what we + * just generated and instead emit an exception + */ + tcg_remove_ops_after(insn_eci_rewind); + dc->pc_save = insn_eci_pc_save; + dc->condjmp = 0; + gen_exception_insn(dc, 0, EXCP_INVSTATE, syn_uncategorized()); + } + + arm_post_translate_insn(dc); + + /* Thumb is a variable-length ISA. Stop translation when the next insn + * will touch a new page. This ensures that prefetch aborts occur at + * the right place. + * + * We want to stop the TB if the next insn starts in a new page, + * or if it spans between this page and the next. This means that + * if we're looking at the last halfword in the page we need to + * see if it's a 16-bit Thumb insn (which will fit in this TB) + * or a 32-bit Thumb insn (which won't). + * This is to avoid generating a silly TB with a single 16-bit insn + * in it at the end of this page (which would execute correctly + * but isn't very efficient). + */ + if (dc->base.is_jmp == DISAS_NEXT + && (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE + || (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - 3 + && insn_crosses_page(env, dc)))) { + dc->base.is_jmp = DISAS_TOO_MANY; + } +} + +static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + /* At this stage dc->condjmp will only be set when the skipped + instruction was a conditional branch or trap, and the PC has + already been written. */ + gen_set_condexec(dc); + if (dc->base.is_jmp == DISAS_BX_EXCRET) { + /* Exception return branches need some special case code at the + * end of the TB, which is complex enough that it has to + * handle the single-step vs not and the condition-failed + * insn codepath itself. + */ + gen_bx_excret_final_code(dc); + } else if (unlikely(dc->ss_active)) { + /* Unconditional and "condition passed" instruction codepath. */ + switch (dc->base.is_jmp) { + case DISAS_SWI: + gen_ss_advance(dc); + gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); + break; + case DISAS_HVC: + gen_ss_advance(dc); + gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); + break; + case DISAS_SMC: + gen_ss_advance(dc); + gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); + break; + case DISAS_NEXT: + case DISAS_TOO_MANY: + case DISAS_UPDATE_EXIT: + case DISAS_UPDATE_NOCHAIN: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + default: + /* FIXME: Single stepping a WFI insn will not halt the CPU. */ + gen_singlestep_exception(dc); + break; + case DISAS_NORETURN: + break; + } + } else { + /* While branches must always occur at the end of an IT block, + there are a few other things that can cause us to terminate + the TB in the middle of an IT block: + - Exception generating instructions (bkpt, swi, undefined). + - Page boundaries. + - Hardware watchpoints. + Hardware breakpoints have already been handled and skip this code. 
+ */ + switch (dc->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + gen_goto_tb(dc, 1, curr_insn_len(dc)); + break; + case DISAS_UPDATE_NOCHAIN: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + case DISAS_JUMP: + gen_goto_ptr(); + break; + case DISAS_UPDATE_EXIT: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + default: + /* indicate that the hash table must be used to find the next TB */ + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_NORETURN: + /* nothing more to generate */ + break; + case DISAS_WFI: + gen_helper_wfi(cpu_env, tcg_constant_i32(curr_insn_len(dc))); + /* + * The helper doesn't necessarily throw an exception, but we + * must go back to the main loop to check for interrupts anyway. + */ + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_WFE: + gen_helper_wfe(cpu_env); + break; + case DISAS_YIELD: + gen_helper_yield(cpu_env); + break; + case DISAS_SWI: + gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); + break; + case DISAS_HVC: + gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); + break; + case DISAS_SMC: + gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); + break; + } + } + + if (dc->condjmp) { + /* "Condition failed" instruction codepath for the branch/trap insn */ + set_disas_label(dc, dc->condlabel); + gen_set_condexec(dc); + if (unlikely(dc->ss_active)) { + gen_update_pc(dc, curr_insn_len(dc)); + gen_singlestep_exception(dc); + } else { + gen_goto_tb(dc, 1, curr_insn_len(dc)); + } + } +} + +static void arm_tr_disas_log(const DisasContextBase *dcbase, + CPUState *cpu, FILE *logfile) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); + target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); +} + +static const TranslatorOps arm_translator_ops = { + .init_disas_context = arm_tr_init_disas_context, + .tb_start = arm_tr_tb_start, + .insn_start = arm_tr_insn_start, + .translate_insn = arm_tr_translate_insn, + .tb_stop = arm_tr_tb_stop, + .disas_log = arm_tr_disas_log, +}; + +static const TranslatorOps thumb_translator_ops = { + .init_disas_context = arm_tr_init_disas_context, + .tb_start = arm_tr_tb_start, + .insn_start = arm_tr_insn_start, + .translate_insn = thumb_tr_translate_insn, + .tb_stop = arm_tr_tb_stop, + .disas_log = arm_tr_disas_log, +}; + +/* generate intermediate code for basic block 'tb'. */ +void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, + target_ulong pc, void *host_pc) +{ + DisasContext dc = { }; + const TranslatorOps *ops = &arm_translator_ops; + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(tb); + + if (EX_TBFLAG_AM32(tb_flags, THUMB)) { + ops = &thumb_translator_ops; + } +#ifdef TARGET_AARCH64 + if (EX_TBFLAG_ANY(tb_flags, AARCH64_STATE)) { + ops = &aarch64_translator_ops; + } +#endif + + translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base); +} diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h new file mode 100644 index 0000000..3717824 --- /dev/null +++ b/target/arm/tcg/translate.h @@ -0,0 +1,644 @@ +#ifndef TARGET_ARM_TRANSLATE_H +#define TARGET_ARM_TRANSLATE_H + +#include "exec/translator.h" +#include "internals.h" + + +/* internal defines */ + +/* + * Save pc_save across a branch, so that we may restore the value from + * before the branch at the point the label is emitted. 
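+ * Created by gen_disas_label() and resolved by set_disas_label(),
+ * both defined later in this header.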
+ */ +typedef struct DisasLabel { + TCGLabel *label; + target_ulong pc_save; +} DisasLabel; + +typedef struct DisasContext { + DisasContextBase base; + const ARMISARegisters *isar; + + /* The address of the current instruction being translated. */ + target_ulong pc_curr; + /* + * For TARGET_TB_PCREL, the full value of cpu_pc is not known + * (although the page offset is known). For convenience, the + * translation loop uses the full virtual address that triggered + * the translation, from base.pc_start through pc_curr. + * For efficiency, we do not update cpu_pc for every instruction. + * Instead, pc_save has the value of pc_curr at the time of the + * last update to cpu_pc, which allows us to compute the addend + * needed to bring cpu_pc current: pc_curr - pc_save. + * If cpu_pc now contains the destination of an indirect branch, + * pc_save contains -1 to indicate that relative updates are no + * longer possible. + */ + target_ulong pc_save; + target_ulong page_start; + uint32_t insn; + /* Nonzero if this instruction has been conditionally skipped. */ + int condjmp; + /* The label that will be jumped to when the instruction is skipped. */ + DisasLabel condlabel; + /* Thumb-2 conditional execution bits. */ + int condexec_mask; + int condexec_cond; + /* M-profile ECI/ICI exception-continuable instruction state */ + int eci; + /* + * trans_ functions for insns which are continuable should set this true + * after decode (ie after any UNDEF checks) + */ + bool eci_handled; + int sctlr_b; + MemOp be_data; +#if !defined(CONFIG_USER_ONLY) + int user; +#endif + ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */ + uint8_t tbii; /* TBI1|TBI0 for insns */ + uint8_t tbid; /* TBI1|TBI0 for data */ + uint8_t tcma; /* TCMA1|TCMA0 for MTE */ + bool ns; /* Use non-secure CPREG bank on access */ + int fp_excp_el; /* FP exception EL or 0 if enabled */ + int sve_excp_el; /* SVE exception EL or 0 if enabled */ + int sme_excp_el; /* SME exception EL or 0 if enabled */ + int vl; /* current vector length in bytes */ + int svl; /* current streaming vector length in bytes */ + bool vfp_enabled; /* FP enabled via FPSCR.EN */ + int vec_len; + int vec_stride; + bool v7m_handler_mode; + bool v8m_secure; /* true if v8M and we're in Secure mode */ + bool v8m_stackcheck; /* true if we need to perform v8M stack limit checks */ + bool v8m_fpccr_s_wrong; /* true if v8M FPCCR.S != v8m_secure */ + bool v7m_new_fp_ctxt_needed; /* ASPEN set but no active FP context */ + bool v7m_lspact; /* FPCCR.LSPACT set */ + /* Immediate value in AArch32 SVC insn; must be set if is_jmp == DISAS_SWI + * so that top level loop can generate correct syndrome information. + */ + uint32_t svc_imm; + int current_el; + GHashTable *cp_regs; + uint64_t features; /* CPU features bits */ + bool aarch64; + bool thumb; + /* Because unallocated encodings generate different exception syndrome + * information from traps due to FP being disabled, we can't do a single + * "is fp access disabled" check at a high level in the decode tree. + * To help in catching bugs where the access check was forgotten in some + * code path, we set this flag when the access check is done, and assert + * that it is set at the point where we actually touch the FP regs. + */ + bool fp_access_checked; + bool sve_access_checked; + /* ARMv8 single-step state (this is distinct from the QEMU gdbstub + * single-step support). 
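+     * See gen_ss_advance() and gen_swstep_exception() later in this
+     * header for how these two flags drive the step state machine.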
+ */ + bool ss_active; + bool pstate_ss; + /* True if the insn just emitted was a load-exclusive instruction + * (necessary for syndrome information for single step exceptions), + * ie A64 LDX*, LDAX*, A32/T32 LDREX*, LDAEX*. + */ + bool is_ldex; + /* True if AccType_UNPRIV should be used for LDTR et al */ + bool unpriv; + /* True if v8.3-PAuth is active. */ + bool pauth_active; + /* True if v8.5-MTE access to tags is enabled. */ + bool ata; + /* True if v8.5-MTE tag checks affect the PE; index with is_unpriv. */ + bool mte_active[2]; + /* True with v8.5-BTI and SCTLR_ELx.BT* set. */ + bool bt; + /* True if any CP15 access is trapped by HSTR_EL2 */ + bool hstr_active; + /* True if memory operations require alignment */ + bool align_mem; + /* True if PSTATE.IL is set */ + bool pstate_il; + /* True if PSTATE.SM is set. */ + bool pstate_sm; + /* True if PSTATE.ZA is set. */ + bool pstate_za; + /* True if non-streaming insns should raise an SME Streaming exception. */ + bool sme_trap_nonstreaming; + /* True if the current instruction is non-streaming. */ + bool is_nonstreaming; + /* True if MVE insns are definitely not predicated by VPR or LTPSIZE */ + bool mve_no_pred; + /* True if fine-grained traps are active */ + bool fgt_active; + /* True if fine-grained trap on ERET is enabled */ + bool fgt_eret; + /* True if fine-grained trap on SVC is enabled */ + bool fgt_svc; + /* + * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. + * < 0, set by the current instruction. + */ + int8_t btype; + /* A copy of cpu->dcz_blocksize. */ + uint8_t dcz_blocksize; + /* True if this page is guarded. */ + bool guarded_page; + /* Bottom two bits of XScale c15_cpar coprocessor access control reg */ + int c15_cpar; + /* TCG op of the current insn_start. */ + TCGOp *insn_start; +#define TMP_A64_MAX 16 + int tmp_a64_count; + TCGv_i64 tmp_a64[TMP_A64_MAX]; +} DisasContext; + +typedef struct DisasCompare { + TCGCond cond; + TCGv_i32 value; + bool value_global; +} DisasCompare; + +/* Share the TCG temporaries common between 32 and 64 bit modes. */ +extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF; +extern TCGv_i64 cpu_exclusive_addr; +extern TCGv_i64 cpu_exclusive_val; + +/* + * Constant expanders for the decoders. 
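+ * These are referenced from the .decode files via !function=; for
+ * example, a shift-amount field declared with !function=rsub_32
+ * reaches the trans_ function as 32 - x for an encoded value x,
+ * which is how right-shift amounts are typically encoded.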
+ */ + +static inline int negate(DisasContext *s, int x) +{ + return -x; +} + +static inline int plus_1(DisasContext *s, int x) +{ + return x + 1; +} + +static inline int plus_2(DisasContext *s, int x) +{ + return x + 2; +} + +static inline int plus_12(DisasContext *s, int x) +{ + return x + 12; +} + +static inline int times_2(DisasContext *s, int x) +{ + return x * 2; +} + +static inline int times_4(DisasContext *s, int x) +{ + return x * 4; +} + +static inline int times_2_plus_1(DisasContext *s, int x) +{ + return x * 2 + 1; +} + +static inline int rsub_64(DisasContext *s, int x) +{ + return 64 - x; +} + +static inline int rsub_32(DisasContext *s, int x) +{ + return 32 - x; +} + +static inline int rsub_16(DisasContext *s, int x) +{ + return 16 - x; +} + +static inline int rsub_8(DisasContext *s, int x) +{ + return 8 - x; +} + +static inline int neon_3same_fp_size(DisasContext *s, int x) +{ + /* Convert 0==fp32, 1==fp16 into a MO_* value */ + return MO_32 - x; +} + +static inline int arm_dc_feature(DisasContext *dc, int feature) +{ + return (dc->features & (1ULL << feature)) != 0; +} + +static inline int get_mem_index(DisasContext *s) +{ + return arm_to_core_mmu_idx(s->mmu_idx); +} + +static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn) +{ + /* We don't need to save all of the syndrome so we mask and shift + * out unneeded bits to help the sleb128 encoder do a better job. + */ + syn &= ARM_INSN_START_WORD2_MASK; + syn >>= ARM_INSN_START_WORD2_SHIFT; + + /* We check and clear insn_start_idx to catch multiple updates. */ + assert(s->insn_start != NULL); + tcg_set_insn_start_param(s->insn_start, 2, syn); + s->insn_start = NULL; +} + +static inline int curr_insn_len(DisasContext *s) +{ + return s->base.pc_next - s->pc_curr; +} + +/* is_jmp field values */ +#define DISAS_JUMP DISAS_TARGET_0 /* only pc was modified dynamically */ +/* CPU state was modified dynamically; exit to main loop for interrupts. */ +#define DISAS_UPDATE_EXIT DISAS_TARGET_1 +/* These instructions trap after executing, so the A32/T32 decoder must + * defer them until after the conditional execution state has been updated. + * WFI also needs special handling when single-stepping. + */ +#define DISAS_WFI DISAS_TARGET_2 +#define DISAS_SWI DISAS_TARGET_3 +/* WFE */ +#define DISAS_WFE DISAS_TARGET_4 +#define DISAS_HVC DISAS_TARGET_5 +#define DISAS_SMC DISAS_TARGET_6 +#define DISAS_YIELD DISAS_TARGET_7 +/* M profile branch which might be an exception return (and so needs + * custom end-of-TB code) + */ +#define DISAS_BX_EXCRET DISAS_TARGET_8 +/* + * For instructions which want an immediate exit to the main loop, as opposed + * to attempting to use lookup_and_goto_ptr. Unlike DISAS_UPDATE_EXIT, this + * doesn't write the PC on exiting the translation loop so you need to ensure + * something (gen_a64_update_pc or runtime helper) has done so before we reach + * return from cpu_tb_exec. + */ +#define DISAS_EXIT DISAS_TARGET_9 +/* CPU state was modified dynamically; no need to exit, but do not chain. 
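+ * (tb_stop handles this as gen_update_pc() followed by gen_goto_ptr(),
+ * the same exit path as DISAS_JUMP).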
*/ +#define DISAS_UPDATE_NOCHAIN DISAS_TARGET_10 + +#ifdef TARGET_AARCH64 +void a64_translate_init(void); +void gen_a64_update_pc(DisasContext *s, target_long diff); +extern const TranslatorOps aarch64_translator_ops; +#else +static inline void a64_translate_init(void) +{ +} + +static inline void gen_a64_update_pc(DisasContext *s, target_long diff) +{ +} +#endif + +void arm_test_cc(DisasCompare *cmp, int cc); +void arm_free_cc(DisasCompare *cmp); +void arm_jump_cc(DisasCompare *cmp, TCGLabel *label); +void arm_gen_test_cc(int cc, TCGLabel *label); +MemOp pow2_align(unsigned i); +void unallocated_encoding(DisasContext *s); +void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, + uint32_t syn, uint32_t target_el); +void gen_exception_insn(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn); + +/* Return state of Alternate Half-precision flag, caller frees result */ +static inline TCGv_i32 get_ahp_flag(void) +{ + TCGv_i32 ret = tcg_temp_new_i32(); + + tcg_gen_ld_i32(ret, cpu_env, + offsetof(CPUARMState, vfp.xregs[ARM_VFP_FPSCR])); + tcg_gen_extract_i32(ret, ret, 26, 1); + + return ret; +} + +/* Set bits within PSTATE. */ +static inline void set_pstate_bits(uint32_t bits) +{ + TCGv_i32 p = tcg_temp_new_i32(); + + tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); + + tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_gen_ori_i32(p, p, bits); + tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_temp_free_i32(p); +} + +/* Clear bits within PSTATE. */ +static inline void clear_pstate_bits(uint32_t bits) +{ + TCGv_i32 p = tcg_temp_new_i32(); + + tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); + + tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_gen_andi_i32(p, p, ~bits); + tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_temp_free_i32(p); +} + +/* If the singlestep state is Active-not-pending, advance to Active-pending. */ +static inline void gen_ss_advance(DisasContext *s) +{ + if (s->ss_active) { + s->pstate_ss = 0; + clear_pstate_bits(PSTATE_SS); + } +} + +/* Generate an architectural singlestep exception */ +static inline void gen_swstep_exception(DisasContext *s, int isv, int ex) +{ + /* Fill in the same_el field of the syndrome in the helper. */ + uint32_t syn = syn_swstep(false, isv, ex); + gen_helper_exception_swstep(cpu_env, tcg_constant_i32(syn)); +} + +/* + * Given a VFP floating point constant encoded into an 8 bit immediate in an + * instruction, expand it to the actual constant value of the specified + * size, as per the VFPExpandImm() pseudocode in the Arm ARM. + */ +uint64_t vfp_expand_imm(int size, uint8_t imm8); + +/* Vector operations shared between ARM and AArch64. 
*/ +void gen_gvec_ceq0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_clt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cgt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cle0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cge0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void gen_ushl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); +void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); +void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); 
+void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +/* + * Forward to the isar_feature_* tests given a DisasContext pointer. + */ +#define dc_isar_feature(name, ctx) \ + ({ DisasContext *ctx_ = (ctx); isar_feature_##name(ctx_->isar); }) + +/* Note that the gvec expanders operate on offsets + sizes. */ +typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t); +typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t, + uint32_t, uint32_t); +typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); + +/* Function prototype for gen_ functions for calling Neon helpers */ +typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32); +typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32); +typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32); +typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32, + TCGv_i32, TCGv_i32); +typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64); +typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); +typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64); +typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64); +typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32); +typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32); +typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); +typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64); +typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); +typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp); +typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift); +typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift); +typedef void ShiftFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); + +/** + * arm_tbflags_from_tb: + * @tb: the TranslationBlock + * + * Extract the flag values from @tb. + */ +static inline CPUARMTBFlags arm_tbflags_from_tb(const TranslationBlock *tb) +{ + return (CPUARMTBFlags){ tb->flags, tb->cs_base }; +} + +/* + * Enum for argument to fpstatus_ptr(). + */ +typedef enum ARMFPStatusFlavour { + FPST_FPCR, + FPST_FPCR_F16, + FPST_STD, + FPST_STD_F16, +} ARMFPStatusFlavour; + +/** + * fpstatus_ptr: return TCGv_ptr to the specified fp_status field + * + * We have multiple softfloat float_status fields in the Arm CPU state struct + * (see the comment in cpu.h for details). Return a TCGv_ptr which has + * been set up to point to the requested field in the CPU state struct. 
+ * The options are: + * + * FPST_FPCR + * for non-FP16 operations controlled by the FPCR + * FPST_FPCR_F16 + * for operations controlled by the FPCR where FPCR.FZ16 is to be used + * FPST_STD + * for A32/T32 Neon operations using the "standard FPSCR value" + * FPST_STD_F16 + * as FPST_STD, but where FPCR.FZ16 is to be used + */ +static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) +{ + TCGv_ptr statusptr = tcg_temp_new_ptr(); + int offset; + + switch (flavour) { + case FPST_FPCR: + offset = offsetof(CPUARMState, vfp.fp_status); + break; + case FPST_FPCR_F16: + offset = offsetof(CPUARMState, vfp.fp_status_f16); + break; + case FPST_STD: + offset = offsetof(CPUARMState, vfp.standard_fp_status); + break; + case FPST_STD_F16: + offset = offsetof(CPUARMState, vfp.standard_fp_status_f16); + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_ptr(statusptr, cpu_env, offset); + return statusptr; +} + +/** + * finalize_memop: + * @s: DisasContext + * @opc: size+sign+align of the memory operation + * + * Build the complete MemOp for a memory operation, including alignment + * and endianness. + * + * If (op & MO_AMASK) then the operation already contains the required + * alignment, e.g. for AccType_ATOMIC. Otherwise, this an optionally + * unaligned operation, e.g. for AccType_NORMAL. + * + * In the latter case, there are configuration bits that require alignment, + * and this is applied here. Note that there is no way to indicate that + * no alignment should ever be enforced; this must be handled manually. + */ +static inline MemOp finalize_memop(DisasContext *s, MemOp opc) +{ + if (s->align_mem && !(opc & MO_AMASK)) { + opc |= MO_ALIGN; + } + return opc | s->be_data; +} + +/** + * asimd_imm_const: Expand an encoded SIMD constant value + * + * Expand a SIMD constant value. This is essentially the pseudocode + * AdvSIMDExpandImm, except that we also perform the boolean NOT needed for + * VMVN and VBIC (when cmode < 14 && op == 1). + * + * The combination cmode == 15 op == 1 is a reserved encoding for AArch32; + * callers must catch this; we return the 64-bit constant value defined + * for AArch64. + * + * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but + * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A; + * we produce an immediate constant value of 0 in these cases. + */ +uint64_t asimd_imm_const(uint32_t imm, int cmode, int op); + +/* + * gen_disas_label: + * Create a label and cache a copy of pc_save. + */ +static inline DisasLabel gen_disas_label(DisasContext *s) +{ + return (DisasLabel){ + .label = gen_new_label(), + .pc_save = s->pc_save, + }; +} + +/* + * set_disas_label: + * Emit a label and restore the cached copy of pc_save. + */ +static inline void set_disas_label(DisasContext *s, DisasLabel l) +{ + gen_set_label(l.label); + s->pc_save = l.pc_save; +} + +static inline TCGv_ptr gen_lookup_cp_reg(uint32_t key) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + gen_helper_lookup_cp_reg(ret, cpu_env, tcg_constant_i32(key)); + return ret; +} + +/* + * Helpers for implementing sets of trans_* functions. + * Defer the implementation of NAME to FUNC, with optional extra arguments. + */ +#define TRANS(NAME, FUNC, ...) \ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { return FUNC(s, __VA_ARGS__); } +#define TRANS_FEAT(NAME, FEAT, FUNC, ...) \ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); } + +#define TRANS_FEAT_NONSTREAMING(NAME, FEAT, FUNC, ...) 
\ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { \ + s->is_nonstreaming = true; \ + return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); \ + } + +#endif /* TARGET_ARM_TRANSLATE_H */ diff --git a/target/arm/tcg/vfp-uncond.decode b/target/arm/tcg/vfp-uncond.decode new file mode 100644 index 0000000..5c50447 --- /dev/null +++ b/target/arm/tcg/vfp-uncond.decode @@ -0,0 +1,82 @@ +# AArch32 VFP instruction descriptions (unconditional insns) +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# +# Encodings for the unconditional VFP instructions are here: +# generally anything matching A32 +# 1111 1110 .... .... .... 101. ...0 .... +# and T32 +# 1111 110. .... .... .... 101. .... .... +# 1111 1110 .... .... .... 101. .... .... +# (but those patterns might also cover some Neon instructions, +# which do not live in this file.) + +# VFP registers have an odd encoding with a four-bit field +# and a one-bit field which are assembled in different orders +# depending on whether the register is double or single precision. +# Each individual instruction function must do the checks for +# "double register selected but CPU does not have double support" +# and "double register number has bit 4 set but CPU does not +# support D16-D31" (which should UNDEF). +%vm_dp 5:1 0:4 +%vm_sp 0:4 5:1 +%vn_dp 7:1 16:4 +%vn_sp 16:4 7:1 +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp +@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp + +VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \ + vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1 +VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \ + vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2 +VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3 + +VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s + +VMAXNM_sp 1111 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s + +VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d +VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d + +VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \ + vm=%vm_sp vd=%vd_sp sz=1 +VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \ + vm=%vm_sp vd=%vd_sp sz=2 +VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \ + vm=%vm_dp vd=%vd_dp sz=3 + +# VCVT float to int with specified rounding mode; Vd is always single-precision +VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \ + vm=%vm_sp vd=%vd_sp sz=1 +VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \ + vm=%vm_sp vd=%vd_sp sz=2 +VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \ + vm=%vm_dp vd=%vd_sp sz=3 + +VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... 
\ + vd=%vd_sp vm=%vm_sp + +VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... \ + vd=%vd_sp vm=%vm_sp diff --git a/target/arm/tcg/vfp.decode b/target/arm/tcg/vfp.decode new file mode 100644 index 0000000..5405e80 --- /dev/null +++ b/target/arm/tcg/vfp.decode @@ -0,0 +1,247 @@ +# AArch32 VFP instruction descriptions (conditional insns) +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see <http://www.gnu.org/licenses/>. + +# +# This file is processed by scripts/decodetree.py +# +# Encodings for the conditional VFP instructions are here: +# generally anything matching A32 +# cccc 11.. .... .... .... 101. .... .... +# and T32 +# 1110 110. .... .... .... 101. .... .... +# 1110 1110 .... .... .... 101. .... .... +# (but those patterns might also cover some Neon instructions, +# which do not live in this file.) + +# VFP registers have an odd encoding with a four-bit field +# and a one-bit field which are assembled in different orders +# depending on whether the register is double or single precision. +# Each individual instruction function must do the checks for +# "double register selected but CPU does not have double support" +# and "double register number has bit 4 set but CPU does not +# support D16-D31" (which should UNDEF). +%vm_dp 5:1 0:4 +%vm_sp 0:4 5:1 +%vn_dp 7:1 16:4 +%vn_sp 16:4 7:1 +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +%vmov_idx_b 21:1 5:2 +%vmov_idx_h 21:1 6:1 + +%vmov_imm 16:4 0:4 + +@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp +@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp + +@vfp_dm_ss ................................ vm=%vm_sp vd=%vd_sp +@vfp_dm_dd ................................ vm=%vm_dp vd=%vd_dp +@vfp_dm_ds ................................ vm=%vm_sp vd=%vd_dp +@vfp_dm_sd ................................ vm=%vm_dp vd=%vd_sp + +# VMOV scalar to general-purpose register; note that this does +# include some Neon cases. +VMOV_to_gp ---- 1110 u:1 1. 1 .... rt:4 1011 ... 1 0000 \ + vn=%vn_dp size=0 index=%vmov_idx_b +VMOV_to_gp ---- 1110 u:1 0. 1 .... rt:4 1011 ..1 1 0000 \ + vn=%vn_dp size=1 index=%vmov_idx_h +VMOV_to_gp ---- 1110 0 0 index:1 1 .... rt:4 1011 .00 1 0000 \ + vn=%vn_dp size=2 u=0 + +VMOV_from_gp ---- 1110 0 1. 0 .... rt:4 1011 ... 1 0000 \ + vn=%vn_dp size=0 index=%vmov_idx_b +VMOV_from_gp ---- 1110 0 0. 0 .... rt:4 1011 ..1 1 0000 \ + vn=%vn_dp size=1 index=%vmov_idx_h +VMOV_from_gp ---- 1110 0 0 index:1 0 .... rt:4 1011 .00 1 0000 \ + vn=%vn_dp size=2 + +VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \ + vn=%vn_dp + +VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 +VMOV_half ---- 1110 000 l:1 .... rt:4 1001 . 001 0000 vn=%vn_sp +VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp + +VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp +VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp + +VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 
1001 imm:8 vd=%vd_sp +VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp +VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp + +# We split the load/store multiple up into two patterns to avoid +# overlap with other insns in the "Advanced SIMD load/store and 64-bit move" +# grouping: +# P=0 U=0 W=0 is 64-bit VMOV +# P=1 W=0 is VLDR/VSTR +# P=U W=1 is UNDEF +# leaving P=0 U=1 W=x and P=1 U=0 W=1 for load/store multiple. +# These include FSTM/FLDM. +VLDM_VSTM_sp ---- 1100 1 . w:1 l:1 rn:4 .... 1010 imm:8 \ + vd=%vd_sp p=0 u=1 +VLDM_VSTM_dp ---- 1100 1 . w:1 l:1 rn:4 .... 1011 imm:8 \ + vd=%vd_dp p=0 u=1 + +VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \ + vd=%vd_sp p=1 u=0 w=1 +VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \ + vd=%vd_dp p=1 u=0 w=1 + +# 3-register VFP data-processing; bits [23,21:20,6] identify the operation. +VMLA_hp ---- 1110 0.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... @vfp_dnm_d + +VMLS_hp ---- 1110 0.00 .... .... 1001 .1.0 .... @vfp_dnm_s +VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... @vfp_dnm_s +VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... @vfp_dnm_d + +VNMLS_hp ---- 1110 0.01 .... .... 1001 .0.0 .... @vfp_dnm_s +VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... @vfp_dnm_s +VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... @vfp_dnm_d + +VNMLA_hp ---- 1110 0.01 .... .... 1001 .1.0 .... @vfp_dnm_s +VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... @vfp_dnm_s +VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... @vfp_dnm_d + +VMUL_hp ---- 1110 0.10 .... .... 1001 .0.0 .... @vfp_dnm_s +VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... @vfp_dnm_s +VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... @vfp_dnm_d + +VNMUL_hp ---- 1110 0.10 .... .... 1001 .1.0 .... @vfp_dnm_s +VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... @vfp_dnm_s +VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... @vfp_dnm_d + +VADD_hp ---- 1110 0.11 .... .... 1001 .0.0 .... @vfp_dnm_s +VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... @vfp_dnm_s +VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... @vfp_dnm_d + +VSUB_hp ---- 1110 0.11 .... .... 1001 .1.0 .... @vfp_dnm_s +VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... @vfp_dnm_s +VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... @vfp_dnm_d + +VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d + +VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s +VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s +VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s +VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s + +VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s +VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s +VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s +VFNMS_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s + +VFMA_dp ---- 1110 1.10 .... .... 1011 .0.0 .... @vfp_dnm_d +VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d +VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d +VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d + +VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \ + vd=%vd_sp imm=%vmov_imm +VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \ + vd=%vd_sp imm=%vmov_imm +VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \ + vd=%vd_dp imm=%vmov_imm + +VMOV_reg_sp ---- 1110 1.11 0000 .... 
1010 01.0 .... @vfp_dm_ss
+VMOV_reg_dp ---- 1110 1.11 0000 .... 1011 01.0 .... @vfp_dm_dd
+
+VABS_hp ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss
+VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss
+VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd
+
+VNEG_hp ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss
+VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss
+VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd
+
+VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss
+VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss
+VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd
+
+VCMP_hp ---- 1110 1.11 010 z:1 .... 1001 e:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \
+ vd=%vd_dp vm=%vm_dp
+
+# VCVTT and VCVTB from f16: Vd format depends on size bit; Vm is always vm_sp
+VCVT_f32_f16 ---- 1110 1.11 0010 .... 1010 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \
+ vd=%vd_dp vm=%vm_sp
+
+# VCVTB and VCVTT to f16: Vd format is always vd_sp;
+# Vm format depends on size bit
+VCVT_b16_f32 ---- 1110 1.11 0011 .... 1001 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_dp
+
+VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss
+VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss
+VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd
+
+VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss
+VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss
+VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd
+
+VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss
+VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss
+VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd
+
+# VCVT between single and double:
+# Vm precision depends on size; Vd is its reverse
+VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds
+VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd
+
+# VCVT from integer to floating point: Vm always single; Vd depends on size
+VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \
+ vd=%vd_dp vm=%vm_sp
+
+# VJCVT is always dp to sp
+VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd
+
+# VCVT between floating-point and fixed-point. The immediate value
+# is in the same format as a Vm single-precision register number.
+# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field
+# for the convenience of the trans_VCVT_fix functions.
+%vcvt_fix_op 18:1 16:1 7:1
+VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
+VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
+VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
+ vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op
+
+# VCVT float to integer (VCVT and VCVTR): Vd always single; Vm depends on size
+VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
+VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_dp
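The macro fragment at the very top of this diff is the tail of target/arm/tcg/translate.h: it generates a trans_NAME() function that first marks the instruction as invalid in SME streaming SVE mode and then gates the real translation function behind an ISA-feature test. As a minimal sketch of how a macro of this shape expands (presumably QEMU's TRANS_FEAT_NONSTREAMING, whose name is cut off above) — the instruction name EXAMPLE and helper do_example() are placeholders, not real QEMU symbols:

static bool trans_EXAMPLE(DisasContext *s, arg_EXAMPLE *a)
{
    /* This insn is not available while in SME streaming SVE mode. */
    s->is_nonstreaming = true;
    /* Fail the decode (UNDEF path) unless the feature is present. */
    return dc_isar_feature(aa64_sve, s) && do_example(s, a);
}

Because the feature check happens inside the generated trans function, an unsupported instruction simply falls through to the decoder's usual no-match/UNDEF handling.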
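Both vfp-uncond.decode and vfp.decode build their five-bit VFP register numbers from a four-bit field plus a one-bit field, concatenated in opposite orders for double and single precision; in a decodetree %field definition the first sub-field listed is the most significant. A small C sketch of the extraction the generated decoder effectively performs for %vm_dp and %vm_sp (the helper names here are illustrative, not QEMU's generated names):

#include <stdint.h>

/* %vm_dp 5:1 0:4 — insn[5] (the M bit) is the TOP bit of a D register. */
static inline int vfp_vm_dp(uint32_t insn)
{
    return (((insn >> 5) & 1) << 4) | (insn & 0xf);
}

/* %vm_sp 0:4 5:1 — for an S register the M bit is the LOW bit instead. */
static inline int vfp_vm_sp(uint32_t insn)
{
    return ((insn & 0xf) << 1) | ((insn >> 5) & 1);
}

This is also why the per-instruction checks described in the comments are needed: a double-precision register number with bit 4 set selects D16-D31, which must UNDEF on a CPU without that extension.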
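The P/U/W carve-up described in the VLDM/VSTM comment in vfp.decode can be restated as a tiny classifier; this is purely illustrative, since QEMU expresses the split through the decode patterns themselves rather than a function like this:

/* Sketch: which insn group claims a given P/U/W combination. */
typedef enum { K_VMOV_64, K_VLDR_VSTR, K_UNDEF, K_VLDM_VSTM } Kind;

static Kind classify_puw(int p, int u, int w)
{
    if (p == 0 && u == 0 && w == 0) {
        return K_VMOV_64;       /* 64-bit VMOV */
    }
    if (p == 1 && w == 0) {
        return K_VLDR_VSTR;     /* VLDR/VSTR */
    }
    if (p == u && w == 1) {
        return K_UNDEF;         /* P=U W=1 is UNDEF */
    }
    return K_VLDM_VSTM;         /* P=0 U=1 W=x and P=1 U=0 W=1 */
}

The two VLDM_VSTM pattern pairs correspond exactly to the two cases that reach K_VLDM_VSTM: the first pair fixes p=0 u=1, the second fixes p=1 u=0 w=1.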
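Similarly, %vcvt_fix_op packs three scattered bits of the fixed-point VCVT encoding into one value so the trans_VCVT_fix functions can switch on a single opc. A sketch of the equivalent extraction (the function name is illustrative):

#include <stdint.h>

/* %vcvt_fix_op 18:1 16:1 7:1 — op is most significant, then u, then sx. */
static inline int vcvt_fix_opc(uint32_t insn)
{
    int op = (insn >> 18) & 1;  /* direction: to-fixed vs from-fixed */
    int u  = (insn >> 16) & 1;  /* unsigned fixed-point value */
    int sx = (insn >> 7) & 1;   /* 16-bit vs 32-bit fixed-point format */
    return (op << 2) | (u << 1) | sx;
}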