aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2020-09-01 16:51:37 +0100
committerPeter Maydell <peter.maydell@linaro.org>2020-09-01 16:51:37 +0100
commit8d90bfc5c31ad60f6049dd39be636b06bc00b652 (patch)
tree5af66a13ed5b0057351741ae6c548458b8728e08
parent071a6dba7d4db57e28f659b30829b1c22b945f4e (diff)
parent3f462bf0f6ea6382dd1502d4eb1fcd33c8e774f5 (diff)
downloadqemu-8d90bfc5c31ad60f6049dd39be636b06bc00b652.zip
qemu-8d90bfc5c31ad60f6049dd39be636b06bc00b652.tar.gz
qemu-8d90bfc5c31ad60f6049dd39be636b06bc00b652.tar.bz2
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20200901' into staging
target-arm queue: * Implement fp16 support for AArch32 VFP and Neon * hw/arm/sbsa-ref: add "reg" property to DT cpu nodes * hw/arm/sbsa-ref : Add embedded controller in secure memory # gpg: Signature made Tue 01 Sep 2020 16:17:23 BST # gpg: using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE # gpg: issuer "peter.maydell@linaro.org" # gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate] # gpg: aka "Peter Maydell <pmaydell@gmail.com>" [ultimate] # gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate] # Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE * remotes/pmaydell/tags/pull-target-arm-20200901: (47 commits) hw/arm/sbsa-ref : Add embedded controller in secure memory hw/misc/sbsa_ec : Add an embedded controller for sbsa-ref hw/arm/sbsa-ref: add "reg" property to DT cpu nodes target/arm: Enable FP16 in '-cpu max' target/arm: Implement fp16 for Neon VMUL, VMLA, VMLS target/arm/vec_helper: Add gvec fp indexed multiply-and-add operations target/arm/vec_helper: Handle oprsz less than 16 bytes in indexed operations target/arm: Implement fp16 for Neon VRINTX target/arm: Implement fp16 for Neon VRINT-with-specified-rounding-mode target/arm: Implement fp16 for Neon VCVT with rounding modes target/arm: Implement fp16 for Neon VCVT fixed-point target/arm: Convert Neon VCVT fixed-point to gvec target/arm: Implement fp16 for Neon float-integer VCVT target/arm: Implement fp16 for Neon pairwise fp ops target/arm: Implement fp16 for Neon VRSQRTS target/arm: Implement fp16 for Neon VRECPS target/arm: Implement fp16 for Neon fp compare-vs-0 target/arm: Implement fp16 for Neon VFMA, VMFS target/arm: Implement fp16 for Neon VMLA, VMLS operations target/arm: Implement fp16 for Neon VMAXNM, VMINNM ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--hw/arm/sbsa-ref.c43
-rw-r--r--hw/misc/meson.build2
-rw-r--r--hw/misc/sbsa_ec.c98
-rw-r--r--target/arm/cpu.c3
-rw-r--r--target/arm/cpu.h7
-rw-r--r--target/arm/cpu64.c10
-rw-r--r--target/arm/helper-a64.c11
-rw-r--r--target/arm/helper.h133
-rw-r--r--target/arm/neon-dp.decode8
-rw-r--r--target/arm/translate-neon.c.inc765
-rw-r--r--target/arm/translate-sve.c4
-rw-r--r--target/arm/translate-vfp.c.inc810
-rw-r--r--target/arm/vec_helper.c431
-rw-r--r--target/arm/vfp-uncond.decode27
-rw-r--r--target/arm/vfp.decode34
-rw-r--r--target/arm/vfp_helper.c244
16 files changed, 1824 insertions, 806 deletions
diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
index 2a7d9a6..47b5286 100644
--- a/hw/arm/sbsa-ref.c
+++ b/hw/arm/sbsa-ref.c
@@ -62,6 +62,7 @@ enum {
SBSA_CPUPERIPHS,
SBSA_GIC_DIST,
SBSA_GIC_REDIST,
+ SBSA_SECURE_EC,
SBSA_SMMU,
SBSA_UART,
SBSA_RTC,
@@ -107,6 +108,7 @@ static const MemMapEntry sbsa_ref_memmap[] = {
[SBSA_CPUPERIPHS] = { 0x40000000, 0x00040000 },
[SBSA_GIC_DIST] = { 0x40060000, 0x00010000 },
[SBSA_GIC_REDIST] = { 0x40080000, 0x04000000 },
+ [SBSA_SECURE_EC] = { 0x50000000, 0x00001000 },
[SBSA_UART] = { 0x60000000, 0x00001000 },
[SBSA_RTC] = { 0x60010000, 0x00001000 },
[SBSA_GPIO] = { 0x60020000, 0x00001000 },
@@ -138,6 +140,12 @@ static const int sbsa_ref_irqmap[] = {
[SBSA_EHCI] = 11,
};
+static uint64_t sbsa_ref_cpu_mp_affinity(SBSAMachineState *sms, int idx)
+{
+ uint8_t clustersz = ARM_DEFAULT_CPUS_PER_CLUSTER;
+ return arm_cpu_mp_affinity(idx, clustersz);
+}
+
/*
* Firmware on this machine only uses ACPI table to load OS, these limited
* device tree nodes are just to let firmware know the info which varies from
@@ -183,14 +191,31 @@ static void create_fdt(SBSAMachineState *sms)
g_free(matrix);
}
+ /*
+ * From Documentation/devicetree/bindings/arm/cpus.yaml
+ * On ARM v8 64-bit systems this property is required
+ * and matches the MPIDR_EL1 register affinity bits.
+ *
+ * * If cpus node's #address-cells property is set to 2
+ *
+ * The first reg cell bits [7:0] must be set to
+ * bits [39:32] of MPIDR_EL1.
+ *
+ * The second reg cell bits [23:0] must be set to
+ * bits [23:0] of MPIDR_EL1.
+ */
qemu_fdt_add_subnode(sms->fdt, "/cpus");
+ qemu_fdt_setprop_cell(sms->fdt, "/cpus", "#address-cells", 2);
+ qemu_fdt_setprop_cell(sms->fdt, "/cpus", "#size-cells", 0x0);
for (cpu = sms->smp_cpus - 1; cpu >= 0; cpu--) {
char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu);
ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu));
CPUState *cs = CPU(armcpu);
+ uint64_t mpidr = sbsa_ref_cpu_mp_affinity(sms, cpu);
qemu_fdt_add_subnode(sms->fdt, nodename);
+ qemu_fdt_setprop_u64(sms->fdt, nodename, "reg", mpidr);
if (ms->possible_cpus->cpus[cs->cpu_index].props.has_node_id) {
qemu_fdt_setprop_cell(sms->fdt, nodename, "numa-node-id",
@@ -585,6 +610,16 @@ static void *sbsa_ref_dtb(const struct arm_boot_info *binfo, int *fdt_size)
return board->fdt;
}
+static void create_secure_ec(MemoryRegion *mem)
+{
+ hwaddr base = sbsa_ref_memmap[SBSA_SECURE_EC].base;
+ DeviceState *dev = qdev_new("sbsa-ec");
+ SysBusDevice *s = SYS_BUS_DEVICE(dev);
+
+ memory_region_add_subregion(mem, base,
+ sysbus_mmio_get_region(s, 0));
+}
+
static void sbsa_ref_init(MachineState *machine)
{
unsigned int smp_cpus = machine->smp.cpus;
@@ -708,6 +743,8 @@ static void sbsa_ref_init(MachineState *machine)
create_pcie(sms);
+ create_secure_ec(secure_sysmem);
+
sms->bootinfo.ram_size = machine->ram_size;
sms->bootinfo.nb_cpus = smp_cpus;
sms->bootinfo.board_id = -1;
@@ -717,12 +754,6 @@ static void sbsa_ref_init(MachineState *machine)
arm_load_kernel(ARM_CPU(first_cpu), machine, &sms->bootinfo);
}
-static uint64_t sbsa_ref_cpu_mp_affinity(SBSAMachineState *sms, int idx)
-{
- uint8_t clustersz = ARM_DEFAULT_CPUS_PER_CLUSTER;
- return arm_cpu_mp_affinity(idx, clustersz);
-}
-
static const CPUArchIdList *sbsa_ref_possible_cpu_arch_ids(MachineState *ms)
{
unsigned int max_cpus = ms->smp.max_cpus;
diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index 84fed04..e1576b8 100644
--- a/hw/misc/meson.build
+++ b/hw/misc/meson.build
@@ -97,3 +97,5 @@ specific_ss.add(when: 'CONFIG_MAC_VIA', if_true: files('mac_via.c'))
specific_ss.add(when: 'CONFIG_MIPS_CPS', if_true: files('mips_cmgcr.c', 'mips_cpc.c'))
specific_ss.add(when: 'CONFIG_MIPS_ITU', if_true: files('mips_itu.c'))
+
+specific_ss.add(when: 'CONFIG_SBSA_REF', if_true: files('sbsa_ec.c'))
diff --git a/hw/misc/sbsa_ec.c b/hw/misc/sbsa_ec.c
new file mode 100644
index 0000000..9a7d7f9
--- /dev/null
+++ b/hw/misc/sbsa_ec.c
@@ -0,0 +1,98 @@
+/*
+ * ARM SBSA Reference Platform Embedded Controller
+ *
+ * A device to allow PSCI running in the secure side of sbsa-ref machine
+ * to communicate platform power states to qemu.
+ *
+ * Copyright (c) 2020 Nuvia Inc
+ * Written by Graeme Gregory <graeme@nuviainc.com>
+ *
+ * SPDX-License-Identifer: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/log.h"
+#include "hw/sysbus.h"
+#include "sysemu/runstate.h"
+
+typedef struct {
+ SysBusDevice parent_obj;
+ MemoryRegion iomem;
+} SECUREECState;
+
+#define TYPE_SBSA_EC "sbsa-ec"
+#define SECURE_EC(obj) OBJECT_CHECK(SECUREECState, (obj), TYPE_SBSA_EC)
+
+enum sbsa_ec_powerstates {
+ SBSA_EC_CMD_POWEROFF = 0x01,
+ SBSA_EC_CMD_REBOOT = 0x02,
+};
+
+static uint64_t sbsa_ec_read(void *opaque, hwaddr offset, unsigned size)
+{
+ /* No use for this currently */
+ qemu_log_mask(LOG_GUEST_ERROR, "sbsa-ec: no readable registers");
+ return 0;
+}
+
+static void sbsa_ec_write(void *opaque, hwaddr offset,
+ uint64_t value, unsigned size)
+{
+ if (offset == 0) { /* PSCI machine power command register */
+ switch (value) {
+ case SBSA_EC_CMD_POWEROFF:
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+ break;
+ case SBSA_EC_CMD_REBOOT:
+ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+ break;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "sbsa-ec: unknown power command");
+ }
+ } else {
+ qemu_log_mask(LOG_GUEST_ERROR, "sbsa-ec: unknown EC register");
+ }
+}
+
+static const MemoryRegionOps sbsa_ec_ops = {
+ .read = sbsa_ec_read,
+ .write = sbsa_ec_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+ .valid.min_access_size = 4,
+ .valid.max_access_size = 4,
+};
+
+static void sbsa_ec_init(Object *obj)
+{
+ SECUREECState *s = SECURE_EC(obj);
+ SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+
+ memory_region_init_io(&s->iomem, obj, &sbsa_ec_ops, s, "sbsa-ec",
+ 0x1000);
+ sysbus_init_mmio(dev, &s->iomem);
+}
+
+static void sbsa_ec_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ /* No vmstate or reset required: device has no internal state */
+ dc->user_creatable = false;
+}
+
+static const TypeInfo sbsa_ec_info = {
+ .name = TYPE_SBSA_EC,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(SECUREECState),
+ .instance_init = sbsa_ec_init,
+ .class_init = sbsa_ec_class_init,
+};
+
+static void sbsa_ec_register_type(void)
+{
+ type_register_static(&sbsa_ec_info);
+}
+
+type_init(sbsa_ec_register_type);
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 6b382fc..c179e07 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2143,7 +2143,8 @@ static void arm_max_initfn(Object *obj)
cpu->isar.id_isar6 = t;
t = cpu->isar.mvfr1;
- t = FIELD_DP32(t, MVFR1, FPHP, 2); /* v8.0 FP support */
+ t = FIELD_DP32(t, MVFR1, FPHP, 3); /* v8.2-FP16 */
+ t = FIELD_DP32(t, MVFR1, SIMDHP, 2); /* v8.2-FP16 */
cpu->isar.mvfr1 = t;
t = cpu->isar.mvfr2;
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index ac857bd..a1c7d8e 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3519,12 +3519,7 @@ static inline bool isar_feature_aa32_predinv(const ARMISARegisters *id)
static inline bool isar_feature_aa32_fp16_arith(const ARMISARegisters *id)
{
- /*
- * This is a placeholder for use by VCMA until the rest of
- * the ARMv8.2-FP16 extension is implemented for aa32 mode.
- * At which point we can properly set and check MVFR1.FPHP.
- */
- return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, FP) == 1;
+ return FIELD_EX32(id->mvfr1, MVFR1, FPHP) >= 3;
}
static inline bool isar_feature_aa32_vfp_simd(const ARMISARegisters *id)
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index dd69618..3c2b3d9 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -704,12 +704,10 @@ static void aarch64_max_initfn(Object *obj)
u = FIELD_DP32(u, ID_DFR0, PERFMON, 5); /* v8.4-PMU */
cpu->isar.id_dfr0 = u;
- /*
- * FIXME: We do not yet support ARMv8.2-fp16 for AArch32 yet,
- * so do not set MVFR1.FPHP. Strictly speaking this is not legal,
- * but it is also not legal to enable SVE without support for FP16,
- * and enabling SVE in system mode is more useful in the short term.
- */
+ u = cpu->isar.mvfr1;
+ u = FIELD_DP32(u, MVFR1, FPHP, 3); /* v8.2-FP16 */
+ u = FIELD_DP32(u, MVFR1, SIMDHP, 2); /* v8.2-FP16 */
+ cpu->isar.mvfr1 = u;
#ifdef CONFIG_USER_ONLY
/* For usermode -cpu max we can use a larger and more efficient DCZ
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 8682630..0308214 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -234,17 +234,6 @@ uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
* versions, these do a fully fused multiply-add or
* multiply-add-and-halve.
*/
-#define float16_two make_float16(0x4000)
-#define float16_three make_float16(0x4200)
-#define float16_one_point_five make_float16(0x3e00)
-
-#define float32_two make_float32(0x40000000)
-#define float32_three make_float32(0x40400000)
-#define float32_one_point_five make_float32(0x3fc00000)
-
-#define float64_two make_float64(0x4000000000000000ULL)
-#define float64_three make_float64(0x4008000000000000ULL)
-#define float64_one_point_five make_float64(0x3FF8000000000000ULL)
uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
{
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 3ca73a1..8defd7c 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -101,30 +101,43 @@ DEF_HELPER_FLAGS_5(probe_access, TCG_CALL_NO_WG, void, env, tl, i32, i32, i32)
DEF_HELPER_1(vfp_get_fpscr, i32, env)
DEF_HELPER_2(vfp_set_fpscr, void, env, i32)
+DEF_HELPER_3(vfp_addh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_adds, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_addd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_subh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_subs, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_subd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_mulh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_muls, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_muld, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_divh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_divs, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_divd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_maxh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_maxs, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_maxd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_minh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_mins, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_mind, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_maxnumh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_maxnums, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_maxnumd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_minnumh, f16, f16, f16, ptr)
DEF_HELPER_3(vfp_minnums, f32, f32, f32, ptr)
DEF_HELPER_3(vfp_minnumd, f64, f64, f64, ptr)
+DEF_HELPER_1(vfp_negh, f16, f16)
DEF_HELPER_1(vfp_negs, f32, f32)
DEF_HELPER_1(vfp_negd, f64, f64)
+DEF_HELPER_1(vfp_absh, f16, f16)
DEF_HELPER_1(vfp_abss, f32, f32)
DEF_HELPER_1(vfp_absd, f64, f64)
+DEF_HELPER_2(vfp_sqrth, f16, f16, env)
DEF_HELPER_2(vfp_sqrts, f32, f32, env)
DEF_HELPER_2(vfp_sqrtd, f64, f64, env)
+DEF_HELPER_3(vfp_cmph, void, f16, f16, env)
DEF_HELPER_3(vfp_cmps, void, f32, f32, env)
DEF_HELPER_3(vfp_cmpd, void, f64, f64, env)
+DEF_HELPER_3(vfp_cmpeh, void, f16, f16, env)
DEF_HELPER_3(vfp_cmpes, void, f32, f32, env)
DEF_HELPER_3(vfp_cmped, void, f64, f64, env)
@@ -151,6 +164,10 @@ DEF_HELPER_2(vfp_tosizh, s32, f16, ptr)
DEF_HELPER_2(vfp_tosizs, s32, f32, ptr)
DEF_HELPER_2(vfp_tosizd, s32, f64, ptr)
+DEF_HELPER_3(vfp_toshh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_toslh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_touhh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_toulh_round_to_zero, i32, f16, i32, ptr)
DEF_HELPER_3(vfp_toshs_round_to_zero, i32, f32, i32, ptr)
DEF_HELPER_3(vfp_tosls_round_to_zero, i32, f32, i32, ptr)
DEF_HELPER_3(vfp_touhs_round_to_zero, i32, f32, i32, ptr)
@@ -189,13 +206,14 @@ DEF_HELPER_3(vfp_sqtod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_uhtod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_ultod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_uqtod, f64, i64, i32, ptr)
+DEF_HELPER_3(vfp_shtoh, f16, i32, i32, ptr)
+DEF_HELPER_3(vfp_uhtoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_sltoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_ultoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_sqtoh, f16, i64, i32, ptr)
DEF_HELPER_3(vfp_uqtoh, f16, i64, i32, ptr)
DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, ptr)
-DEF_HELPER_FLAGS_2(set_neon_rmode, TCG_CALL_NO_RWG, i32, i32, env)
DEF_HELPER_FLAGS_3(vfp_fcvt_f16_to_f32, TCG_CALL_NO_RWG, f32, f16, ptr, i32)
DEF_HELPER_FLAGS_3(vfp_fcvt_f32_to_f16, TCG_CALL_NO_RWG, f16, f32, ptr, i32)
@@ -204,9 +222,8 @@ DEF_HELPER_FLAGS_3(vfp_fcvt_f64_to_f16, TCG_CALL_NO_RWG, f16, f64, ptr, i32)
DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
+DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, ptr)
-DEF_HELPER_3(recps_f32, f32, env, f32, f32)
-DEF_HELPER_3(rsqrts_f32, f32, env, f32, f32)
DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
@@ -222,8 +239,10 @@ DEF_HELPER_3(shr_cc, i32, env, i32, i32)
DEF_HELPER_3(sar_cc, i32, env, i32, i32)
DEF_HELPER_3(ror_cc, i32, env, i32, i32)
+DEF_HELPER_FLAGS_2(rinth_exact, TCG_CALL_NO_RWG, f16, f16, ptr)
DEF_HELPER_FLAGS_2(rints_exact, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(rintd_exact, TCG_CALL_NO_RWG, f64, f64, ptr)
+DEF_HELPER_FLAGS_2(rinth, TCG_CALL_NO_RWG, f16, f16, ptr)
DEF_HELPER_FLAGS_2(rints, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(rintd, TCG_CALL_NO_RWG, f64, f64, ptr)
@@ -587,6 +606,43 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_paddh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pminh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_padds, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmins, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sstoh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sitos, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ustoh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uitos, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_tosszh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_tosizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_touszh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_touizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vcvt_sf, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_uf, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_fs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_fu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_hs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_hu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_ss, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_us, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_sh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vrint_rm_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vrint_rm_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vrintx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vrintx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -595,6 +651,21 @@ DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcgt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcgt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fcge0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcge0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fceq0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fceq0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fcle0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcle0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fclt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fclt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
@@ -607,8 +678,54 @@ DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fceq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fceq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcge_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_facge_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_facge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_facgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_facgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmax_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmax_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmin_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmaxnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_recps_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_recps_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
@@ -623,6 +740,16 @@ DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index 686f9fb..1e9e859 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -254,6 +254,8 @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
# We use size=0 for fp32 and size=1 for fp16 to match the 3-same encodings.
@2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \
&2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i5
+@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
@@ -370,7 +372,11 @@ VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
# VCVT fixed<->float conversions
-# TODO: FP16 fixed<->float conversions are opc==0b1100 and 0b1101
+VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+
VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index 9879731a..2d49263 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -1033,122 +1033,54 @@ DO_3SAME_PAIR(VPADD, padd_u)
DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
-static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
- bool reads_vd)
-{
- /*
- * FP operations handled elementwise 32 bits at a time.
- * If reads_vd is true then the old value of Vd will be
- * loaded before calling the callback function. This is
- * used for multiply-accumulate type operations.
- */
- TCGv_i32 tmp, tmp2;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vn | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vn | a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- tmp = neon_load_reg(a->vn, pass);
- tmp2 = neon_load_reg(a->vm, pass);
- if (reads_vd) {
- TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
- fn(tmp_rd, tmp, tmp2, fpstatus);
- neon_store_reg(a->vd, pass, tmp_rd);
- tcg_temp_free_i32(tmp);
- } else {
- fn(tmp, tmp, tmp2, fpstatus);
- neon_store_reg(a->vd, pass, tmp);
- }
- tcg_temp_free_i32(tmp2);
- }
- tcg_temp_free_ptr(fpstatus);
- return true;
-}
-
-/*
- * For all the functions using this macro, size == 1 means fp16,
- * which is an architecture extension we don't implement yet.
- */
-#define DO_3S_FP_GVEC(INSN,FUNC) \
- static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
- uint32_t rn_ofs, uint32_t rm_ofs, \
- uint32_t oprsz, uint32_t maxsz) \
+#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
+ static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rn_ofs, uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
{ \
- TCGv_ptr fpst = fpstatus_ptr(FPST_STD); \
+ TCGv_ptr fpst = fpstatus_ptr(FPST); \
tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
oprsz, maxsz, 0, FUNC); \
tcg_temp_free_ptr(fpst); \
- } \
+ }
+
+#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \
+ WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \
+ WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \
static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
{ \
if (a->size != 0) { \
- /* TODO fp16 support */ \
- return false; \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_3same(s, a, gen_##INSN##_fp16_3s); \
} \
- return do_3same(s, a, gen_##INSN##_3s); \
- }
-
-
-DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
-DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
-DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
-DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)
-
-/*
- * For all the functions using this macro, size == 1 means fp16,
- * which is an architecture extension we don't implement yet.
- */
-#define DO_3S_FP(INSN,FUNC,READS_VD) \
- static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
- { \
- if (a->size != 0) { \
- /* TODO fp16 support */ \
- return false; \
- } \
- return do_3same_fp(s, a, FUNC, READS_VD); \
- }
-
-DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
-DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
-DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
-DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
-DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
-DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
-DO_3S_FP(VMIN, gen_helper_vfp_mins, false)
-
-static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
-{
- gen_helper_vfp_muls(vn, vn, vm, fpstatus);
- gen_helper_vfp_adds(vd, vd, vn, fpstatus);
-}
-
-static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
-{
- gen_helper_vfp_muls(vn, vn, vm, fpstatus);
- gen_helper_vfp_subs(vd, vd, vn, fpstatus);
-}
-
-DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
-DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
+ return do_3same(s, a, gen_##INSN##_fp32_3s); \
+ }
+
+
+DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
+DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
+DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
+DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
+DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
+DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
+DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
+DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
+DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
+DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
+DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
+DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
+DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
+DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
+DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
+DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
+DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
+
+WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
+WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
+WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
+WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
@@ -1157,11 +1089,12 @@ static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
}
if (a->size != 0) {
- /* TODO fp16 support */
- return false;
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_3same(s, a, gen_VMAXNM_fp16_3s);
}
-
- return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
+ return do_3same(s, a, gen_VMAXNM_fp32_3s);
}
static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
@@ -1171,98 +1104,18 @@ static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
}
if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
-}
-
-WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)
-
-static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
- uint32_t rn_ofs, uint32_t rm_ofs,
- uint32_t oprsz, uint32_t maxsz)
-{
- static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
- tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
-}
-
-static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same(s, a, gen_VRECPS_fp_3s);
-}
-
-WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)
-
-static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
- uint32_t rn_ofs, uint32_t rm_ofs,
- uint32_t oprsz, uint32_t maxsz)
-{
- static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
- tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
-}
-
-static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same(s, a, gen_VRSQRTS_fp_3s);
-}
-
-static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
-{
- gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (!dc_isar_feature(aa32_simdfmac, s)) {
- return false;
- }
-
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ return do_3same(s, a, gen_VMINNM_fp16_3s);
}
-
- return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
+ return do_3same(s, a, gen_VMINNM_fp32_3s);
}
-static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
+static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
+ gen_helper_gvec_3_ptr *fn)
{
- gen_helper_vfp_negs(vn, vn);
- gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (!dc_isar_feature(aa32_simdfmac, s)) {
- return false;
- }
-
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
-}
-
-static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
-{
- /* FP operations handled pairwise 32 bits at a time */
- TCGv_i32 tmp, tmp2, tmp3;
+ /* FP pairwise operations */
TCGv_ptr fpstatus;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@@ -1281,26 +1134,14 @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
assert(a->q == 0); /* enforced by decode patterns */
- /*
- * Note that we have to be careful not to clobber the source operands
- * in the "vm == vd" case by storing the result of the first pass too
- * early. Since Q is 0 there are always just two passes, so instead
- * of a complicated loop over each pass we just unroll.
- */
- fpstatus = fpstatus_ptr(FPST_STD);
- tmp = neon_load_reg(a->vn, 0);
- tmp2 = neon_load_reg(a->vn, 1);
- fn(tmp, tmp, tmp2, fpstatus);
- tcg_temp_free_i32(tmp2);
- tmp3 = neon_load_reg(a->vm, 0);
- tmp2 = neon_load_reg(a->vm, 1);
- fn(tmp3, tmp3, tmp2, fpstatus);
- tcg_temp_free_i32(tmp2);
+ fpstatus = fpstatus_ptr(a->size != 0 ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+ vfp_reg_offset(1, a->vn),
+ vfp_reg_offset(1, a->vm),
+ fpstatus, 8, 8, 0, fn);
tcg_temp_free_ptr(fpstatus);
- neon_store_reg(a->vd, 0, tmp);
- neon_store_reg(a->vd, 1, tmp3);
return true;
}
@@ -1312,15 +1153,17 @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
{ \
if (a->size != 0) { \
- /* TODO fp16 support */ \
- return false; \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_3same_fp_pair(s, a, FUNC##h); \
} \
- return do_3same_fp_pair(s, a, FUNC); \
+ return do_3same_fp_pair(s, a, FUNC##s); \
}
-DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
-DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
-DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)
+DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
+DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
+DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
@@ -1765,17 +1608,24 @@ static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
}
static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
- NeonGenTwoSingleOpFn *fn)
+ gen_helper_gvec_2_ptr *fn)
{
/* FP operations in 2-reg-and-shift group */
- TCGv_i32 tmp, shiftv;
- TCGv_ptr fpstatus;
- int pass;
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_reg_offset(a->vd, 0);
+ int rm_ofs = neon_reg_offset(a->vm, 0);
+ TCGv_ptr fpst;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
}
+ if (a->size != 0) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ }
+
/* UNDEF accesses to D16-D31 if they don't exist. */
if (!dc_isar_feature(aa32_simd_r32, s) &&
((a->vd | a->vm) & 0x10)) {
@@ -1790,15 +1640,9 @@ static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
return true;
}
- fpstatus = fpstatus_ptr(FPST_STD);
- shiftv = tcg_const_i32(a->shift);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- tmp = neon_load_reg(a->vm, pass);
- fn(tmp, tmp, shiftv, fpstatus);
- neon_store_reg(a->vd, pass, tmp);
- }
- tcg_temp_free_ptr(fpstatus);
- tcg_temp_free_i32(shiftv);
+ fpst = fpstatus_ptr(a->size ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
+ tcg_temp_free_ptr(fpst);
return true;
}
@@ -1808,10 +1652,15 @@ static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
return do_fp_2sh(s, a, FUNC); \
}
-DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
-DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
-DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
-DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
+DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
+DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
+DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
+DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
+
+DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
+DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
+DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
+DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
{
@@ -2583,70 +2432,70 @@ static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
}
-/*
- * Rather than have a float-specific version of do_2scalar just for
- * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
- * a NeonGenTwoOpFn.
- */
-#define WRAP_FP_FN(WRAPNAME, FUNC) \
- static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
- { \
- TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD); \
- FUNC(rd, rn, rm, fpstatus); \
- tcg_temp_free_ptr(fpstatus); \
+static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
+ gen_helper_gvec_3_ptr *fn)
+{
+ /* Two registers and a scalar, using gvec */
+ int vec_size = a->q ? 16 : 8;
+ int rd_ofs = neon_reg_offset(a->vd, 0);
+ int rn_ofs = neon_reg_offset(a->vn, 0);
+ int rm_ofs;
+ int idx;
+ TCGv_ptr fpstatus;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
}
-WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
-WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
-WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
-static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
-{
- static NeonGenTwoOpFn * const opfn[] = {
- NULL,
- NULL, /* TODO: fp16 support */
- gen_VMUL_F_mul,
- NULL,
- };
+ if (!fn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
- return do_2scalar(s, a, opfn[a->size], NULL);
-}
+ if (a->q && ((a->vd | a->vn) & 1)) {
+ return false;
+ }
-static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
-{
- static NeonGenTwoOpFn * const opfn[] = {
- NULL,
- NULL, /* TODO: fp16 support */
- gen_VMUL_F_mul,
- NULL,
- };
- static NeonGenTwoOpFn * const accfn[] = {
- NULL,
- NULL, /* TODO: fp16 support */
- gen_VMUL_F_add,
- NULL,
- };
+ if (!vfp_access_check(s)) {
+ return true;
+ }
- return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
+ /* a->vm is M:Vm, which encodes both register and index */
+ idx = extract32(a->vm, a->size + 2, 2);
+ a->vm = extract32(a->vm, 0, a->size + 2);
+ rm_ofs = neon_reg_offset(a->vm, 0);
+
+ fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
+ tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
+ vec_size, vec_size, idx, fn);
+ tcg_temp_free_ptr(fpstatus);
+ return true;
}
-static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
-{
- static NeonGenTwoOpFn * const opfn[] = {
- NULL,
- NULL, /* TODO: fp16 support */
- gen_VMUL_F_mul,
- NULL,
- };
- static NeonGenTwoOpFn * const accfn[] = {
- NULL,
- NULL, /* TODO: fp16 support */
- gen_VMUL_F_sub,
- NULL,
- };
+#define DO_VMUL_F_2sc(NAME, FUNC) \
+ static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \
+ { \
+ static gen_helper_gvec_3_ptr * const opfn[] = { \
+ NULL, \
+ gen_helper_##FUNC##_h, \
+ gen_helper_##FUNC##_s, \
+ NULL, \
+ }; \
+ if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ return do_2scalar_fp_vec(s, a, opfn[a->size]); \
+ }
- return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
-}
+DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
+DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
+DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
@@ -3739,22 +3588,44 @@ static bool trans_VCNT(DisasContext *s, arg_2misc *a)
return do_2misc(s, a, gen_helper_neon_cnt_u8);
}
+static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
+ vece == MO_16 ? 0x7fff : 0x7fffffff,
+ oprsz, maxsz);
+}
+
static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
{
- if (a->size != 2) {
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ } else if (a->size != MO_32) {
return false;
}
- /* TODO: FP16 : size == 1 */
- return do_2misc(s, a, gen_helper_vfp_abss);
+ return do_2misc_vec(s, a, gen_VABS_F);
+}
+
+static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
+ vece == MO_16 ? 0x8000 : 0x80000000,
+ oprsz, maxsz);
}
static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
{
- if (a->size != 2) {
+ if (a->size == MO_16) {
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+ } else if (a->size != MO_32) {
return false;
}
- /* TODO: FP16 : size == 1 */
- return do_2misc(s, a, gen_helper_vfp_negs);
+ return do_2misc_vec(s, a, gen_VNEG_F);
}
static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
@@ -3808,226 +3679,100 @@ static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
return do_2misc(s, a, fn[a->size]);
}
-static bool do_2misc_fp(DisasContext *s, arg_2misc *a,
- NeonGenOneSingleOpFn *fn)
-{
- int pass;
- TCGv_ptr fpst;
-
- /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if (a->size != 2) {
- /* TODO: FP16 will be the size == 1 case */
- return false;
- }
-
- if ((a->vd | a->vm) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- fpst = fpstatus_ptr(FPST_STD);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- TCGv_i32 tmp = neon_load_reg(a->vm, pass);
- fn(tmp, tmp, fpst);
- neon_store_reg(a->vd, pass, tmp);
+#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
+ static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static gen_helper_gvec_2_ptr * const fns[4] = { \
+ NULL, HFUNC, SFUNC, NULL, \
+ }; \
+ TCGv_ptr fpst; \
+ fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \
+ fns[vece]); \
+ tcg_temp_free_ptr(fpst); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ } else if (a->size != MO_32) { \
+ return false; \
+ } \
+ return do_2misc_vec(s, a, gen_##INSN); \
}
- tcg_temp_free_ptr(fpst);
- return true;
-}
-
-#define DO_2MISC_FP(INSN, FUNC) \
- static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
- { \
- return do_2misc_fp(s, a, FUNC); \
- }
+DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
+DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
+DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
+DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
+DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
+DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
+DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
+DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
+DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
+DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
+DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
-DO_2MISC_FP(VRECPE_F, gen_helper_recpe_f32)
-DO_2MISC_FP(VRSQRTE_F, gen_helper_rsqrte_f32)
-DO_2MISC_FP(VCVT_FS, gen_helper_vfp_sitos)
-DO_2MISC_FP(VCVT_FU, gen_helper_vfp_uitos)
-DO_2MISC_FP(VCVT_SF, gen_helper_vfp_tosizs)
-DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs)
+DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
{
if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
return false;
}
- return do_2misc_fp(s, a, gen_helper_rints_exact);
+ return trans_VRINTX_impl(s, a);
}
-#define WRAP_FP_CMP0_FWD(WRAPNAME, FUNC) \
- static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
- { \
- TCGv_i32 zero = tcg_const_i32(0); \
- FUNC(d, m, zero, fpst); \
- tcg_temp_free_i32(zero); \
- }
-#define WRAP_FP_CMP0_REV(WRAPNAME, FUNC) \
- static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
- { \
- TCGv_i32 zero = tcg_const_i32(0); \
- FUNC(d, zero, m, fpst); \
- tcg_temp_free_i32(zero); \
- }
-
-#define DO_FP_CMP0(INSN, FUNC, REV) \
- WRAP_FP_CMP0_##REV(gen_##INSN, FUNC) \
- static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
- { \
- return do_2misc_fp(s, a, gen_##INSN); \
- }
-
-DO_FP_CMP0(VCGT0_F, gen_helper_neon_cgt_f32, FWD)
-DO_FP_CMP0(VCGE0_F, gen_helper_neon_cge_f32, FWD)
-DO_FP_CMP0(VCEQ0_F, gen_helper_neon_ceq_f32, FWD)
-DO_FP_CMP0(VCLE0_F, gen_helper_neon_cge_f32, REV)
-DO_FP_CMP0(VCLT0_F, gen_helper_neon_cgt_f32, REV)
-
-static bool do_vrint(DisasContext *s, arg_2misc *a, int rmode)
-{
- /*
- * Handle a VRINT* operation by iterating 32 bits at a time,
- * with a specified rounding mode in operation.
- */
- int pass;
- TCGv_ptr fpst;
- TCGv_i32 tcg_rmode;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
- !arm_dc_feature(s, ARM_FEATURE_V8)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if (a->size != 2) {
- /* TODO: FP16 will be the size == 1 case */
- return false;
- }
-
- if ((a->vd | a->vm) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- fpst = fpstatus_ptr(FPST_STD);
- tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
- gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- TCGv_i32 tmp = neon_load_reg(a->vm, pass);
- gen_helper_rints(tmp, tmp, fpst);
- neon_store_reg(a->vd, pass, tmp);
- }
- gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
- tcg_temp_free_i32(tcg_rmode);
- tcg_temp_free_ptr(fpst);
-
- return true;
-}
-
-#define DO_VRINT(INSN, RMODE) \
- static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
- { \
- return do_vrint(s, a, RMODE); \
- }
-
-DO_VRINT(VRINTN, FPROUNDING_TIEEVEN)
-DO_VRINT(VRINTA, FPROUNDING_TIEAWAY)
-DO_VRINT(VRINTZ, FPROUNDING_ZERO)
-DO_VRINT(VRINTM, FPROUNDING_NEGINF)
-DO_VRINT(VRINTP, FPROUNDING_POSINF)
-
-static bool do_vcvt(DisasContext *s, arg_2misc *a, int rmode, bool is_signed)
-{
- /*
- * Handle a VCVT* operation by iterating 32 bits at a time,
- * with a specified rounding mode in operation.
- */
- int pass;
- TCGv_ptr fpst;
- TCGv_i32 tcg_rmode, tcg_shift;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
- !arm_dc_feature(s, ARM_FEATURE_V8)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if (a->size != 2) {
- /* TODO: FP16 will be the size == 1 case */
- return false;
- }
-
- if ((a->vd | a->vm) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- fpst = fpstatus_ptr(FPST_STD);
- tcg_shift = tcg_const_i32(0);
- tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
- gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- TCGv_i32 tmp = neon_load_reg(a->vm, pass);
- if (is_signed) {
- gen_helper_vfp_tosls(tmp, tmp, tcg_shift, fpst);
- } else {
- gen_helper_vfp_touls(tmp, tmp, tcg_shift, fpst);
- }
- neon_store_reg(a->vd, pass, tmp);
- }
- gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
- tcg_temp_free_i32(tcg_rmode);
- tcg_temp_free_i32(tcg_shift);
- tcg_temp_free_ptr(fpst);
-
- return true;
-}
-
-#define DO_VCVT(INSN, RMODE, SIGNED) \
- static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
- { \
- return do_vcvt(s, a, RMODE, SIGNED); \
- }
-
-DO_VCVT(VCVTAU, FPROUNDING_TIEAWAY, false)
-DO_VCVT(VCVTAS, FPROUNDING_TIEAWAY, true)
-DO_VCVT(VCVTNU, FPROUNDING_TIEEVEN, false)
-DO_VCVT(VCVTNS, FPROUNDING_TIEEVEN, true)
-DO_VCVT(VCVTPU, FPROUNDING_POSINF, false)
-DO_VCVT(VCVTPS, FPROUNDING_POSINF, true)
-DO_VCVT(VCVTMU, FPROUNDING_NEGINF, false)
-DO_VCVT(VCVTMS, FPROUNDING_NEGINF, true)
+#define DO_VEC_RMODE(INSN, RMODE, OP) \
+ static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+ uint32_t rm_ofs, \
+ uint32_t oprsz, uint32_t maxsz) \
+ { \
+ static gen_helper_gvec_2_ptr * const fns[4] = { \
+ NULL, \
+ gen_helper_gvec_##OP##h, \
+ gen_helper_gvec_##OP##s, \
+ NULL, \
+ }; \
+ TCGv_ptr fpst; \
+ fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \
+ tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \
+ arm_rmode_to_sf(RMODE), fns[vece]); \
+ tcg_temp_free_ptr(fpst); \
+ } \
+ static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+ { \
+ if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \
+ return false; \
+ } \
+ if (a->size == MO_16) { \
+ if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+ return false; \
+ } \
+ } else if (a->size != MO_32) { \
+ return false; \
+ } \
+ return do_2misc_vec(s, a, gen_##INSN); \
+ }
+
+DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
+DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
+DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
+DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
+DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
+DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
+
+DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
+DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
+DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
+DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
+DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
static bool trans_VSWP(DisasContext *s, arg_2misc *a)
{
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 15ad6c7..e4cd6b6 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3803,10 +3803,6 @@ static bool trans_##NAME##_zpzi(DisasContext *s, arg_rpri_esz *a) \
return true; \
}
-#define float16_two make_float16(0x4000)
-#define float32_two make_float32(0x40000000)
-#define float64_two make_float64(0x4000000000000000ULL)
-
DO_FP_IMM(FADD, fadds, half, one)
DO_FP_IMM(FSUB, fsubs, half, one)
DO_FP_IMM(FMUL, fmuls, half, two)
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index 4eeafb4..28e0dba 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -190,18 +190,22 @@ static bool vfp_access_check(DisasContext *s)
static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
{
uint32_t rd, rn, rm;
- bool dp = a->dp;
+ int sz = a->sz;
if (!dc_isar_feature(aa32_vsel, s)) {
return false;
}
- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
return false;
}
/* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) &&
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
((a->vm | a->vn | a->vd) & 0x10)) {
return false;
}
@@ -214,7 +218,7 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
return true;
}
- if (dp) {
+ if (sz == 3) {
TCGv_i64 frn, frm, dest;
TCGv_i64 tmp, zero, zf, nf, vf;
@@ -307,6 +311,10 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
tcg_temp_free_i32(tmp);
break;
}
+ /* For fp16 the top half is always zeroes */
+ if (sz == 1) {
+ tcg_gen_andi_i32(dest, dest, 0xffff);
+ }
neon_store_reg32(dest, rd);
tcg_temp_free_i32(frn);
tcg_temp_free_i32(frm);
@@ -333,7 +341,7 @@ static const uint8_t fp_decode_rm[] = {
static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
{
uint32_t rd, rm;
- bool dp = a->dp;
+ int sz = a->sz;
TCGv_ptr fpst;
TCGv_i32 tcg_rmode;
int rounding = fp_decode_rm[a->rm];
@@ -342,12 +350,16 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
return false;
}
- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
return false;
}
/* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) &&
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
((a->vm | a->vd) & 0x10)) {
return false;
}
@@ -359,12 +371,16 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
return true;
}
- fpst = fpstatus_ptr(FPST_FPCR);
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }
tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
- if (dp) {
+ if (sz == 3) {
TCGv_i64 tcg_op;
TCGv_i64 tcg_res;
tcg_op = tcg_temp_new_i64();
@@ -380,7 +396,11 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
tcg_op = tcg_temp_new_i32();
tcg_res = tcg_temp_new_i32();
neon_load_reg32(tcg_op, rm);
- gen_helper_rints(tcg_res, tcg_op, fpst);
+ if (sz == 1) {
+ gen_helper_rinth(tcg_res, tcg_op, fpst);
+ } else {
+ gen_helper_rints(tcg_res, tcg_op, fpst);
+ }
neon_store_reg32(tcg_res, rd);
tcg_temp_free_i32(tcg_op);
tcg_temp_free_i32(tcg_res);
@@ -396,7 +416,7 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
{
uint32_t rd, rm;
- bool dp = a->dp;
+ int sz = a->sz;
TCGv_ptr fpst;
TCGv_i32 tcg_rmode, tcg_shift;
int rounding = fp_decode_rm[a->rm];
@@ -406,12 +426,16 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
return false;
}
- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
return false;
}
/* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
return false;
}
@@ -422,14 +446,18 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
return true;
}
- fpst = fpstatus_ptr(FPST_FPCR);
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }
tcg_shift = tcg_const_i32(0);
tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
- if (dp) {
+ if (sz == 3) {
TCGv_i64 tcg_double, tcg_res;
TCGv_i32 tcg_tmp;
tcg_double = tcg_temp_new_i64();
@@ -451,10 +479,18 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
tcg_single = tcg_temp_new_i32();
tcg_res = tcg_temp_new_i32();
neon_load_reg32(tcg_single, rm);
- if (is_signed) {
- gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
+ if (sz == 1) {
+ if (is_signed) {
+ gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst);
+ }
} else {
- gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
+ if (is_signed) {
+ gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
+ }
}
neon_store_reg32(tcg_res, rd);
tcg_temp_free_i32(tcg_res);
@@ -773,6 +809,40 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
return true;
}
+static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a)
+{
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (a->rt == 15) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (a->l) {
+ /* VFP to general purpose register */
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vn);
+ tcg_gen_andi_i32(tmp, tmp, 0xffff);
+ store_reg(s, a->rt, tmp);
+ } else {
+ /* general purpose register to VFP */
+ tmp = load_reg(s, a->rt);
+ tcg_gen_andi_i32(tmp, tmp, 0xffff);
+ neon_store_reg32(tmp, a->vn);
+ tcg_temp_free_i32(tmp);
+ }
+
+ return true;
+}
+
static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a)
{
TCGv_i32 tmp;
@@ -886,6 +956,41 @@ static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a)
return true;
}
+static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a)
+{
+ uint32_t offset;
+ TCGv_i32 addr, tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */
+ offset = a->imm << 1;
+ if (!a->u) {
+ offset = -offset;
+ }
+
+ /* For thumb, use of PC is UNPREDICTABLE. */
+ addr = add_reg_for_lit(s, a->rn, offset);
+ tmp = tcg_temp_new_i32();
+ if (a->l) {
+ gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
+ neon_store_reg32(tmp, a->vd);
+ } else {
+ neon_load_reg32(tmp, a->vd);
+ gen_aa32_st16(s, tmp, addr, get_mem_index(s));
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(addr);
+
+ return true;
+}
+
static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a)
{
uint32_t offset;
@@ -1266,6 +1371,54 @@ static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
return true;
}
+static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ /*
+ * Do a half-precision operation. Functionally this is
+ * the same as do_vfp_3op_sp(), except:
+ * - it uses the FPST_FPCR_F16
+ * - it doesn't need the VFP vector handling (fp16 is a
+ * v8 feature, and in v8 VFP vectors don't exist)
+ * - it does the aa32_fp16_arith feature test
+ */
+ TCGv_i32 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ f0 = tcg_temp_new_i32();
+ f1 = tcg_temp_new_i32();
+ fd = tcg_temp_new_i32();
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+
+ neon_load_reg32(f0, vn);
+ neon_load_reg32(f1, vm);
+
+ if (reads_vd) {
+ neon_load_reg32(fd, vd);
+ }
+ fn(fd, f0, f1, fpst);
+ neon_store_reg32(fd, vd);
+
+ tcg_temp_free_i32(f0);
+ tcg_temp_free_i32(f1);
+ tcg_temp_free_i32(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
int vd, int vn, int vm, bool reads_vd)
{
@@ -1421,6 +1574,38 @@ static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
return true;
}
+static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
+{
+ /*
+ * Do a half-precision operation. Functionally this is
+ * the same as do_vfp_2op_sp(), except:
+ * - it doesn't need the VFP vector handling (fp16 is a
+ * v8 feature, and in v8 VFP vectors don't exist)
+ * - it does the aa32_fp16_arith feature test
+ */
+ TCGv_i32 f0;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ f0 = tcg_temp_new_i32();
+ neon_load_reg32(f0, vm);
+ fn(f0, f0);
+ neon_store_reg32(f0, vd);
+ tcg_temp_free_i32(f0);
+
+ return true;
+}
+
static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
{
uint32_t delta_m = 0;
@@ -1499,6 +1684,21 @@ static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
return true;
}
+static void gen_VMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_hp(DisasContext *s, arg_VMLA_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
{
/* Note that order of inputs to the add matters for NaNs */
@@ -1529,6 +1729,25 @@ static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a)
return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true);
}
+static void gen_VMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VMLS: vd = vd + -(vn * vm)
+ * Note that order of inputs to the add matters for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(tmp, tmp);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLS_hp(DisasContext *s, arg_VMLS_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VMLS_hp, a->vd, a->vn, a->vm, true);
+}
+
static void gen_VMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
{
/*
@@ -1567,6 +1786,27 @@ static bool trans_VMLS_dp(DisasContext *s, arg_VMLS_dp *a)
return do_vfp_3op_dp(s, gen_VMLS_dp, a->vd, a->vn, a->vm, true);
}
+static void gen_VNMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /*
+ * VNMLS: -fd + (fn * fm)
+ * Note that it isn't valid to replace (-A + B) with (B - A) or similar
+ * plausible looking simplifications because this will give wrong results
+ * for NaNs.
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(vd, vd);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLS_hp(DisasContext *s, arg_VNMLS_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMLS_hp, a->vd, a->vn, a->vm, true);
+}
+
static void gen_VNMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
{
/*
@@ -1609,6 +1849,23 @@ static bool trans_VNMLS_dp(DisasContext *s, arg_VNMLS_dp *a)
return do_vfp_3op_dp(s, gen_VNMLS_dp, a->vd, a->vn, a->vm, true);
}
+static void gen_VNMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMLA: -fd + -(fn * fm) */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_mulh(tmp, vn, vm, fpst);
+ gen_helper_vfp_negh(tmp, tmp);
+ gen_helper_vfp_negh(vd, vd);
+ gen_helper_vfp_addh(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VNMLA_hp(DisasContext *s, arg_VNMLA_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMLA_hp, a->vd, a->vn, a->vm, true);
+}
+
static void gen_VNMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
{
/* VNMLA: -fd + -(fn * fm) */
@@ -1643,6 +1900,11 @@ static bool trans_VNMLA_dp(DisasContext *s, arg_VNMLA_dp *a)
return do_vfp_3op_dp(s, gen_VNMLA_dp, a->vd, a->vn, a->vm, true);
}
+static bool trans_VMUL_hp(DisasContext *s, arg_VMUL_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_mulh, a->vd, a->vn, a->vm, false);
+}
+
static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a)
{
return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false);
@@ -1653,6 +1915,18 @@ static bool trans_VMUL_dp(DisasContext *s, arg_VMUL_dp *a)
return do_vfp_3op_dp(s, gen_helper_vfp_muld, a->vd, a->vn, a->vm, false);
}
+static void gen_VNMUL_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* VNMUL: -(fn * fm) */
+ gen_helper_vfp_mulh(vd, vn, vm, fpst);
+ gen_helper_vfp_negh(vd, vd);
+}
+
+static bool trans_VNMUL_hp(DisasContext *s, arg_VNMUL_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_VNMUL_hp, a->vd, a->vn, a->vm, false);
+}
+
static void gen_VNMUL_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
{
/* VNMUL: -(fn * fm) */
@@ -1677,6 +1951,11 @@ static bool trans_VNMUL_dp(DisasContext *s, arg_VNMUL_dp *a)
return do_vfp_3op_dp(s, gen_VNMUL_dp, a->vd, a->vn, a->vm, false);
}
+static bool trans_VADD_hp(DisasContext *s, arg_VADD_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_addh, a->vd, a->vn, a->vm, false);
+}
+
static bool trans_VADD_sp(DisasContext *s, arg_VADD_sp *a)
{
return do_vfp_3op_sp(s, gen_helper_vfp_adds, a->vd, a->vn, a->vm, false);
@@ -1687,6 +1966,11 @@ static bool trans_VADD_dp(DisasContext *s, arg_VADD_dp *a)
return do_vfp_3op_dp(s, gen_helper_vfp_addd, a->vd, a->vn, a->vm, false);
}
+static bool trans_VSUB_hp(DisasContext *s, arg_VSUB_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_subh, a->vd, a->vn, a->vm, false);
+}
+
static bool trans_VSUB_sp(DisasContext *s, arg_VSUB_sp *a)
{
return do_vfp_3op_sp(s, gen_helper_vfp_subs, a->vd, a->vn, a->vm, false);
@@ -1697,6 +1981,11 @@ static bool trans_VSUB_dp(DisasContext *s, arg_VSUB_dp *a)
return do_vfp_3op_dp(s, gen_helper_vfp_subd, a->vd, a->vn, a->vm, false);
}
+static bool trans_VDIV_hp(DisasContext *s, arg_VDIV_sp *a)
+{
+ return do_vfp_3op_hp(s, gen_helper_vfp_divh, a->vd, a->vn, a->vm, false);
+}
+
static bool trans_VDIV_sp(DisasContext *s, arg_VDIV_sp *a)
{
return do_vfp_3op_sp(s, gen_helper_vfp_divs, a->vd, a->vn, a->vm, false);
@@ -1707,6 +1996,24 @@ static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_dp *a)
return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false);
}
+static bool trans_VMINNM_hp(DisasContext *s, arg_VMINNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_hp(s, gen_helper_vfp_minnumh,
+ a->vd, a->vn, a->vm, false);
+}
+
+static bool trans_VMAXNM_hp(DisasContext *s, arg_VMAXNM_sp *a)
+{
+ if (!dc_isar_feature(aa32_vminmaxnm, s)) {
+ return false;
+ }
+ return do_vfp_3op_hp(s, gen_helper_vfp_maxnumh,
+ a->vd, a->vn, a->vm, false);
+}
+
static bool trans_VMINNM_sp(DisasContext *s, arg_VMINNM_sp *a)
{
if (!dc_isar_feature(aa32_vminmaxnm, s)) {
@@ -1743,6 +2050,69 @@ static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a)
a->vd, a->vn, a->vm, false);
}
+static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
+{
+ /*
+ * VFNMA : fd = muladd(-fd, fn, fm)
+ * VFNMS : fd = muladd(-fd, -fn, fm)
+ * VFMA : fd = muladd( fd, fn, fm)
+ * VFMS : fd = muladd( fd, -fn, fm)
+ *
+ * These are fused multiply-add, and must be done as one floating
+ * point operation with no rounding between the multiplication and
+ * addition steps. NB that doing the negations here as separate
+ * steps is correct : an input NaN should come out with its sign
+ * bit flipped if it is a negated-input.
+ */
+ TCGv_ptr fpst;
+ TCGv_i32 vn, vm, vd;
+
+ /*
+ * Present in VFPv4 only, and only with the FP16 extension.
+ * Note that we can't rely on the SIMDFMAC check alone, because
+ * in a Neon-no-VFP core that ID register field will be non-zero.
+ */
+ if (!dc_isar_feature(aa32_fp16_arith, s) ||
+ !dc_isar_feature(aa32_simdfmac, s) ||
+ !dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vn = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i32();
+
+ neon_load_reg32(vn, a->vn);
+ neon_load_reg32(vm, a->vm);
+ if (neg_n) {
+ /* VFNMS, VFMS */
+ gen_helper_vfp_negh(vn, vn);
+ }
+ neon_load_reg32(vd, a->vd);
+ if (neg_d) {
+ /* VFNMA, VFNMS */
+ gen_helper_vfp_negh(vd, vd);
+ }
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst);
+ neon_store_reg32(vd, a->vd);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(vn);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i32(vd);
+
+ return true;
+}
+
static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
{
/*
@@ -1808,26 +2178,6 @@ static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
return true;
}
-static bool trans_VFMA_sp(DisasContext *s, arg_VFMA_sp *a)
-{
- return do_vfm_sp(s, a, false, false);
-}
-
-static bool trans_VFMS_sp(DisasContext *s, arg_VFMS_sp *a)
-{
- return do_vfm_sp(s, a, true, false);
-}
-
-static bool trans_VFNMA_sp(DisasContext *s, arg_VFNMA_sp *a)
-{
- return do_vfm_sp(s, a, false, true);
-}
-
-static bool trans_VFNMS_sp(DisasContext *s, arg_VFNMS_sp *a)
-{
- return do_vfm_sp(s, a, true, true);
-}
-
static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
{
/*
@@ -1899,24 +2249,43 @@ static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
return true;
}
-static bool trans_VFMA_dp(DisasContext *s, arg_VFMA_dp *a)
-{
- return do_vfm_dp(s, a, false, false);
-}
+#define MAKE_ONE_VFM_TRANS_FN(INSN, PREC, NEGN, NEGD) \
+ static bool trans_##INSN##_##PREC(DisasContext *s, \
+ arg_##INSN##_##PREC *a) \
+ { \
+ return do_vfm_##PREC(s, a, NEGN, NEGD); \
+ }
-static bool trans_VFMS_dp(DisasContext *s, arg_VFMS_dp *a)
-{
- return do_vfm_dp(s, a, true, false);
-}
+#define MAKE_VFM_TRANS_FNS(PREC) \
+ MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \
+ MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \
+ MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \
+ MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true)
-static bool trans_VFNMA_dp(DisasContext *s, arg_VFNMA_dp *a)
-{
- return do_vfm_dp(s, a, false, true);
-}
+MAKE_VFM_TRANS_FNS(hp)
+MAKE_VFM_TRANS_FNS(sp)
+MAKE_VFM_TRANS_FNS(dp)
-static bool trans_VFNMS_dp(DisasContext *s, arg_VFNMS_dp *a)
+static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a)
{
- return do_vfm_dp(s, a, true, true);
+ TCGv_i32 fd;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fd = tcg_const_i32(vfp_expand_imm(MO_16, a->imm));
+ neon_store_reg32(fd, a->vd);
+ tcg_temp_free_i32(fd);
+ return true;
}
static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a)
@@ -2024,34 +2393,27 @@ static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a)
return true;
}
-static bool trans_VMOV_reg_sp(DisasContext *s, arg_VMOV_reg_sp *a)
-{
- return do_vfp_2op_sp(s, tcg_gen_mov_i32, a->vd, a->vm);
-}
+#define DO_VFP_2OP(INSN, PREC, FN) \
+ static bool trans_##INSN##_##PREC(DisasContext *s, \
+ arg_##INSN##_##PREC *a) \
+ { \
+ return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \
+ }
-static bool trans_VMOV_reg_dp(DisasContext *s, arg_VMOV_reg_dp *a)
-{
- return do_vfp_2op_dp(s, tcg_gen_mov_i64, a->vd, a->vm);
-}
+DO_VFP_2OP(VMOV_reg, sp, tcg_gen_mov_i32)
+DO_VFP_2OP(VMOV_reg, dp, tcg_gen_mov_i64)
-static bool trans_VABS_sp(DisasContext *s, arg_VABS_sp *a)
-{
- return do_vfp_2op_sp(s, gen_helper_vfp_abss, a->vd, a->vm);
-}
+DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh)
+DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss)
+DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd)
-static bool trans_VABS_dp(DisasContext *s, arg_VABS_dp *a)
-{
- return do_vfp_2op_dp(s, gen_helper_vfp_absd, a->vd, a->vm);
-}
+DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh)
+DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs)
+DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd)
-static bool trans_VNEG_sp(DisasContext *s, arg_VNEG_sp *a)
+static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm)
{
- return do_vfp_2op_sp(s, gen_helper_vfp_negs, a->vd, a->vm);
-}
-
-static bool trans_VNEG_dp(DisasContext *s, arg_VNEG_dp *a)
-{
- return do_vfp_2op_dp(s, gen_helper_vfp_negd, a->vd, a->vm);
+ gen_helper_vfp_sqrth(vd, vm, cpu_env);
}
static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm)
@@ -2059,19 +2421,52 @@ static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm)
gen_helper_vfp_sqrts(vd, vm, cpu_env);
}
-static bool trans_VSQRT_sp(DisasContext *s, arg_VSQRT_sp *a)
-{
- return do_vfp_2op_sp(s, gen_VSQRT_sp, a->vd, a->vm);
-}
-
static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm)
{
gen_helper_vfp_sqrtd(vd, vm, cpu_env);
}
-static bool trans_VSQRT_dp(DisasContext *s, arg_VSQRT_dp *a)
+DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp)
+DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp)
+DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp)
+
+static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a)
{
- return do_vfp_2op_dp(s, gen_VSQRT_dp, a->vd, a->vm);
+ TCGv_i32 vd, vm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ /* Vm/M bits must be zero for the Z variant */
+ if (a->z && a->vm != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vd = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+
+ neon_load_reg32(vd, a->vd);
+ if (a->z) {
+ tcg_gen_movi_i32(vm, 0);
+ } else {
+ neon_load_reg32(vm, a->vm);
+ }
+
+ if (a->e) {
+ gen_helper_vfp_cmpeh(vd, vm, cpu_env);
+ } else {
+ gen_helper_vfp_cmph(vd, vm, cpu_env);
+ }
+
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i32(vm);
+
+ return true;
}
static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a)
@@ -2289,6 +2684,29 @@ static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a)
return true;
}
+static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth(tmp, tmp, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a)
{
TCGv_ptr fpst;
@@ -2344,6 +2762,34 @@ static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a)
return true;
}
+static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+ TCGv_i32 tcg_rmode;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ tcg_rmode = tcg_const_i32(float_round_to_zero);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_rinth(tmp, tmp, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a)
{
TCGv_ptr fpst;
@@ -2409,6 +2855,29 @@ static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a)
return true;
}
+static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth_exact(tmp, tmp, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a)
{
TCGv_ptr fpst;
@@ -2520,6 +2989,35 @@ static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a)
return true;
}
+static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vm = tcg_temp_new_i32();
+ neon_load_reg32(vm, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ if (a->s) {
+ /* i32 -> f16 */
+ gen_helper_vfp_sitoh(vm, vm, fpst);
+ } else {
+ /* u32 -> f16 */
+ gen_helper_vfp_uitoh(vm, vm, fpst);
+ }
+ neon_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a)
{
TCGv_i32 vm;
@@ -2618,6 +3116,65 @@ static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a)
return true;
}
+static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a)
+{
+ TCGv_i32 vd, shift;
+ TCGv_ptr fpst;
+ int frac_bits;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
+
+ vd = tcg_temp_new_i32();
+ neon_load_reg32(vd, a->vd);
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ shift = tcg_const_i32(frac_bits);
+
+ /* Switch on op:U:sx bits */
+ switch (a->opc) {
+ case 0:
+ gen_helper_vfp_shtoh(vd, vd, shift, fpst);
+ break;
+ case 1:
+ gen_helper_vfp_sltoh(vd, vd, shift, fpst);
+ break;
+ case 2:
+ gen_helper_vfp_uhtoh(vd, vd, shift, fpst);
+ break;
+ case 3:
+ gen_helper_vfp_ultoh(vd, vd, shift, fpst);
+ break;
+ case 4:
+ gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 5:
+ gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 6:
+ gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 7:
+ gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ neon_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i32(shift);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a)
{
TCGv_i32 vd, shift;
@@ -2742,6 +3299,42 @@ static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a)
return true;
}
+static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a)
+{
+ TCGv_i32 vm;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ vm = tcg_temp_new_i32();
+ neon_load_reg32(vm, a->vm);
+
+ if (a->s) {
+ if (a->rz) {
+ gen_helper_vfp_tosizh(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_tosih(vm, vm, fpst);
+ }
+ } else {
+ if (a->rz) {
+ gen_helper_vfp_touizh(vm, vm, fpst);
+ } else {
+ gen_helper_vfp_touih(vm, vm, fpst);
+ }
+ }
+ neon_store_reg32(vm, a->vd);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a)
{
TCGv_i32 vm;
@@ -2895,3 +3488,56 @@ static bool trans_NOCP(DisasContext *s, arg_NOCP *a)
return false;
}
+
+static bool trans_VINS(DisasContext *s, arg_VINS *a)
+{
+ TCGv_i32 rd, rm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Insert low half of Vm into high half of Vd */
+ rm = tcg_temp_new_i32();
+ rd = tcg_temp_new_i32();
+ neon_load_reg32(rm, a->vm);
+ neon_load_reg32(rd, a->vd);
+ tcg_gen_deposit_i32(rd, rd, rm, 16, 16);
+ neon_store_reg32(rd, a->vd);
+ tcg_temp_free_i32(rm);
+ tcg_temp_free_i32(rd);
+ return true;
+}
+
+static bool trans_VMOVX(DisasContext *s, arg_VINS *a)
+{
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Set Vd to high half of Vm */
+ rm = tcg_temp_new_i32();
+ neon_load_reg32(rm, a->vm);
+ tcg_gen_shri_i32(rm, rm, 16);
+ neon_store_reg32(rm, a->vd);
+ tcg_temp_free_i32(rm);
+ return true;
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index a6c53d2..a973454 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -656,6 +656,81 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+/*
+ * Floating point comparisons producing an integer result (all 1s or all 0s).
+ * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
+ * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
+ */
+static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_eq_quiet(op1, op2, stat);
+}
+
+static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_eq_quiet(op1, op2, stat);
+}
+
+static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_le(op2, op1, stat);
+}
+
+static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_le(op2, op1, stat);
+}
+
+static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_lt(op2, op1, stat);
+}
+
+static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_lt(op2, op1, stat);
+}
+
+static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_le(float16_abs(op2), float16_abs(op1), stat);
+}
+
+static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_le(float32_abs(op2), float32_abs(op1), stat);
+}
+
+static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
+}
+
+static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
+}
+
+static int16_t vfp_tosszh(float16 x, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ if (float16_is_any_nan(x)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_int16_round_to_zero(x, fpst);
+}
+
+static uint16_t vfp_touszh(float16 x, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ if (float16_is_any_nan(x)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_uint16_round_to_zero(x, fpst);
+}
+
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
@@ -675,7 +750,44 @@ DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
+DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
+DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
+
+DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
+DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
+DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
+DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
+DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
+DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
+DO_2OP(gvec_tosszh, vfp_tosszh, float16)
+DO_2OP(gvec_touszh, vfp_touszh, float16)
+
+#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
+ static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
+ { \
+ return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
+ }
+
+#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
+ static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
+ { \
+ return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
+ }
+
+#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
+ WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
+ WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
+ DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
+ DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
+
+DO_2OP_CMP0(cgt, cgt, FWD)
+DO_2OP_CMP0(cge, cge, FWD)
+DO_2OP_CMP0(ceq, ceq, FWD)
+DO_2OP_CMP0(clt, cgt, REV)
+DO_2OP_CMP0(cle, cge, REV)
+
#undef DO_2OP
+#undef DO_2OP_CMP0
/* Floating-point trigonometric starting value.
* See the ARM ARM pseudocode function FPTrigSMul.
@@ -707,11 +819,71 @@ static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
return result;
}
+static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
+{
+ return float16_abs(float16_sub(op1, op2, stat));
+}
+
static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
return float32_abs(float32_sub(op1, op2, stat));
}
+/*
+ * Reciprocal step. These are the AArch32 version which uses a
+ * non-fused multiply-and-subtract.
+ */
+static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
+{
+ op1 = float16_squash_input_denormal(op1, stat);
+ op2 = float16_squash_input_denormal(op2, stat);
+
+ if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+ (float16_is_infinity(op2) && float16_is_zero(op1))) {
+ return float16_two;
+ }
+ return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
+{
+ op1 = float32_squash_input_denormal(op1, stat);
+ op2 = float32_squash_input_denormal(op2, stat);
+
+ if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+ (float32_is_infinity(op2) && float32_is_zero(op1))) {
+ return float32_two;
+ }
+ return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
+}
+
+/* Reciprocal square-root step. AArch32 non-fused semantics. */
+static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
+{
+ op1 = float16_squash_input_denormal(op1, stat);
+ op2 = float16_squash_input_denormal(op2, stat);
+
+ if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+ (float16_is_infinity(op2) && float16_is_zero(op1))) {
+ return float16_one_point_five;
+ }
+ op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
+ return float16_div(op1, float16_two, stat);
+}
+
+static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
+{
+ op1 = float32_squash_input_denormal(op1, stat);
+ op2 = float32_squash_input_denormal(op2, stat);
+
+ if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+ (float32_is_infinity(op2) && float32_is_zero(op1))) {
+ return float32_one_point_five;
+ }
+ op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
+ return float32_div(op1, float32_two, stat);
+}
+
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
@@ -739,8 +911,42 @@ DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
+DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
+DO_3OP(gvec_fceq_h, float16_ceq, float16)
+DO_3OP(gvec_fceq_s, float32_ceq, float32)
+
+DO_3OP(gvec_fcge_h, float16_cge, float16)
+DO_3OP(gvec_fcge_s, float32_cge, float32)
+
+DO_3OP(gvec_fcgt_h, float16_cgt, float16)
+DO_3OP(gvec_fcgt_s, float32_cgt, float32)
+
+DO_3OP(gvec_facge_h, float16_acge, float16)
+DO_3OP(gvec_facge_s, float32_acge, float32)
+
+DO_3OP(gvec_facgt_h, float16_acgt, float16)
+DO_3OP(gvec_facgt_s, float32_acgt, float32)
+
+DO_3OP(gvec_fmax_h, float16_max, float16)
+DO_3OP(gvec_fmax_s, float32_max, float32)
+
+DO_3OP(gvec_fmin_h, float16_min, float16)
+DO_3OP(gvec_fmin_s, float32_min, float32)
+
+DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
+DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
+
+DO_3OP(gvec_fminnum_h, float16_minnum, float16)
+DO_3OP(gvec_fminnum_s, float32_minnum, float32)
+
+DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
+DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
+
+DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
+DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
+
#ifdef TARGET_AARCH64
DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
@@ -754,6 +960,79 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
#endif
#undef DO_3OP
+/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
+static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_add(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_add(dest, float32_mul(op1, op2, stat), stat);
+}
+
+static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_sub(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_sub(dest, float32_mul(op1, op2, stat), stat);
+}
+
+/* Fused versions; these have the semantics Neon VFMA/VFMS want */
+static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(op1, op2, dest, 0, stat);
+}
+
+static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(op1, op2, dest, 0, stat);
+}
+
+static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
+}
+
+static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
+}
+
+#define DO_MULADD(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(d[i], n[i], m[i], stat); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
+DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
+
+DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
+DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+
+DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
+DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
+
+DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
+DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
+
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
*/
@@ -761,7 +1040,8 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
- intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
intptr_t idx = simd_data(desc); \
TYPE *d = vd, *n = vn, *m = vm; \
for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
@@ -782,7 +1062,8 @@ DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )
#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
- intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
intptr_t idx = simd_data(desc); \
TYPE *d = vd, *n = vn, *m = vm, *a = va; \
for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
@@ -804,32 +1085,51 @@ DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, )
#undef DO_MLA_IDX
-#define DO_FMUL_IDX(NAME, TYPE, H) \
+#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
- intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
intptr_t idx = simd_data(desc); \
TYPE *d = vd, *n = vn, *m = vm; \
for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
TYPE mm = m[H(i + idx)]; \
for (j = 0; j < segment; j++) { \
- d[i + j] = TYPE##_mul(n[i + j], mm, stat); \
+ d[i + j] = TYPE##_##ADD(d[i + j], \
+ TYPE##_mul(n[i + j], mm, stat), stat); \
} \
} \
clear_tail(d, oprsz, simd_maxsz(desc)); \
}
-DO_FMUL_IDX(gvec_fmul_idx_h, float16, H2)
-DO_FMUL_IDX(gvec_fmul_idx_s, float32, H4)
-DO_FMUL_IDX(gvec_fmul_idx_d, float64, )
+#define float16_nop(N, M, S) (M)
+#define float32_nop(N, M, S) (M)
+#define float64_nop(N, M, S) (M)
+
+DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
+DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
+DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, )
+/*
+ * Non-fused multiply-accumulate operations, for Neon. NB that unlike
+ * the fused ops below they assume accumulate both from and into Vd.
+ */
+DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
+DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
+DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
+DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
+
+#undef float16_nop
+#undef float32_nop
+#undef float64_nop
#undef DO_FMUL_IDX
#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
void *stat, uint32_t desc) \
{ \
- intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
TYPE *d = vd, *n = vn, *m = vm, *a = va; \
@@ -1524,3 +1824,116 @@ DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)
#undef DO_ABA
+
+#define DO_NEON_PAIRWISE(NAME, OP) \
+ void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
+ void *stat, uint32_t oprsz) \
+ { \
+ float_status *fpst = stat; \
+ float32 *d = vd; \
+ float32 *n = vn; \
+ float32 *m = vm; \
+ float32 r0, r1; \
+ \
+ /* Read all inputs before writing outputs in case vm == vd */ \
+ r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
+ r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
+ \
+ d[H4(0)] = r0; \
+ d[H4(1)] = r1; \
+ } \
+ \
+ void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
+ void *stat, uint32_t oprsz) \
+ { \
+ float_status *fpst = stat; \
+ float16 *d = vd; \
+ float16 *n = vn; \
+ float16 *m = vm; \
+ float16 r0, r1, r2, r3; \
+ \
+ /* Read all inputs before writing outputs in case vm == vd */ \
+ r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
+ r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
+ r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
+ r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
+ \
+ d[H4(0)] = r0; \
+ d[H4(1)] = r1; \
+ d[H4(2)] = r2; \
+ d[H4(3)] = r3; \
+ }
+
+DO_NEON_PAIRWISE(neon_padd, add)
+DO_NEON_PAIRWISE(neon_pmax, max)
+DO_NEON_PAIRWISE(neon_pmin, min)
+
+#undef DO_NEON_PAIRWISE
+
+#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ float_status *fpst = stat; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], shift, fpst); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
+
+#undef DO_VCVT_FIXED
+
+#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ float_status *fpst = stat; \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ uint32_t rmode = simd_data(desc); \
+ uint32_t prev_rmode = get_float_rounding_mode(fpst); \
+ TYPE *d = vd, *n = vn; \
+ set_float_rounding_mode(rmode, fpst); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], 0, fpst); \
+ } \
+ set_float_rounding_mode(prev_rmode, fpst); \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
+
+#undef DO_VCVT_RMODE
+
+#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ float_status *fpst = stat; \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ uint32_t rmode = simd_data(desc); \
+ uint32_t prev_rmode = get_float_rounding_mode(fpst); \
+ TYPE *d = vd, *n = vn; \
+ set_float_rounding_mode(rmode, fpst); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], fpst); \
+ } \
+ set_float_rounding_mode(prev_rmode, fpst); \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
+DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
+
+#undef DO_VRINT_RMODE
diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode
index 34ca164..8891ab3 100644
--- a/target/arm/vfp-uncond.decode
+++ b/target/arm/vfp-uncond.decode
@@ -44,10 +44,15 @@
@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp
@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1
VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \
- vm=%vm_sp vn=%vn_sp vd=%vd_sp dp=0
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2
VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \
- vm=%vm_dp vn=%vn_dp vd=%vd_dp dp=1
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3
+
+VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
+VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s
VMAXNM_sp 1111 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s
VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s
@@ -55,13 +60,23 @@ VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s
VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d
VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d
+VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \
- vm=%vm_sp vd=%vd_sp dp=0
+ vm=%vm_sp vd=%vd_sp sz=2
VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \
- vm=%vm_dp vd=%vd_dp dp=1
+ vm=%vm_dp vd=%vd_dp sz=3
# VCVT float to int with specified rounding mode; Vd is always single-precision
+VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \
- vm=%vm_sp vd=%vd_sp dp=0
+ vm=%vm_sp vd=%vd_sp sz=2
VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \
- vm=%vm_dp vd=%vd_sp dp=1
+ vm=%vm_dp vd=%vd_sp sz=3
+
+VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... \
+ vd=%vd_sp vm=%vm_sp
+
+VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... \
+ vd=%vd_sp vm=%vm_sp
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index 2c793e3..51f143b 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -74,13 +74,13 @@ VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \
vn=%vn_dp
VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
+VMOV_half ---- 1110 000 l:1 .... rt:4 1001 . 001 0000 vn=%vn_sp
VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp
VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp
VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp
-# Note that the half-precision variants of VLDR and VSTR are
-# not part of this decodetree at all because they have bits [9:8] == 0b01
+VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp
VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp
VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp
@@ -103,33 +103,47 @@ VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \
vd=%vd_dp p=1 u=0 w=1
# 3-register VFP data-processing; bits [23,21:20,6] identify the operation.
+VMLA_hp ---- 1110 0.00 .... .... 1001 .0.0 .... @vfp_dnm_s
VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... @vfp_dnm_s
VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... @vfp_dnm_d
+VMLS_hp ---- 1110 0.00 .... .... 1001 .1.0 .... @vfp_dnm_s
VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... @vfp_dnm_s
VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... @vfp_dnm_d
+VNMLS_hp ---- 1110 0.01 .... .... 1001 .0.0 .... @vfp_dnm_s
VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... @vfp_dnm_s
VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... @vfp_dnm_d
+VNMLA_hp ---- 1110 0.01 .... .... 1001 .1.0 .... @vfp_dnm_s
VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... @vfp_dnm_s
VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... @vfp_dnm_d
+VMUL_hp ---- 1110 0.10 .... .... 1001 .0.0 .... @vfp_dnm_s
VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... @vfp_dnm_s
VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... @vfp_dnm_d
+VNMUL_hp ---- 1110 0.10 .... .... 1001 .1.0 .... @vfp_dnm_s
VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... @vfp_dnm_s
VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... @vfp_dnm_d
+VADD_hp ---- 1110 0.11 .... .... 1001 .0.0 .... @vfp_dnm_s
VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... @vfp_dnm_s
VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... @vfp_dnm_d
+VSUB_hp ---- 1110 0.11 .... .... 1001 .1.0 .... @vfp_dnm_s
VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... @vfp_dnm_s
VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... @vfp_dnm_d
+VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s
VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d
+VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s
+VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s
+
VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s
VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s
VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s
@@ -140,6 +154,8 @@ VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d
VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d
VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d
+VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \
+ vd=%vd_sp imm=%vmov_imm
VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \
vd=%vd_sp imm=%vmov_imm
VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \
@@ -148,15 +164,20 @@ VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \
VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... @vfp_dm_ss
VMOV_reg_dp ---- 1110 1.11 0000 .... 1011 01.0 .... @vfp_dm_dd
+VABS_hp ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss
VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss
VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd
+VNEG_hp ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss
VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss
VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd
+VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss
VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss
VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd
+VCMP_hp ---- 1110 1.11 010 z:1 .... 1001 e:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \
@@ -175,12 +196,15 @@ VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
vd=%vd_sp vm=%vm_dp
+VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss
VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss
VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd
+VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss
VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss
VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd
+VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss
VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss
VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd
@@ -190,6 +214,8 @@ VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds
VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd
# VCVT from integer to floating point: Vm always single; Vd depends on size
+VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \
@@ -203,12 +229,16 @@ VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd
# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field
# for the convenience of the trans_VCVT_fix functions.
%vcvt_fix_op 18:1 16:1 7:1
+VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \
vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op
# VCVT float to integer (VCVT and VCVTR): Vd always single; Vd depends on size
+VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... \
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index 64266ec..5666393 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -236,6 +236,11 @@ void vfp_set_fpscr(CPUARMState *env, uint32_t val)
#define VFP_HELPER(name, p) HELPER(glue(glue(vfp_,name),p))
#define VFP_BINOP(name) \
+dh_ctype_f16 VFP_HELPER(name, h)(dh_ctype_f16 a, dh_ctype_f16 b, void *fpstp) \
+{ \
+ float_status *fpst = fpstp; \
+ return float16_ ## name(a, b, fpst); \
+} \
float32 VFP_HELPER(name, s)(float32 a, float32 b, void *fpstp) \
{ \
float_status *fpst = fpstp; \
@@ -256,6 +261,11 @@ VFP_BINOP(minnum)
VFP_BINOP(maxnum)
#undef VFP_BINOP
+dh_ctype_f16 VFP_HELPER(neg, h)(dh_ctype_f16 a)
+{
+ return float16_chs(a);
+}
+
float32 VFP_HELPER(neg, s)(float32 a)
{
return float32_chs(a);
@@ -266,6 +276,11 @@ float64 VFP_HELPER(neg, d)(float64 a)
return float64_chs(a);
}
+dh_ctype_f16 VFP_HELPER(abs, h)(dh_ctype_f16 a)
+{
+ return float16_abs(a);
+}
+
float32 VFP_HELPER(abs, s)(float32 a)
{
return float32_abs(a);
@@ -276,6 +291,11 @@ float64 VFP_HELPER(abs, d)(float64 a)
return float64_abs(a);
}
+dh_ctype_f16 VFP_HELPER(sqrt, h)(dh_ctype_f16 a, CPUARMState *env)
+{
+ return float16_sqrt(a, &env->vfp.fp_status_f16);
+}
+
float32 VFP_HELPER(sqrt, s)(float32 a, CPUARMState *env)
{
return float32_sqrt(a, &env->vfp.fp_status);
@@ -310,19 +330,20 @@ static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp)
}
/* XXX: check quiet/signaling case */
-#define DO_VFP_cmp(p, type) \
-void VFP_HELPER(cmp, p)(type a, type b, CPUARMState *env) \
+#define DO_VFP_cmp(P, FLOATTYPE, ARGTYPE, FPST) \
+void VFP_HELPER(cmp, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
{ \
softfloat_to_vfp_compare(env, \
- type ## _compare_quiet(a, b, &env->vfp.fp_status)); \
+ FLOATTYPE ## _compare_quiet(a, b, &env->vfp.FPST)); \
} \
-void VFP_HELPER(cmpe, p)(type a, type b, CPUARMState *env) \
+void VFP_HELPER(cmpe, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
{ \
softfloat_to_vfp_compare(env, \
- type ## _compare(a, b, &env->vfp.fp_status)); \
+ FLOATTYPE ## _compare(a, b, &env->vfp.FPST)); \
}
-DO_VFP_cmp(s, float32)
-DO_VFP_cmp(d, float64)
+DO_VFP_cmp(h, float16, dh_ctype_f16, fp_status_f16)
+DO_VFP_cmp(s, float32, float32, fp_status)
+DO_VFP_cmp(d, float64, float64, fp_status)
#undef DO_VFP_cmp
/* Integer to float and float to integer conversions */
@@ -373,13 +394,13 @@ float32 VFP_HELPER(fcvts, d)(float64 x, CPUARMState *env)
}
/* VFP3 fixed point conversion. */
-#define VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-float##fsz HELPER(vfp_##name##to##p)(uint##isz##_t x, uint32_t shift, \
+#define VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+ftype HELPER(vfp_##name##to##p)(uint##isz##_t x, uint32_t shift, \
void *fpstp) \
{ return itype##_to_##float##fsz##_scalbn(x, -shift, fpstp); }
-#define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, ROUND, suff) \
-uint##isz##_t HELPER(vfp_to##name##p##suff)(float##fsz x, uint32_t shift, \
+#define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, ROUND, suff) \
+uint##isz##_t HELPER(vfp_to##name##p##suff)(ftype x, uint32_t shift, \
void *fpst) \
{ \
if (unlikely(float##fsz##_is_any_nan(x))) { \
@@ -389,116 +410,42 @@ uint##isz##_t HELPER(vfp_to##name##p##suff)(float##fsz x, uint32_t shift, \
return float##fsz##_to_##itype##_scalbn(x, ROUND, shift, fpst); \
}
-#define VFP_CONV_FIX(name, p, fsz, isz, itype) \
-VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+#define VFP_CONV_FIX(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
float_round_to_zero, _round_to_zero) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
get_float_rounding_mode(fpst), )
-#define VFP_CONV_FIX_A64(name, p, fsz, isz, itype) \
-VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+#define VFP_CONV_FIX_A64(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
get_float_rounding_mode(fpst), )
-VFP_CONV_FIX(sh, d, 64, 64, int16)
-VFP_CONV_FIX(sl, d, 64, 64, int32)
-VFP_CONV_FIX_A64(sq, d, 64, 64, int64)
-VFP_CONV_FIX(uh, d, 64, 64, uint16)
-VFP_CONV_FIX(ul, d, 64, 64, uint32)
-VFP_CONV_FIX_A64(uq, d, 64, 64, uint64)
-VFP_CONV_FIX(sh, s, 32, 32, int16)
-VFP_CONV_FIX(sl, s, 32, 32, int32)
-VFP_CONV_FIX_A64(sq, s, 32, 64, int64)
-VFP_CONV_FIX(uh, s, 32, 32, uint16)
-VFP_CONV_FIX(ul, s, 32, 32, uint32)
-VFP_CONV_FIX_A64(uq, s, 32, 64, uint64)
+VFP_CONV_FIX(sh, d, 64, float64, 64, int16)
+VFP_CONV_FIX(sl, d, 64, float64, 64, int32)
+VFP_CONV_FIX_A64(sq, d, 64, float64, 64, int64)
+VFP_CONV_FIX(uh, d, 64, float64, 64, uint16)
+VFP_CONV_FIX(ul, d, 64, float64, 64, uint32)
+VFP_CONV_FIX_A64(uq, d, 64, float64, 64, uint64)
+VFP_CONV_FIX(sh, s, 32, float32, 32, int16)
+VFP_CONV_FIX(sl, s, 32, float32, 32, int32)
+VFP_CONV_FIX_A64(sq, s, 32, float32, 64, int64)
+VFP_CONV_FIX(uh, s, 32, float32, 32, uint16)
+VFP_CONV_FIX(ul, s, 32, float32, 32, uint32)
+VFP_CONV_FIX_A64(uq, s, 32, float32, 64, uint64)
+VFP_CONV_FIX(sh, h, 16, dh_ctype_f16, 32, int16)
+VFP_CONV_FIX(sl, h, 16, dh_ctype_f16, 32, int32)
+VFP_CONV_FIX_A64(sq, h, 16, dh_ctype_f16, 64, int64)
+VFP_CONV_FIX(uh, h, 16, dh_ctype_f16, 32, uint16)
+VFP_CONV_FIX(ul, h, 16, dh_ctype_f16, 32, uint32)
+VFP_CONV_FIX_A64(uq, h, 16, dh_ctype_f16, 64, uint64)
#undef VFP_CONV_FIX
#undef VFP_CONV_FIX_FLOAT
#undef VFP_CONV_FLOAT_FIX_ROUND
#undef VFP_CONV_FIX_A64
-uint32_t HELPER(vfp_sltoh)(uint32_t x, uint32_t shift, void *fpst)
-{
- return int32_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_ultoh)(uint32_t x, uint32_t shift, void *fpst)
-{
- return uint32_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_sqtoh)(uint64_t x, uint32_t shift, void *fpst)
-{
- return int64_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_uqtoh)(uint64_t x, uint32_t shift, void *fpst)
-{
- return uint64_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_toshh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int16_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_touhh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint16_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_toslh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int32_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_toulh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint32_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint64_t HELPER(vfp_tosqh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int64_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint64_t HELPER(vfp_touqh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint64_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
/* Set the current fp rounding mode and return the old one.
* The argument is a softfloat float_round_ value.
*/
@@ -512,23 +459,6 @@ uint32_t HELPER(set_rmode)(uint32_t rmode, void *fpstp)
return prev_rmode;
}
-/* Set the current fp rounding mode in the standard fp status and return
- * the old one. This is for NEON instructions that need to change the
- * rounding mode but wish to use the standard FPSCR values for everything
- * else. Always set the rounding mode back to the correct value after
- * modifying it.
- * The argument is a softfloat float_round_ value.
- */
-uint32_t HELPER(set_neon_rmode)(uint32_t rmode, CPUARMState *env)
-{
- float_status *fp_status = &env->vfp.standard_fp_status;
-
- uint32_t prev_rmode = get_float_rounding_mode(fp_status);
- set_float_rounding_mode(rmode, fp_status);
-
- return prev_rmode;
-}
-
/* Half precision conversions. */
float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, void *fpstp, uint32_t ahp_mode)
{
@@ -582,38 +512,6 @@ uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, void *fpstp, uint32_t ahp_mode)
return r;
}
-#define float32_two make_float32(0x40000000)
-#define float32_three make_float32(0x40400000)
-#define float32_one_point_five make_float32(0x3fc00000)
-
-float32 HELPER(recps_f32)(CPUARMState *env, float32 a, float32 b)
-{
- float_status *s = &env->vfp.standard_fp_status;
- if ((float32_is_infinity(a) && float32_is_zero_or_denormal(b)) ||
- (float32_is_infinity(b) && float32_is_zero_or_denormal(a))) {
- if (!(float32_is_zero(a) || float32_is_zero(b))) {
- float_raise(float_flag_input_denormal, s);
- }
- return float32_two;
- }
- return float32_sub(float32_two, float32_mul(a, b, s), s);
-}
-
-float32 HELPER(rsqrts_f32)(CPUARMState *env, float32 a, float32 b)
-{
- float_status *s = &env->vfp.standard_fp_status;
- float32 product;
- if ((float32_is_infinity(a) && float32_is_zero_or_denormal(b)) ||
- (float32_is_infinity(b) && float32_is_zero_or_denormal(a))) {
- if (!(float32_is_zero(a) || float32_is_zero(b))) {
- float_raise(float_flag_input_denormal, s);
- }
- return float32_one_point_five;
- }
- product = float32_mul(a, b, s);
- return float32_div(float32_sub(float32_three, product, s), float32_two, s);
-}
-
/* NEON helpers. */
/* Constants 256 and 512 are used in some helpers; we avoid relying on
@@ -1056,6 +954,13 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a)
}
/* VFPv4 fused multiply-accumulate */
+dh_ctype_f16 VFP_HELPER(muladd, h)(dh_ctype_f16 a, dh_ctype_f16 b,
+ dh_ctype_f16 c, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return float16_muladd(a, b, c, 0, fpst);
+}
+
float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c, void *fpstp)
{
float_status *fpst = fpstp;
@@ -1069,6 +974,11 @@ float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c, void *fpstp)
}
/* ARMv8 round to integral */
+dh_ctype_f16 HELPER(rinth_exact)(dh_ctype_f16 x, void *fp_status)
+{
+ return float16_round_to_int(x, fp_status);
+}
+
float32 HELPER(rints_exact)(float32 x, void *fp_status)
{
return float32_round_to_int(x, fp_status);
@@ -1079,6 +989,22 @@ float64 HELPER(rintd_exact)(float64 x, void *fp_status)
return float64_round_to_int(x, fp_status);
}
+dh_ctype_f16 HELPER(rinth)(dh_ctype_f16 x, void *fp_status)
+{
+ int old_flags = get_float_exception_flags(fp_status), new_flags;
+ float16 ret;
+
+ ret = float16_round_to_int(x, fp_status);
+
+ /* Suppress any inexact exceptions the conversion produced */
+ if (!(old_flags & float_flag_inexact)) {
+ new_flags = get_float_exception_flags(fp_status);
+ set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
+ }
+
+ return ret;
+}
+
float32 HELPER(rints)(float32 x, void *fp_status)
{
int old_flags = get_float_exception_flags(fp_status), new_flags;