Diffstat (limited to 'target/arm/mve_helper.c')
-rw-r--r--  target/arm/mve_helper.c | 99
1 file changed, 63 insertions, 36 deletions
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index 2b882db..bbbaa53 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -213,7 +213,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
  * For loads, predicated lanes are zeroed instead of retaining
  * their previous values.
  */
-#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN) \
+#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
                           uint32_t base) \
     { \
@@ -230,25 +230,35 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
             addr = ADDRFN(base, m[H##ESIZE(e)]); \
             d[H##ESIZE(e)] = (mask & 1) ? \
                 cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+            if (WB) { \
+                m[H##ESIZE(e)] = addr; \
+            } \
         } \
         mve_advance_vpt(env); \
     }
 
 /* We know here TYPE is unsigned so always the same as the offset type */
-#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN) \
+#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
                           uint32_t base) \
     { \
         TYPE *d = vd; \
         TYPE *m = vm; \
         uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
         unsigned e; \
         uint32_t addr; \
-        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
             addr = ADDRFN(base, m[H##ESIZE(e)]); \
             if (mask & 1) { \
                 cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
             } \
+            if (WB) { \
+                m[H##ESIZE(e)] = addr; \
+            } \
         } \
         mve_advance_vpt(env); \
     }
@@ -258,8 +268,10 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
  * accesses, controlled by the predicate mask for the relevant beat,
  * and with a single 32-bit offset in the first of the two Qm elements.
  * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
+ * Address writeback happens on the odd beats and updates the address
+ * stored in the even-beat element.
  */
-#define DO_VLDR64_SG(OP, ADDRFN) \
+#define DO_VLDR64_SG(OP, ADDRFN, WB) \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
                           uint32_t base) \
     { \
@@ -276,25 +288,35 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
             addr = ADDRFN(base, m[H4(e & ~1)]); \
             addr += 4 * (e & 1); \
             d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+            if (WB && (e & 1)) { \
+                m[H4(e & ~1)] = addr - 4; \
+            } \
         } \
         mve_advance_vpt(env); \
     }
 
-#define DO_VSTR64_SG(OP, ADDRFN) \
+#define DO_VSTR64_SG(OP, ADDRFN, WB) \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
                           uint32_t base) \
     { \
         uint32_t *d = vd; \
         uint32_t *m = vm; \
         uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
         unsigned e; \
         uint32_t addr; \
-        for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
             addr = ADDRFN(base, m[H4(e & ~1)]); \
             addr += 4 * (e & 1); \
             if (mask & 1) { \
                 cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
             } \
+            if (WB && (e & 1)) { \
+                m[H4(e & ~1)] = addr - 4; \
+            } \
         } \
         mve_advance_vpt(env); \
     }
@@ -304,36 +326,41 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
 #define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
 #define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
 
-DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD)
-DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD)
-DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD)
-
-DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD)
-DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD)
-DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD)
-DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD)
-DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD)
-DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD)
-DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD)
-
-DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH)
-DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH)
-DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH)
-DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW)
-DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD)
-
-DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD)
-DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD)
-DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD)
-DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD)
-DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD)
-DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD)
-DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD)
-
-DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH)
-DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint16_t, ADDR_ADD_OSH)
-DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW)
-DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD)
+DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false)
+DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
+
+DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
+
+DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false)
+DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
+
+DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
+DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
+DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true)
+DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
 
 /*
  * The mergemask(D, R, M) macro performs the operation "*D = R" but
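
To make the macro expansion concrete, here is a hand-expanded, standalone C model of the store loop that DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) generates above. It is only a sketch: store32() is a hypothetical stand-in for QEMU's cpu_stl_data_ra(), the two masks are passed in rather than read from CPU state via mve_element_mask()/mve_eci_mask(), and the H##ESIZE() host-endian index swizzle is omitted.

#include <stdint.h>

/* Hypothetical stand-in for QEMU's cpu_stl_data_ra(). */
extern void store32(uint32_t addr, uint32_t data);

/*
 * Sketch of the loop body DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t,
 * ADDR_ADD, true) expands to: a 32-bit scatter store with writeback.
 * With ESIZE == 4 there are 16 / 4 == 4 elements, each owning 4 of the
 * 16 beat-mask bits.
 */
static void vstrw_sg_wb_uw_model(const uint32_t *d, uint32_t *m,
                                 uint32_t base, uint16_t mask,
                                 uint16_t eci_mask)
{
    for (unsigned e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {
        if (!(eci_mask & 1)) {
            continue;            /* beat already done in a prior ECI slice */
        }
        uint32_t addr = base + m[e];    /* ADDR_ADD: offset is unscaled */
        if (mask & 1) {
            store32(addr, d[e]);        /* only unpredicated lanes store */
        }
        m[e] = addr;                    /* WB: offset becomes the address */
    }
}

Note that the two masks do different jobs: mask (predication combined with ECI) gates the memory access, while eci_mask alone gates the loop body and hence the writeback. So a lane suppressed by VPT predication still updates its offset element, but a beat already executed in a previous ECI slice is never re-run.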
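The 64-bit forms add one subtlety, spelled out by the new comment in the diff: each 64-bit element occupies two beats but carries a single 32-bit offset in its even-numbered element, and writeback happens only on the odd beat. Here is a sketch of the vldrd_sg_wb_ud loop under the same assumptions as above (hypothetical load32() in place of cpu_ldl_data_ra(), H4() swizzle omitted):

#include <stdint.h>

/* Hypothetical stand-in for QEMU's cpu_ldl_data_ra(). */
extern uint32_t load32(uint32_t addr);

/*
 * Sketch of the vldrd_sg_wb_ud expansion: four 32-bit beats forming two
 * 64-bit elements. The offset for each element lives in the even element
 * of Qm; the odd beat reads the high word at offset + 4.
 */
static void vldrd_sg_wb_ud_model(uint32_t *d, uint32_t *m, uint32_t base,
                                 uint16_t mask, uint16_t eci_mask)
{
    for (unsigned e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {
        if (!(eci_mask & 1)) {
            continue;
        }
        uint32_t addr = base + m[e & ~1];   /* offset from the even element */
        addr += 4 * (e & 1);                /* odd beat: high word at +4 */
        d[e] = (mask & 1) ? load32(addr) : 0;
        if (e & 1) {
            /*
             * Write back only once per element, on the odd beat, after
             * both halves have consumed the original offset. addr is
             * base + offset + 4 at this point, so the element's address
             * is addr - 4.
             */
            m[e & ~1] = addr - 4;
        }
    }
}

Deferring the writeback to the odd beat is what makes the addr - 4 adjustment necessary, and it also keeps the helper restartable: if ECI splits the instruction between the two beats, the odd beat still finds the original offset in m[e & ~1] on resumption, which an even-beat writeback would have overwritten.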