diff options
Diffstat (limited to 'target/arm/translate-sve.c')
-rw-r--r-- | target/arm/translate-sve.c | 393 |
1 files changed, 328 insertions, 65 deletions
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index 62b5f30..41f8b12 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -1286,6 +1286,19 @@ static bool trans_ADDVL(DisasContext *s, arg_ADDVL *a) return true; } +static bool trans_ADDSVL(DisasContext *s, arg_ADDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a) { if (!dc_isar_feature(aa64_sve, s)) { @@ -1299,6 +1312,19 @@ static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a) return true; } +static bool trans_ADDSPL(DisasContext *s, arg_ADDSPL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_pred_reg_size(s)); + } + return true; +} + static bool trans_RDVL(DisasContext *s, arg_RDVL *a) { if (!dc_isar_feature(aa64_sve, s)) { @@ -1311,6 +1337,18 @@ static bool trans_RDVL(DisasContext *s, arg_RDVL *a) return true; } +static bool trans_RDSVL(DisasContext *s, arg_RDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + tcg_gen_movi_i64(reg, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + /* *** SVE Compute Vector Address Group */ @@ -1320,10 +1358,10 @@ static bool do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn) return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, a->imm); } -TRANS_FEAT(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32) -TRANS_FEAT(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64) -TRANS_FEAT(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32) -TRANS_FEAT(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32) +TRANS_FEAT_NONSTREAMING(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32) +TRANS_FEAT_NONSTREAMING(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64) +TRANS_FEAT_NONSTREAMING(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32) +TRANS_FEAT_NONSTREAMING(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32) /* *** SVE Integer Misc - Unpredicated Group @@ -1333,14 +1371,15 @@ static gen_helper_gvec_2 * const fexpa_fns[4] = { NULL, gen_helper_sve_fexpa_h, gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d, }; -TRANS_FEAT(FEXPA, aa64_sve, gen_gvec_ool_zz, - fexpa_fns[a->esz], a->rd, a->rn, 0) +TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz, + fexpa_fns[a->esz], a->rd, a->rn, 0) static gen_helper_gvec_3 * const ftssel_fns[4] = { NULL, gen_helper_sve_ftssel_h, gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d, }; -TRANS_FEAT(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, ftssel_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, + ftssel_fns[a->esz], a, 0) /* *** SVE Predicate Logical Operations Group @@ -1785,7 +1824,8 @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s) /* Note pat == 31 is #all, to set all elements. */ -TRANS_FEAT(SETFFR, aa64_sve, do_predset, 0, FFR_PRED_NUM, 31, false) +TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve, + do_predset, 0, FFR_PRED_NUM, 31, false) /* Note pat == 32 is #unimp, to set no elements. */ TRANS_FEAT(PFALSE, aa64_sve, do_predset, 0, a->rd, 32, false) @@ -1799,11 +1839,13 @@ static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a) .rd = a->rd, .pg = a->pg, .s = a->s, .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM, }; + + s->is_nonstreaming = true; return trans_AND_pppp(s, &alt_a); } -TRANS_FEAT(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM) -TRANS_FEAT(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn) +TRANS_FEAT_NONSTREAMING(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM) +TRANS_FEAT_NONSTREAMING(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn) static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a, void (*gen_fn)(TCGv_i32, TCGv_ptr, @@ -2533,7 +2575,8 @@ TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, static gen_helper_gvec_3 * const compact_fns[4] = { NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d }; -TRANS_FEAT(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz, compact_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz, + compact_fns[a->esz], a, 0) /* Call the helper that computes the ARM LastActiveElement pseudocode * function, scaled by the element size. This includes the not found @@ -2858,6 +2901,8 @@ TRANS_FEAT(REVH, aa64_sve, gen_gvec_ool_arg_zpz, revh_fns[a->esz], a, 0) TRANS_FEAT(REVW, aa64_sve, gen_gvec_ool_arg_zpz, a->esz == 3 ? gen_helper_sve_revw_d : NULL, a, 0) +TRANS_FEAT(REVD, aa64_sme, gen_gvec_ool_arg_zpz, gen_helper_sme_revd_q, a, 0) + TRANS_FEAT(SPLICE, aa64_sve, gen_gvec_ool_arg_zpzz, gen_helper_sve_splice, a, a->esz) @@ -3856,9 +3901,9 @@ static gen_helper_gvec_3_ptr * const ftmad_fns[4] = { NULL, gen_helper_sve_ftmad_h, gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d, }; -TRANS_FEAT(FTMAD, aa64_sve, gen_gvec_fpst_zzz, - ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) +TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz, + ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) /* *** SVE Floating Point Accumulating Reduction Group @@ -3881,6 +3926,7 @@ static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -3918,12 +3964,18 @@ static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) DO_FP3(FADD_zzz, fadd) DO_FP3(FSUB_zzz, fsub) DO_FP3(FMUL_zzz, fmul) -DO_FP3(FTSMUL, ftsmul) DO_FP3(FRECPS, recps) DO_FP3(FRSQRTS, rsqrts) #undef DO_FP3 +static gen_helper_gvec_3_ptr * const ftsmul_fns[4] = { + NULL, gen_helper_gvec_ftsmul_h, + gen_helper_gvec_ftsmul_s, gen_helper_gvec_ftsmul_d +}; +TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, + ftsmul_fns[a->esz], a, 0) + /* *** SVE Floating Point Arithmetic - Predicated Group */ @@ -4256,7 +4308,8 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, * The load should begin at the address Rn + IMM. */ -static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) +void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) { int len_align = QEMU_ALIGN_DOWN(len, 8); int len_remain = len % 8; @@ -4282,7 +4335,7 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) t0 = tcg_temp_new_i64(); for (i = 0; i < len_align; i += 8) { tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_st_i64(t0, cpu_env, vofs + i); + tcg_gen_st_i64(t0, base, vofs + i); tcg_gen_addi_i64(clean_addr, clean_addr, 8); } tcg_temp_free_i64(t0); @@ -4295,6 +4348,12 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) clean_addr = new_tmp_a64_local(s); tcg_gen_mov_i64(clean_addr, t0); + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + gen_set_label(loop); t0 = tcg_temp_new_i64(); @@ -4302,7 +4361,7 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) tcg_gen_addi_i64(clean_addr, clean_addr, 8); tp = tcg_temp_new_ptr(); - tcg_gen_add_ptr(tp, cpu_env, i); + tcg_gen_add_ptr(tp, base, i); tcg_gen_addi_ptr(i, i, 8); tcg_gen_st_i64(t0, tp, vofs); tcg_temp_free_ptr(tp); @@ -4310,6 +4369,11 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } } /* @@ -4338,13 +4402,14 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) default: g_assert_not_reached(); } - tcg_gen_st_i64(t0, cpu_env, vofs + len_align); + tcg_gen_st_i64(t0, base, vofs + len_align); tcg_temp_free_i64(t0); } } /* Similarly for stores. */ -static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) +void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) { int len_align = QEMU_ALIGN_DOWN(len, 8); int len_remain = len % 8; @@ -4370,7 +4435,7 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) t0 = tcg_temp_new_i64(); for (i = 0; i < len_align; i += 8) { - tcg_gen_ld_i64(t0, cpu_env, vofs + i); + tcg_gen_ld_i64(t0, base, vofs + i); tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); tcg_gen_addi_i64(clean_addr, clean_addr, 8); } @@ -4384,11 +4449,17 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) clean_addr = new_tmp_a64_local(s); tcg_gen_mov_i64(clean_addr, t0); + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + gen_set_label(loop); t0 = tcg_temp_new_i64(); tp = tcg_temp_new_ptr(); - tcg_gen_add_ptr(tp, cpu_env, i); + tcg_gen_add_ptr(tp, base, i); tcg_gen_ld_i64(t0, tp, vofs); tcg_gen_addi_ptr(i, i, 8); tcg_temp_free_ptr(tp); @@ -4399,12 +4470,17 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } } /* Predicate register stores can be any multiple of 2. */ if (len_remain) { t0 = tcg_temp_new_i64(); - tcg_gen_ld_i64(t0, cpu_env, vofs + len_align); + tcg_gen_ld_i64(t0, base, vofs + len_align); switch (len_remain) { case 2: @@ -4436,7 +4512,7 @@ static bool trans_LDR_zri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = vec_full_reg_size(s); int off = vec_full_reg_offset(s, a->rd); - do_ldr(s, off, size, a->rn, a->imm * size); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); } return true; } @@ -4449,7 +4525,7 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = pred_full_reg_size(s); int off = pred_full_reg_offset(s, a->rd); - do_ldr(s, off, size, a->rn, a->imm * size); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); } return true; } @@ -4462,7 +4538,7 @@ static bool trans_STR_zri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = vec_full_reg_size(s); int off = vec_full_reg_offset(s, a->rd); - do_str(s, off, size, a->rn, a->imm * size); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); } return true; } @@ -4475,7 +4551,7 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = pred_full_reg_size(s); int off = pred_full_reg_offset(s, a->rd); - do_str(s, off, size, a->rn, a->imm * size); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); } return true; } @@ -4793,6 +4869,7 @@ static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (sve_access_check(s)) { TCGv_i64 addr = new_tmp_a64(s); tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); @@ -4894,6 +4971,7 @@ static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (sve_access_check(s)) { int vsz = vec_full_reg_size(s); int elements = vsz >> dtype_esz[a->dtype]; @@ -5048,6 +5126,7 @@ static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a) if (a->rm == 31) { return false; } + s->is_nonstreaming = true; if (sve_access_check(s)) { TCGv_i64 addr = new_tmp_a64(s); tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); @@ -5062,6 +5141,7 @@ static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a) if (!dc_isar_feature(aa64_sve_f64mm, s)) { return false; } + s->is_nonstreaming = true; if (sve_access_check(s)) { TCGv_i64 addr = new_tmp_a64(s); tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32); @@ -5657,6 +5737,7 @@ static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5688,6 +5769,7 @@ static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5722,6 +5804,7 @@ static bool trans_LDNT1_zprz(DisasContext *s, arg_LD1_zprz *a) if (!dc_isar_feature(aa64_sve2, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5845,6 +5928,7 @@ static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5875,6 +5959,7 @@ static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) if (!dc_isar_feature(aa64_sve, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5909,6 +5994,7 @@ static bool trans_STNT1_zprz(DisasContext *s, arg_ST1_zprz *a) if (!dc_isar_feature(aa64_sve2, s)) { return false; } + s->is_nonstreaming = true; if (!sve_access_check(s)) { return true; } @@ -5953,6 +6039,17 @@ static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a) return true; } +static bool trans_PRF_ns(DisasContext *s, arg_PRF_ns *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + s->is_nonstreaming = true; + (void)sve_access_check(s); + return true; +} + /* * Move Prefix * @@ -6181,9 +6278,13 @@ static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel) gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h, NULL, gen_helper_sve2_pmull_d, }; - if (a->esz == 0 - ? !dc_isar_feature(aa64_sve2_pmull128, s) - : !dc_isar_feature(aa64_sve, s)) { + + if (a->esz == 0) { + if (!dc_isar_feature(aa64_sve2_pmull128, s)) { + return false; + } + s->is_nonstreaming = true; + } else if (!dc_isar_feature(aa64_sve, s)) { return false; } return gen_gvec_ool_arg_zzz(s, fns[a->esz], a, sel); @@ -6371,22 +6472,22 @@ static gen_helper_gvec_3 * const bext_fns[4] = { gen_helper_sve2_bext_b, gen_helper_sve2_bext_h, gen_helper_sve2_bext_s, gen_helper_sve2_bext_d, }; -TRANS_FEAT(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bext_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bext_fns[a->esz], a, 0) static gen_helper_gvec_3 * const bdep_fns[4] = { gen_helper_sve2_bdep_b, gen_helper_sve2_bdep_h, gen_helper_sve2_bdep_s, gen_helper_sve2_bdep_d, }; -TRANS_FEAT(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bdep_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bdep_fns[a->esz], a, 0) static gen_helper_gvec_3 * const bgrp_fns[4] = { gen_helper_sve2_bgrp_b, gen_helper_sve2_bgrp_h, gen_helper_sve2_bgrp_s, gen_helper_sve2_bgrp_d, }; -TRANS_FEAT(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bgrp_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bgrp_fns[a->esz], a, 0) static gen_helper_gvec_3 * const cadd_fns[4] = { gen_helper_sve2_cadd_b, gen_helper_sve2_cadd_h, @@ -7094,21 +7195,21 @@ DO_SVE2_ZZZ_NARROW(RSUBHNT, rsubhnt) static gen_helper_gvec_flags_4 * const match_fns[4] = { gen_helper_sve2_match_ppzz_b, gen_helper_sve2_match_ppzz_h, NULL, NULL }; -TRANS_FEAT(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz]) +TRANS_FEAT_NONSTREAMING(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz]) static gen_helper_gvec_flags_4 * const nmatch_fns[4] = { gen_helper_sve2_nmatch_ppzz_b, gen_helper_sve2_nmatch_ppzz_h, NULL, NULL }; -TRANS_FEAT(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz]) +TRANS_FEAT_NONSTREAMING(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz]) static gen_helper_gvec_4 * const histcnt_fns[4] = { NULL, NULL, gen_helper_sve2_histcnt_s, gen_helper_sve2_histcnt_d }; -TRANS_FEAT(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz, - histcnt_fns[a->esz], a, 0) +TRANS_FEAT_NONSTREAMING(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz, + histcnt_fns[a->esz], a, 0) -TRANS_FEAT(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz, - a->esz == 0 ? gen_helper_sve2_histseg : NULL, a, 0) +TRANS_FEAT_NONSTREAMING(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz, + a->esz == 0 ? gen_helper_sve2_histseg : NULL, a, 0) DO_ZPZZ_FP(FADDP, aa64_sve2, sve2_faddp_zpzz) DO_ZPZZ_FP(FMAXNMP, aa64_sve2, sve2_fmaxnmp_zpzz) @@ -7120,10 +7221,12 @@ DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz) * SVE Integer Multiply-Add (unpredicated) */ -TRANS_FEAT(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, gen_helper_fmmla_s, - a->rd, a->rn, a->rm, a->ra, 0, FPST_FPCR) -TRANS_FEAT(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, gen_helper_fmmla_d, - a->rd, a->rn, a->rm, a->ra, 0, FPST_FPCR) +TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) +TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = { NULL, gen_helper_sve2_sqdmlal_zzzw_h, @@ -7220,20 +7323,21 @@ TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz, TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0) -TRANS_FEAT(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, - gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt) +TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, + gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt) -TRANS_FEAT(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz, - gen_helper_crypto_aese, a, false) -TRANS_FEAT(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz, - gen_helper_crypto_aese, a, true) +TRANS_FEAT_NONSTREAMING(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, false) +TRANS_FEAT_NONSTREAMING(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, true) -TRANS_FEAT(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, - gen_helper_crypto_sm4e, a, 0) -TRANS_FEAT(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, - gen_helper_crypto_sm4ekey, a, 0) +TRANS_FEAT_NONSTREAMING(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4e, a, 0) +TRANS_FEAT_NONSTREAMING(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4ekey, a, 0) -TRANS_FEAT(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz, gen_gvec_rax1, a) +TRANS_FEAT_NONSTREAMING(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz, + gen_gvec_rax1, a) TRANS_FEAT(FCVTNT_sh, aa64_sve2, gen_gvec_fpst_arg_zpz, gen_helper_sve2_fcvtnt_sh, a, 0, FPST_FPCR) @@ -7284,20 +7388,20 @@ TRANS_FEAT(FMLALT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, true) TRANS_FEAT(FMLSLB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, false) TRANS_FEAT(FMLSLT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, true) -TRANS_FEAT(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_smmla_b, a, 0) -TRANS_FEAT(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_usmmla_b, a, 0) -TRANS_FEAT(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_ummla_b, a, 0) +TRANS_FEAT_NONSTREAMING(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_smmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_usmmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_ummla_b, a, 0) TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, gen_helper_gvec_bfdot, a, 0) TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz, gen_helper_gvec_bfdot_idx, a) -TRANS_FEAT(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_bfmmla, a, 0) +TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_bfmmla, a, 0) static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) { @@ -7317,3 +7421,162 @@ static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true) + +static bool trans_PSEL(DisasContext *s, arg_psel *a) +{ + int vl = vec_full_reg_size(s); + int pl = pred_gvec_reg_size(s); + int elements = vl >> a->esz; + TCGv_i64 tmp, didx, dbit; + TCGv_ptr ptr; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + dbit = tcg_temp_new_i64(); + didx = tcg_temp_new_i64(); + ptr = tcg_temp_new_ptr(); + + /* Compute the predicate element. */ + tcg_gen_addi_i64(tmp, cpu_reg(s, a->rv), a->imm); + if (is_power_of_2(elements)) { + tcg_gen_andi_i64(tmp, tmp, elements - 1); + } else { + tcg_gen_remu_i64(tmp, tmp, tcg_constant_i64(elements)); + } + + /* Extract the predicate byte and bit indices. */ + tcg_gen_shli_i64(tmp, tmp, a->esz); + tcg_gen_andi_i64(dbit, tmp, 7); + tcg_gen_shri_i64(didx, tmp, 3); + if (HOST_BIG_ENDIAN) { + tcg_gen_xori_i64(didx, didx, 7); + } + + /* Load the predicate word. */ + tcg_gen_trunc_i64_ptr(ptr, didx); + tcg_gen_add_ptr(ptr, ptr, cpu_env); + tcg_gen_ld8u_i64(tmp, ptr, pred_full_reg_offset(s, a->pm)); + + /* Extract the predicate bit and replicate to MO_64. */ + tcg_gen_shr_i64(tmp, tmp, dbit); + tcg_gen_andi_i64(tmp, tmp, 1); + tcg_gen_neg_i64(tmp, tmp); + + /* Apply to either copy the source, or write zeros. */ + tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd), + pred_full_reg_offset(s, a->pn), tmp, pl, pl); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(dbit); + tcg_temp_free_i64(didx); + tcg_temp_free_ptr(ptr); + return true; +} + +static void gen_sclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_smax_i32(d, a, n); + tcg_gen_smin_i32(d, d, m); +} + +static void gen_sclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_smax_i64(d, a, n); + tcg_gen_smin_i64(d, d, m); +} + +static void gen_sclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_smax_vec(vece, d, a, n); + tcg_gen_smin_vec(vece, d, d, m); +} + +static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_sclamp_i32, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_sclamp_i64, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a) + +static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_umax_i32(d, a, n); + tcg_gen_umin_i32(d, d, m); +} + +static void gen_uclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_umax_i64(d, a, n); + tcg_gen_umin_i64(d, d, m); +} + +static void gen_uclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_umax_vec(vece, d, a, n); + tcg_gen_umin_vec(vece, d, d, m); +} + +static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_uclamp_i32, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_uclamp_i64, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a) |