Diffstat (limited to 'target')
-rw-r--r-- | target/arm/cpu.c | 18
-rw-r--r-- | target/arm/helper-sve.h | 294
-rw-r--r-- | target/arm/helper.h | 19
-rw-r--r-- | target/arm/sve.decode | 248
-rw-r--r-- | target/arm/sve_helper.c | 1250
-rw-r--r-- | target/arm/translate-a64.h | 26
-rw-r--r-- | target/arm/translate-sve.c | 1458
-rw-r--r-- | target/arm/translate.c | 43
-rw-r--r-- | target/arm/vec_helper.c | 69
-rw-r--r-- | target/microblaze/mmu.c | 1
-rw-r--r-- | target/microblaze/translate.c | 15
-rw-r--r-- | target/s390x/cpu_models.c | 1
12 files changed, 3416 insertions(+), 26 deletions(-)
diff --git a/target/arm/cpu.c b/target/arm/cpu.c index ab047b9..e1de45e 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -767,6 +767,24 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) return; } +#ifndef CONFIG_USER_ONLY + /* The NVIC and M-profile CPU are two halves of a single piece of + * hardware; trying to use one without the other is a command line + * error and will result in segfaults if not caught here. + */ + if (arm_feature(env, ARM_FEATURE_M)) { + if (!env->nvic) { + error_setg(errp, "This board cannot be used with Cortex-M CPUs"); + return; + } + } else { + if (env->nvic) { + error_setg(errp, "This board can only be used with Cortex-M CPUs"); + return; + } + } +#endif + cpu_exec_realizefn(cs, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h index 94f4356..2e76084 100644 --- a/target/arm/helper-sve.h +++ b/target/arm/helper-sve.h @@ -195,6 +195,15 @@ DEF_HELPER_FLAGS_5(sve_lsl_zpzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(sve_lsl_zpzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG, @@ -416,6 +425,230 @@ DEF_HELPER_FLAGS_4(sve_cpy_z_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(sve_ext, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_insr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_3(sve_rev_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_tbl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_sunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_sunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_sunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_uunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_uunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_uunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_zip_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_punpk_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_zip_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_h, 
TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_uzp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_trn_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_compact_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_compact_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_2(sve_last_active_element, TCG_CALL_NO_RWG, s32, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revb_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revb_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revb_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_rbit_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_splice, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) 
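For readers not steeped in QEMU's helper machinery: each DEF_HELPER_FLAGS_N line above is consumed by exec/helper-proto.h and expands into an ordinary C prototype named helper_<name>, while TCG_CALL_NO_RWG marks the helper as neither reading nor writing TCG globals so the optimizer may keep those globals in registers across the call. A sketch of the expansion, illustrative rather than part of the patch; note the compare helpers return uint32_t because they hand back PREDTEST-style NZCV flags, unlike the void logical helpers:

    /* DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_b, TCG_CALL_NO_RWG,
     *                    i32, ptr, ptr, ptr, ptr, i32)
     * expands to approximately: */
    uint32_t helper_sve_cmpeq_ppzz_b(void *vd, void *vn, void *vm,
                                     void *vg, uint32_t desc);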
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) 
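The _ppzi_ (compare with immediate) helpers take one pointer fewer than the _ppzz_ forms because the immediate is not a separate argument: the translator packs it into the 32-bit simd descriptor and the helper recovers it with simd_data(), as the DO_CMP_PPZI expander in sve_helper.c below shows. A minimal sketch of both halves, with oprsz/maxsz/imm as placeholder names:

    /* Translator side: fold the immediate into the descriptor... */
    uint32_t desc = simd_desc(oprsz, maxsz, imm);
    /* ...helper side: recover the operand size and immediate again. */
    intptr_t opr_sz = simd_oprsz(desc);
    int32_t mm = simd_data(desc);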
+DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) @@ -425,3 +658,64 @@ DEF_HELPER_FLAGS_5(sve_orn_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_nor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_nand_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_brkpa, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpb, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpas, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpbs, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32) 
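As with the compares, the flag-setting BRKPAS/BRKPBS variants differ from BRKPA/BRKPB only in return type: the *s forms hand back the NZCV flags computed by the predicate-test iteration, the plain forms return void. The prototypes generated from the two pairs of declarations above look roughly like:

    void     helper_sve_brkpa(void *vd, void *vn, void *vm,
                              void *vg, uint32_t desc);
    uint32_t helper_sve_brkpas(void *vd, void *vn, void *vm,
                               void *vg, uint32_t desc);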
+ +DEF_HELPER_FLAGS_4(sve_brka_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkb_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brka_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkb_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_brkas_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkbs_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkas_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkbs_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_while, TCG_CALL_NO_RWG, i32, ptr, i32, i32) + +DEF_HELPER_FLAGS_4(sve_subri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_smaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_smini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_umaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_umini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_5(gvec_recps_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_recps_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_recps_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_rsqrts_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) diff --git a/target/arm/helper.h b/target/arm/helper.h index 0c6a144..879a722 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -601,6 +601,25 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) 
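Note the new gvec_f* declarations land in helper.h rather than helper-sve.h, so the same unpredicated float helpers can be shared between SVE and AdvSIMD, and their fourth pointer is the float_status to use, not a predicate. A sketch of the shape of one such helper as implemented in vec_helper.c (the actual code stamps these out with a DO_3OP macro; float32_add and clear_tail are existing softfloat/vec_helper routines):

    void HELPER(gvec_fadd_s)(void *vd, void *vn, void *vm,
                             void *stat, uint32_t desc)
    {
        intptr_t i, oprsz = simd_oprsz(desc);
        float32 *d = vd, *n = vn, *m = vm;

        for (i = 0; i < oprsz / sizeof(float32); i++) {
            d[i] = float32_add(n[i], m[i], stat);
        }
        clear_tail(d, oprsz, simd_maxsz(desc));
    }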
+DEF_HELPER_FLAGS_5(gvec_fsub_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + #ifdef TARGET_AARCH64 #include "helper-a64.h" #include "helper-sve.h" diff --git a/target/arm/sve.decode b/target/arm/sve.decode index 4761d19..6f436f9 100644 --- a/target/arm/sve.decode +++ b/target/arm/sve.decode @@ -24,6 +24,7 @@ %imm4_16_p1 16:4 !function=plus1 %imm6_22_5 22:1 5:5 +%imm7_22_16 22:2 16:5 %imm8_16_10 16:5 10:3 %imm9_16_10 16:s6 10:3 @@ -41,6 +42,8 @@ # Signed 8-bit immediate, optionally shifted left by 8. %sh8_i8s 5:9 !function=expand_imm_sh8s +# Unsigned 8-bit immediate, optionally shifted left by 8. +%sh8_i8u 5:9 !function=expand_imm_sh8u # Either a copy of rd (at bit 0), or a different source # as propagated via the MOVPRFX instruction. @@ -58,6 +61,7 @@ &rri_esz rd rn imm esz &rrr_esz rd rn rm esz &rpr_esz rd pg rn esz +&rpr_s rd pg rn s &rprr_s rd pg rn rm s &rprr_esz rd pg rn rm esz &rprrr_esz rd pg rn rm ra esz @@ -65,6 +69,8 @@ &ptrue rd esz pat s &incdec_cnt rd pat esz imm d u &incdec2_cnt rd rn pat esz imm d u +&incdec_pred rd pg esz d u +&incdec2_pred rd rn pg esz d u ########################################################################### # Named instruction formats. These are generally used to @@ -77,6 +83,9 @@ @pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz @rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz +# Two operand with governing predicate, flags setting +@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s + # Three operand with unused vector element size @rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 @@ -85,6 +94,15 @@ # Three operand, vector element size @rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz +@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz +@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx +@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \ + &rri_esz rn=%reg_movprfx imm=%sh8_i8u +@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \ + &rri_esz rn=%reg_movprfx +@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \ + &rri_esz rn=%reg_movprfx # Three operand with "memory" size, aka immediate left shift @rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri @@ -94,6 +112,8 @@ &rprr_esz rn=%reg_movprfx @rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ &rprr_esz rm=%reg_movprfx +@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz +@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz # Three register operand, with governing predicate, vector element size @rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \ @@ -103,6 +123,7 @@ # One register operand, with governing predicate, vector element size @rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz +@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz # Two register operands with a 6-bit signed immediate. @rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri @@ -125,6 +146,11 @@ @rdn_dbm ........ .. .... 
dbm:13 rd:5 \ &rr_dbm rn=%reg_movprfx +# Predicate output, vector and immediate input, +# controlling predicate, element size. +@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz +@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz + # Basic Load/Store with 9-bit immediate offset @pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ &rri imm=%imm9_16_10 @@ -138,6 +164,12 @@ @incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx +# One register, predicate. +# User must fill in U and D. +@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred +@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ + &incdec2_pred rn=%reg_movprfx + ########################################################################### # Instruction patterns. Grouped according to the SVE encodingindex.xhtml. @@ -369,6 +401,145 @@ CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s EXT 00000101 001 ..... 000 ... rm:5 rd:5 \ &rrri rn=%reg_movprfx imm=%imm8_16_10 +### SVE Permute - Unpredicated Group + +# SVE broadcast general register +DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn + +# SVE broadcast indexed element +DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ + &rri imm=%imm7_22_16 + +# SVE insert SIMD&FP scalar register +INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm + +# SVE insert general register +INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm + +# SVE reverse vector elements +REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn + +# SVE vector table lookup +TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm + +# SVE unpack vector elements +UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5 + +### SVE Permute - Predicates Group + +# SVE permute predicate elements +ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm +ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm +UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm +UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm +TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm +TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm + +# SVE reverse predicate elements +REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... @pd_pn + +# SVE unpack predicate elements +PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0 +PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0 + +### SVE Permute - Interleaving Group + +# SVE permute vector elements +ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm +ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm +UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm +UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm +TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm +TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm + +### SVE Permute - Predicated Group + +# SVE compress active elements +# Note esz >= 2 +COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn + +# SVE conditionally broadcast element to vector +CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm +CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm + +# SVE conditionally copy element to SIMD&FP scalar +CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn +CLASTB_v 00000101 .. 10101 1 100 ... ..... ..... @rd_pg_rn + +# SVE conditionally copy element to general register +CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn +CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... 
@rd_pg_rn + +# SVE copy element to SIMD&FP scalar register +LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn +LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn + +# SVE copy element to general register +LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn +LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn + +# SVE copy element from SIMD&FP scalar register +CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn + +# SVE copy element from general register to vector (predicated) +CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn + +# SVE reverse within elements +# Note esz >= operation size +REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn +REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn +REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn +RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn + +# SVE vector splice (predicated) +SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm + +### SVE Select Vectors Group + +# SVE select vector elements (predicated) +SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm + +### SVE Integer Compare - Vectors Group + +# SVE integer compare_vectors +CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm +CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm +CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm + +# SVE integer compare with wide elements +# Note these require esz != 3. +CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_rm +CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm +CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm + +### SVE Integer Compare - Unsigned Immediate Group + +# SVE integer compare with unsigned immediate +CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7 +CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7 +CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7 +CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7 + +### SVE Integer Compare - Signed Immediate Group + +# SVE integer compare with signed immediate +CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5 +CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5 +CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5 +CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5 +CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5 +CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5 + ### SVE Predicate Logical Operations Group # SVE predicate logical operations @@ -410,6 +581,83 @@ PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 # SVE predicate next active PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn +### SVE Partition Break Group + +# SVE propagate break from previous partition +BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... 
@pd_pg_pn_pm_s +BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s + +# SVE partition break condition +BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKA_m 00100101 0. 01000001 .... 0 .... 1 .... @pd_pg_pn_s +BRKB_m 00100101 1. 01000001 .... 0 .... 1 .... @pd_pg_pn_s + +# SVE propagate break to next partition +BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s + +### SVE Predicate Count Group + +# SVE predicate count +CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn + +# SVE inc/dec register by predicate count +INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 + +# SVE inc/dec vector by predicate count +INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1 + +# SVE saturating inc/dec register by predicate count +SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred +SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred + +# SVE saturating inc/dec vector by predicate count +SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred + +### SVE Integer Compare - Scalars Group + +# SVE conditionally terminate scalars +CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 + +# SVE integer compare scalar count and limit +WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 1 rn:5 eq:1 rd:4 + +### SVE Integer Wide Immediate - Unpredicated Group + +# SVE broadcast floating-point immediate (unpredicated) +FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5 + +# SVE broadcast integer immediate (unpredicated) +DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s + +# SVE integer add/subtract immediate (unpredicated) +ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u +SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u +SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u +SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u +UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u +SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u +UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u + +# SVE integer min/max immediate (unpredicated) +SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s +UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u +SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s +UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u + +# SVE integer multiply immediate (unpredicated) +MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s + +### SVE Floating Point Arithmetic - Unpredicated Group + +# SVE floating-point arithmetic (unpredicated) +FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... @rd_rn_rm +FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm +FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm +FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm +FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm +FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm + ### SVE Memory - 32-bit Gather and Unsized Contiguous Group # SVE load predicate register diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index b825e44..128bbf9 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -74,6 +74,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) return flags; } +/* This is an iterative function, called for each Pd and Pg word + * moving backward. 
+ */ +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute C from first (i.e last) !(D & G). + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ + flags |= (d & pow2floor(g)) == 0; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute N from last (i.e first) D & G. Replace previous. */ + flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); + } + return flags; +} + /* The same for a single word predicate. */ uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) { @@ -238,6 +260,26 @@ static inline uint64_t expand_pred_s(uint8_t byte) return word[byte & 0x11]; } +/* Swap 16-bit words within a 32-bit word. */ +static inline uint32_t hswap32(uint32_t h) +{ + return rol32(h, 16); +} + +/* Swap 16-bit words within a 64-bit word. */ +static inline uint64_t hswap64(uint64_t h) +{ + uint64_t m = 0x0000ffff0000ffffull; + h = rol64(h, 32); + return ((h & m) << 16) | ((h >> 16) & m); +} + +/* Swap 32-bit words within a 64-bit word. */ +static inline uint64_t wswap64(uint64_t h) +{ + return rol64(h, 32); +} + #define LOGICAL_PPPP(NAME, FUNC) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ { \ @@ -616,6 +658,20 @@ DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) +DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) +DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) +DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) + +DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) +DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) + +DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) + +DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) +DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) +DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) +DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) + /* Three-operand expander, unpredicated, in which the third operand is "wide". */ #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ @@ -748,6 +804,46 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_VPZ #undef DO_VPZ_D +/* Two vector operand, one scalar operand, unpredicated. 
*/ +#define DO_ZZI(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + TYPE s = s64, *d = vd, *n = vn; \ + for (i = 0; i < opr_sz; ++i) { \ + d[i] = OP(n[i], s); \ + } \ +} + +#define DO_SUBR(X, Y) (Y - X) + +DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) +DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) +DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) +DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) + +DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) +DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) +DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) +DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) + +DO_ZZI(sve_smini_b, int8_t, DO_MIN) +DO_ZZI(sve_smini_h, int16_t, DO_MIN) +DO_ZZI(sve_smini_s, int32_t, DO_MIN) +DO_ZZI(sve_smini_d, int64_t, DO_MIN) + +DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) +DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) +DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) +DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) + +DO_ZZI(sve_umini_b, uint8_t, DO_MIN) +DO_ZZI(sve_umini_h, uint16_t, DO_MIN) +DO_ZZI(sve_umini_s, uint32_t, DO_MIN) +DO_ZZI(sve_umini_d, uint64_t, DO_MIN) + +#undef DO_ZZI + #undef DO_AND #undef DO_ORR #undef DO_EOR @@ -762,6 +858,7 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_ASR #undef DO_LSR #undef DO_LSL +#undef DO_SUBR /* Similar to the ARM LastActiveElement pseudocode function, except the result is multiplied by the element size. This includes the not found @@ -1560,3 +1657,1156 @@ void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) memcpy(vd + n_siz, &tmp, n_ofs); } } + +#define DO_INSR(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ + *(TYPE *)(vd + H(0)) = val; \ +} + +DO_INSR(sve_insr_b, uint8_t, H1) +DO_INSR(sve_insr_h, uint16_t, H1_2) +DO_INSR(sve_insr_s, uint32_t, H1_4) +DO_INSR(sve_insr_d, uint64_t, ) + +#undef DO_INSR + +void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = bswap64(b); + *(uint64_t *)(vd + j) = bswap64(f); + } +} + +void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = hswap64(b); + *(uint64_t *)(vd + j) = hswap64(f); + } +} + +void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = rol64(b, 32); + *(uint64_t *)(vd + j) = rol64(f, 32); + } +} + +void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = b; + *(uint64_t *)(vd + j) = f; + } +} + +#define DO_TBL(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + uintptr_t elem = opr_sz / sizeof(TYPE); \ + TYPE *d = vd, *n = vn, *m = vm; \ + ARMVectorReg tmp; \ + if (unlikely(vd == vn)) { \ + 
n = memcpy(&tmp, vn, opr_sz); \ + } \ + for (i = 0; i < elem; i++) { \ + TYPE j = m[H(i)]; \ + d[H(i)] = j < elem ? n[H(j)] : 0; \ + } \ +} + +DO_TBL(sve_tbl_b, uint8_t, H1) +DO_TBL(sve_tbl_h, uint16_t, H2) +DO_TBL(sve_tbl_s, uint32_t, H4) +DO_TBL(sve_tbl_d, uint64_t, ) + +#undef TBL + +#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd; \ + TYPES *n = vn; \ + ARMVectorReg tmp; \ + if (unlikely(vn - vd < opr_sz)) { \ + n = memcpy(&tmp, n, opr_sz / 2); \ + } \ + for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ + d[HD(i)] = n[HS(i)]; \ + } \ +} + +DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) +DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) +DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4) + +DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) +DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) +DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4) + +#undef DO_UNPK + +/* Mask of bits included in the even numbered predicates of width esz. + * We also use this for expand_bits/compress_bits, and so extend the + * same pattern out to 16-bit units. + */ +static const uint64_t even_bit_esz_masks[5] = { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0f0f0f0f0f0f0f0full, + 0x00ff00ff00ff00ffull, + 0x0000ffff0000ffffull, +}; + +/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_shuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits. + */ +static uint64_t expand_bits(uint64_t x, int n) +{ + int i; + + x &= 0xffffffffu; + for (i = 4; i >= n; i--) { + int sh = 1 << i; + x = ((x << sh) | x) & even_bit_esz_masks[i]; + } + return x; +} + +/* Compress units of 2**(N+1) bits to units of 2**N bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_unshuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. + */ +static uint64_t compress_bits(uint64_t x, int n) +{ + int i; + + for (i = n; i <= 4; i++) { + int sh = 1 << i; + x &= even_bit_esz_masks[i]; + x = (x >> sh) | x; + } + return x & 0xffffffffu; +} + +void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + uint64_t mm = *(uint64_t *)vm; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + mm = extract64(mm, high * half, half); + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[0] = nn + (mm << (1 << esz)); + } else { + ARMPredicateReg tmp_n, tmp_m; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. 
*/ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if ((vm - vd) < (uintptr_t)oprsz) { + vm = memcpy(&tmp_m, vm, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((high & 3) == 0) { + uint32_t *n = vn, *m = vm; + high >>= 2; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = n[H4(high + i)]; + uint64_t mm = m[H4(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[i] = nn + (mm << (1 << esz)); + } + } else { + uint8_t *n = vn, *m = vm; + uint16_t *d16 = vd; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + uint16_t mm = m[H1(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d16[H2(i)] = nn + (mm << (1 << esz)); + } + } + } +} + +void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz; + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t l, h; + intptr_t i; + + if (oprsz <= 8) { + l = compress_bits(n[0] >> odd, esz); + h = compress_bits(m[0] >> odd, esz); + d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz); + } else { + ARMPredicateReg tmp_m; + intptr_t oprsz_16 = oprsz / 16; + + if ((vm - vd) < (uintptr_t)oprsz) { + m = memcpy(&tmp_m, vm, oprsz); + } + + for (i = 0; i < oprsz_16; i++) { + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l + (h << 32); + } + + /* For VL which is not a power of 2, the results from M do not + align nicely with the uint64_t for D. Put the aligned results + from M into TMP_M and then copy it into place afterward. */ + if (oprsz & 15) { + d[i] = compress_bits(n[2 * i] >> odd, esz); + + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l + (h << 32); + } + tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz); + + swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); + } else { + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[oprsz_16 + i] = l + (h << 32); + } + } + } +} + +void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t mask; + int shr, shl; + intptr_t i; + + shl = 1 << esz; + shr = 0; + mask = even_bit_esz_masks[esz]; + if (odd) { + mask <<= shl; + shr = shl; + shl = 0; + } + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = (n[i] & mask) >> shr; + uint64_t mm = (m[i] & mask) << shl; + d[i] = nn + mm; + } +} + +/* Reverse units of 2**N bits. 
*/ +static uint64_t reverse_bits_64(uint64_t x, int n) +{ + int i, sh; + + x = bswap64(x); + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + uint64_t mask = even_bit_esz_masks[i]; + x = ((x & mask) << sh) | ((x >> sh) & mask); + } + return x; +} + +static uint8_t reverse_bits_8(uint8_t x, int n) +{ + static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; + int i, sh; + + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); + } + return x; +} + +void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + intptr_t i, oprsz_2 = oprsz / 2; + + if (oprsz <= 8) { + uint64_t l = *(uint64_t *)vn; + l = reverse_bits_64(l << (64 - 8 * oprsz), esz); + *(uint64_t *)vd = l; + } else if ((oprsz & 15) == 0) { + for (i = 0; i < oprsz_2; i += 8) { + intptr_t ih = oprsz - 8 - i; + uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); + uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); + *(uint64_t *)(vd + i) = h; + *(uint64_t *)(vd + ih) = l; + } + } else { + for (i = 0; i < oprsz_2; i += 1) { + intptr_t il = H1(i); + intptr_t ih = H1(oprsz - 1 - i); + uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); + uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); + *(uint8_t *)(vd + il) = h; + *(uint8_t *)(vd + ih) = l; + } + } +} + +void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + nn = expand_bits(nn, 0); + d[0] = nn; + } else { + ARMPredicateReg tmp_n; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. */ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((high & 3) == 0) { + uint32_t *n = vn; + high >>= 2; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = n[H4(high + i)]; + d[i] = expand_bits(nn, 0); + } + } else { + uint16_t *d16 = vd; + uint8_t *n = vn; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + d16[H2(i)] = expand_bits(nn, 0); + } + } + } +} + +#define DO_ZIP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t i, oprsz_2 = oprsz / 2; \ + ARMVectorReg tmp_n, tmp_m; \ + /* We produce output faster than we consume input. \ + Therefore we must be mindful of possible overlap. 
*/ \ + if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ + vn = memcpy(&tmp_n, vn, oprsz_2); \ + } \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz_2); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \ + } \ +} + +DO_ZIP(sve_zip_b, uint8_t, H1) +DO_ZIP(sve_zip_h, uint16_t, H1_2) +DO_ZIP(sve_zip_s, uint32_t, H1_4) +DO_ZIP(sve_zip_d, uint64_t, ) + +#define DO_UZP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t oprsz_2 = oprsz / 2; \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + ARMVectorReg tmp_m; \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \ + } \ +} + +DO_UZP(sve_uzp_b, uint8_t, H1) +DO_UZP(sve_uzp_h, uint16_t, H1_2) +DO_UZP(sve_uzp_s, uint32_t, H1_4) +DO_UZP(sve_uzp_d, uint64_t, ) + +#define DO_TRN(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ + TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ + TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ + *(TYPE *)(vd + H(i + 0)) = ae; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ + } \ +} + +DO_TRN(sve_trn_b, uint8_t, H1) +DO_TRN(sve_trn_h, uint16_t, H1_2) +DO_TRN(sve_trn_s, uint32_t, H1_4) +DO_TRN(sve_trn_d, uint64_t, ) + +#undef DO_ZIP +#undef DO_UZP +#undef DO_TRN + +void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { + d[H4(j)] = n[H4(i)]; + j++; + } + } + for (; j < opr_sz; j++) { + d[H4(j)] = 0; + } +} + +void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + d[j] = n[i]; + j++; + } + } + for (; j < opr_sz; j++) { + d[j] = 0; + } +} + +/* Similar to the ARM LastActiveElement pseudocode function, except the + * result is multiplied by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + + return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz); +} + +void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc) / 8; + int esz = simd_data(desc); + uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; + intptr_t i, first_i, last_i; + ARMVectorReg tmp; + + first_i = last_i = 0; + first_g = last_g = 0; + + /* Find the extent of the active elements within VG. 
*/ + for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { + pg = *(uint64_t *)(vg + i) & mask; + if (pg) { + if (last_g == 0) { + last_g = pg; + last_i = i; + } + first_g = pg; + first_i = i; + } + } + + len = 0; + if (first_g != 0) { + first_i = first_i * 8 + ctz64(first_g); + last_i = last_i * 8 + 63 - clz64(last_g); + len = last_i - first_i + (1 << esz); + if (vd == vm) { + vm = memcpy(&tmp, vm, opr_sz * 8); + } + swap_memmove(vd, vn + first_i, len); + } + swap_memmove(vd + len, vm, opr_sz * 8 - len); +} + +void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + d[i] = (pg[H1(i)] & 1 ? nn : mm); + } +} + +/* Two operand comparison controlled by a predicate. + * ??? It is very tempting to want to be able to expand this inline + * with x86 instructions, e.g. + * + * vcmpeqw zm, zn, %ymm0 + * vpmovmskb %ymm0, %eax + * and $0x5555, %eax + * and pg, %eax + * + * or even aarch64, e.g. + * + * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 + * cmeq v0.8h, zn, zm + * and v0.8h, v0.8h, mask + * addv h0, v0.8h + * and v0.8b, pg + * + * However, coming up with an abstraction that allows vector inputs and + * a scalar output, and also handles the byte-ordering of sub-uint64_t + * scalar outputs, is tricky. 
+ */ +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull) + +DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) +DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) +DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) +DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) + +DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) +DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) +DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) +DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) + +DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) +DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) +DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) +DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) + +DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) +DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) +DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) +DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) + +DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) +DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) +DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) +DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) + +DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) +DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) +DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) +DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) + +#undef DO_CMP_PPZZ_B +#undef DO_CMP_PPZZ_H +#undef DO_CMP_PPZZ_S +#undef DO_CMP_PPZZ_D +#undef DO_CMP_PPZZ + +/* Similar, but the second source is "wide". 
*/ +#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + TYPEW mm = *(TYPEW *)(vm + i - 8); \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 7); \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) + +DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==) +DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==) +DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==) + +DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=) +DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=) +DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=) + +DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) +DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) +DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) + +DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) +DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) +DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) + +DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) +DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) +DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) + +DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) +DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) +DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) + +DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) +DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) +DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) + +DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) +DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) +DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) + +DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) +DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) +DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) + +DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) +DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) +DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) + +#undef DO_CMP_PPZW_B +#undef DO_CMP_PPZW_H +#undef DO_CMP_PPZW_S +#undef DO_CMP_PPZW + +/* Similar, but the second source is immediate. 
*/ +#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + TYPE mm = simd_data(desc); \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull) + +DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) +DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) +DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) +DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) + +DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) +DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) +DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) +DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) + +DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) +DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) +DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) +DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) + +DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) +DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) +DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) +DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) + +DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) +DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) +DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) +DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) + +DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) +DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) +DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) +DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) + +DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) +DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) +DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) +DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) + +DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) +DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) +DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) +DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) + +DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) +DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) +DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) +DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) + +DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) +DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) +DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) +DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) + +#undef DO_CMP_PPZI_B +#undef DO_CMP_PPZI_H +#undef DO_CMP_PPZI_S +#undef DO_CMP_PPZI_D +#undef DO_CMP_PPZI + +/* Similar to the ARM LastActive pseudocode function. */ +static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) +{ + intptr_t i; + + for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { + uint64_t pg = *(uint64_t *)(vg + i); + if (pg) { + return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; + } + } + return 0; +} + +/* Compute a mask into RETB that is true for all G, up to and including + * (if after) or excluding (if !after) the first G & N. + * Return true if BRK found. 
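+ *
+ * This relies on the two's-complement identities: B & -B isolates the
+ * lowest set bit, B | (B - 1) also sets every bit below it, and B - 1
+ * sets only the bits below it.  E.g. with G = 0xff (all eight lanes of
+ * this word active) and N = 0x28 (first true in lane 3):
+ *     b = g & n;           -> 0x28
+ *     b = b & -b;          -> 0x08  (the first match)
+ *     after:  b | (b - 1)  -> 0x0f  (keep lanes 0-3)
+ *     !after: b - 1        -> 0x07  (keep lanes 0-2)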
+ */
+static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
+                        bool brk, bool after)
+{
+    uint64_t b;
+
+    if (brk) {
+        b = 0;
+    } else if ((g & n) == 0) {
+        /* For all G, no N are set; break not found.  */
+        b = g;
+    } else {
+        /* Break somewhere in N.  Locate it.  */
+        b = g & n;            /* guard true, pred true */
+        b = b & -b;           /* first such */
+        if (after) {
+            b = b | (b - 1);  /* break after same */
+        } else {
+            b = b - 1;        /* break before same */
+        }
+        brk = true;
+    }
+
+    *retb = b;
+    return brk;
+}
+
+/* Compute a zeroing BRK.  */
+static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
+                          intptr_t oprsz, bool after)
+{
+    bool brk = false;
+    intptr_t i;
+
+    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+        uint64_t this_b, this_g = g[i];
+
+        brk = compute_brk(&this_b, n[i], this_g, brk, after);
+        d[i] = this_b & this_g;
+    }
+}
+
+/* Likewise, but also compute flags.  */
+static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
+                               intptr_t oprsz, bool after)
+{
+    uint32_t flags = PREDTEST_INIT;
+    bool brk = false;
+    intptr_t i;
+
+    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+        uint64_t this_b, this_d, this_g = g[i];
+
+        brk = compute_brk(&this_b, n[i], this_g, brk, after);
+        d[i] = this_d = this_b & this_g;
+        flags = iter_predtest_fwd(this_d, this_g, flags);
+    }
+    return flags;
+}
+
+/* Compute a merging BRK.  */
+static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
+                          intptr_t oprsz, bool after)
+{
+    bool brk = false;
+    intptr_t i;
+
+    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+        uint64_t this_b, this_g = g[i];
+
+        brk = compute_brk(&this_b, n[i], this_g, brk, after);
+        d[i] = (this_b & this_g) | (d[i] & ~this_g);
+    }
+}
+
+/* Likewise, but also compute flags.  */
+static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
+                               intptr_t oprsz, bool after)
+{
+    uint32_t flags = PREDTEST_INIT;
+    bool brk = false;
+    intptr_t i;
+
+    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+        uint64_t this_b, this_d = d[i], this_g = g[i];
+
+        brk = compute_brk(&this_b, n[i], this_g, brk, after);
+        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
+        flags = iter_predtest_fwd(this_d, this_g, flags);
+    }
+    return flags;
+}
+
+static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
+{
+    /* It is quicker to zero the whole predicate than loop on OPRSZ.
+     * The compiler should turn this into 4 64-bit integer stores.
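+     * (ARMPredicateReg is sized for the 2048-bit maximum vector, i.e.
+     * a 256-bit predicate, which is exactly four uint64_t.)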
+ */ + memset(d, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; +} + +void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, true); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, true); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, false); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, false); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_z(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_z(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_z(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_z(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_m(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_m(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_m(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_m(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + + if (!last_active_pred(vn, vg, oprsz)) { + do_zero(vd, oprsz); + } +} + +/* As if PredTest(Ones(PL), D, esz). 
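+ * With every bit of the governing mask active, that means: N is set if
+ * the first element of D is true, Z if no element of D is true, and C
+ * if the last element of D is false; V is always zero.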
*/ +static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, + uint64_t esz_mask) +{ + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], esz_mask, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); + } + return flags; +} + +uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + + if (last_active_pred(vn, vg, oprsz)) { + return predtest_ones(vd, oprsz, -1); + } else { + return do_zero(vd, oprsz); + } +} + +uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t t = n[i] & g[i] & mask; + sum += ctpop64(t); + } + return sum; +} + +uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + uint32_t flags; + intptr_t i; + + /* Begin with a zero predicate register. */ + flags = do_zero(d, oprsz); + if (count == 0) { + return flags; + } + + /* Scale from predicate element count to bits. */ + count <<= esz; + /* Bound to the bits in the predicate. */ + count = MIN(count, oprsz * 8); + + /* Set all of the requested bits. */ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } + + return predtest_ones(d, oprsz, esz_mask); +} diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h index dd9c09f..63d958c 100644 --- a/target/arm/translate-a64.h +++ b/target/arm/translate-a64.h @@ -67,18 +67,26 @@ static inline void assert_fp_access_checked(DisasContext *s) static inline int vec_reg_offset(DisasContext *s, int regno, int element, TCGMemOp size) { - int offs = 0; + int element_size = 1 << size; + int offs = element * element_size; #ifdef HOST_WORDS_BIGENDIAN /* This is complicated slightly because vfp.zregs[n].d[0] is - * still the low half and vfp.zregs[n].d[1] the high half - * of the 128 bit vector, even on big endian systems. - * Calculate the offset assuming a fully bigendian 128 bits, - * then XOR to account for the order of the two 64 bit halves. + * still the lowest and vfp.zregs[n].d[15] the highest of the + * 256 byte vector, even on big endian systems. + * + * Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + * + * For 16 byte elements, the two 8 byte halves will not form a + * host int128 if the host is bigendian, since they're in the + * wrong order. However the only 16 byte operation we have is + * a move, so we can ignore this for the moment. More complicated + * operations will have to special case loading and storing from + * the zregs array. 
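+     *
+     * E.g. for a 16-bit element at index 3: offs = 6 in little-endian
+     * terms, and on a big-endian host 6 ^ (8 - 2) = 0, the byte offset
+     * of that element within the uint64_t holding elements 0 through 3.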
*/ - offs += (16 - ((element + 1) * (1 << size))); - offs ^= 8; -#else - offs += element * (1 << size); + if (element_size < 8) { + offs ^= 8 - element_size; + } #endif offs += offsetof(CPUARMState, vfp.zregs[regno]); assert_fp_access_checked(s); diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index c48d4b5..226c975 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -33,6 +33,15 @@ #include "trace-tcg.h" #include "translate-a64.h" + +typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, + TCGv_i64, uint32_t, uint32_t); + +typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + /* * Helpers for extracting complex instruction fields. */ @@ -68,6 +77,11 @@ static inline int expand_imm_sh8s(int x) return (int8_t)x << (x & 0x100 ? 8 : 0); } +static inline int expand_imm_sh8u(int x) +{ + return (uint8_t)x << (x & 0x100 ? 8 : 0); +} + /* * Include the generated decoder. */ @@ -373,6 +387,8 @@ static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) return do_zpzz_ool(s, a, fns[a->esz]); } +DO_ZPZZ(SEL, sel) + #undef DO_ZPZZ /* @@ -1957,6 +1973,1448 @@ static bool trans_EXT(DisasContext *s, arg_EXT *a, uint32_t insn) } /* + *** SVE Permute - Unpredicated Group + */ + +static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a, uint32_t insn) +{ + if ((a->imm & 0x1f) == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned dofs = vec_full_reg_offset(s, a->rd); + unsigned esz, index; + + esz = ctz32(a->imm); + index = a->imm >> (esz + 1); + + if ((index << esz) < vsz) { + unsigned nofs = vec_reg_offset(s, a->rn, index, esz); + tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); + } else { + tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0); + } + } + return true; +} + +static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) +{ + typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_insr * const fns[4] = { + gen_helper_sve_insr_b, gen_helper_sve_insr_h, + gen_helper_sve_insr_s, gen_helper_sve_insr_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + + fns[a->esz](t_zd, t_zn, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_i32(desc); +} + +static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 t = tcg_temp_new_i64(); + tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); + do_insr_i64(s, a, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_insr_i64(s, a, cpu_reg(s, a->rm)); + } + return true; +} + +static bool trans_REV_v(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4] = { + gen_helper_sve_rev_b, gen_helper_sve_rev_h, + gen_helper_sve_rev_s, 
gen_helper_sve_rev_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_TBL(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, + gen_helper_sve_tbl_s, gen_helper_sve_tbl_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_UNPK(DisasContext *s, arg_UNPK *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, + { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, + { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + + (a->h ? vsz / 2 : 0), + vsz, vsz, 0, fns[a->esz][a->u]); + } + return true; +} + +/* + *** SVE Permute - Predicates Group + */ + +static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, + gen_helper_gvec_3 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. */ + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_m, t_desc); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + tcg_temp_free_ptr(t_m); + tcg_temp_free_i32(t_desc); + return true; +} + +static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, + gen_helper_gvec_2 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. 
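+       For example, with vq = 1 the predicate is only 2 bytes, which
+       simd_desc cannot encode; the raw vsz - 2 value is recovered on
+       the helper side as extract32(desc, 0, SIMD_OPRSZ_BITS) + 2.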
*/ + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + return true; +} + +static bool trans_ZIP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_zip_p); +} + +static bool trans_ZIP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_zip_p); +} + +static bool trans_UZP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_uzp_p); +} + +static bool trans_UZP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_uzp_p); +} + +static bool trans_TRN1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_trn_p); +} + +static bool trans_TRN2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_trn_p); +} + +static bool trans_REV_p(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_rev_p); +} + +static bool trans_PUNPKLO(DisasContext *s, arg_PUNPKLO *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_punpk_p); +} + +static bool trans_PUNPKHI(DisasContext *s, arg_PUNPKHI *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 1, gen_helper_sve_punpk_p); +} + +/* + *** SVE Permute - Interleaving Group + */ + +static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_zip_b, gen_helper_sve_zip_h, + gen_helper_sve_zip_s, gen_helper_sve_zip_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned high_ofs = high ? 
vsz / 2 : 0; + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + high_ofs, + vec_full_reg_offset(s, a->rm) + high_ofs, + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool do_zzz_data_ool(DisasContext *s, arg_rrr_esz *a, int data, + gen_helper_gvec_3 *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, data, fn); + } + return true; +} + +static bool trans_ZIP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, false); +} + +static bool trans_ZIP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, true); +} + +static gen_helper_gvec_3 * const uzp_fns[4] = { + gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, + gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, +}; + +static bool trans_UZP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, uzp_fns[a->esz]); +} + +static bool trans_UZP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, uzp_fns[a->esz]); +} + +static gen_helper_gvec_3 * const trn_fns[4] = { + gen_helper_sve_trn_b, gen_helper_sve_trn_h, + gen_helper_sve_trn_s, gen_helper_sve_trn_d, +}; + +static bool trans_TRN1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, trn_fns[a->esz]); +} + +static bool trans_TRN2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, trn_fns[a->esz]); +} + +/* + *** SVE Permute Vector - Predicated Group + */ + +static bool trans_COMPACT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +/* Call the helper that computes the ARM LastActiveElement pseudocode + * function, scaled by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) +{ + /* Predicate sizes may be smaller and cannot use simd_desc. We cannot + * round up, as we do elsewhere, because we need the exact size. + */ + TCGv_ptr t_p = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + unsigned vsz = pred_full_reg_size(s); + unsigned desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_last_active_element(ret, t_p, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_p); +} + +/* Increment LAST to the offset of the next element in the vector, + * wrapping around to 0. + */ +static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + tcg_gen_addi_i32(last, last, 1 << esz); + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* If LAST < 0, set LAST to the offset of the last element in the vector. 
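+ * This provides the wrap-around for LASTA/LASTB: e.g. with vsz = 48
+ * and esz = 2, the not-found value -4 becomes 44, the offset of the
+ * final 32-bit element.  For power-of-2 sizes the AND alone suffices.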
*/ +static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz - (1 << esz)); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* Load an unsigned element of ESZ from BASE+OFS. */ +static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) +{ + TCGv_i64 r = tcg_temp_new_i64(); + + switch (esz) { + case 0: + tcg_gen_ld8u_i64(r, base, ofs); + break; + case 1: + tcg_gen_ld16u_i64(r, base, ofs); + break; + case 2: + tcg_gen_ld32u_i64(r, base, ofs); + break; + case 3: + tcg_gen_ld_i64(r, base, ofs); + break; + default: + g_assert_not_reached(); + } + return r; +} + +/* Load an unsigned element of ESZ from RM[LAST]. */ +static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, + int rm, int esz) +{ + TCGv_ptr p = tcg_temp_new_ptr(); + TCGv_i64 r; + + /* Convert offset into vector into offset into ENV. + * The final adjustment for the vector register base + * is added via constant offset to the load. + */ +#ifdef HOST_WORDS_BIGENDIAN + /* Adjust for element ordering. See vec_reg_offset. */ + if (esz < 3) { + tcg_gen_xori_i32(last, last, 8 - (1 << esz)); + } +#endif + tcg_gen_ext_i32_ptr(p, last); + tcg_gen_add_ptr(p, p, cpu_env); + + r = load_esz(p, vec_full_reg_offset(s, rm), esz); + tcg_temp_free_ptr(p); + + return r; +} + +/* Compute CLAST for a Zreg. */ +static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) +{ + TCGv_i32 last; + TCGLabel *over; + TCGv_i64 ele; + unsigned vsz, esz = a->esz; + + if (!sve_access_check(s)) { + return true; + } + + last = tcg_temp_local_new_i32(); + over = gen_new_label(); + + find_last_active(s, last, esz, a->pg); + + /* There is of course no movcond for a 2048-bit vector, + * so we must branch over the actual store. + */ + tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); + + if (!before) { + incr_last_active(s, last, esz); + } + + ele = load_last_active(s, last, a->rm, esz); + tcg_temp_free_i32(last); + + vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); + tcg_temp_free_i64(ele); + + /* If this insn used MOVPRFX, we may need a second move. */ + if (a->rd != a->rn) { + TCGLabel *done = gen_new_label(); + tcg_gen_br(done); + + gen_set_label(over); + do_mov_z(s, a->rd, a->rn); + + gen_set_label(done); + } else { + gen_set_label(over); + } + return true; +} + +static bool trans_CLASTA_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, false); +} + +static bool trans_CLASTB_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, true); +} + +/* Compute CLAST for a scalar. */ +static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, + bool before, TCGv_i64 reg_val) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ele, cmp, zero; + + find_last_active(s, last, esz, pg); + + /* Extend the original value of last prior to incrementing. */ + cmp = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(cmp, last); + + if (!before) { + incr_last_active(s, last, esz); + } + + /* The conceit here is that while last < 0 indicates not found, after + * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address + * from which we can load garbage. We then discard the garbage with + * a conditional move. 
+ */ + ele = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + + zero = tcg_const_i64(0); + tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, zero, ele, reg_val); + + tcg_temp_free_i64(zero); + tcg_temp_free_i64(cmp); + tcg_temp_free_i64(ele); +} + +/* Compute CLAST for a Vreg. */ +static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + int esz = a->esz; + int ofs = vec_reg_offset(s, a->rd, 0, esz); + TCGv_i64 reg = load_esz(cpu_env, ofs, esz); + + do_clast_scalar(s, esz, a->pg, a->rn, before, reg); + write_fp_dreg(s, a->rd, reg); + tcg_temp_free_i64(reg); + } + return true; +} + +static bool trans_CLASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, false); +} + +static bool trans_CLASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, true); +} + +/* Compute CLAST for a Xreg. */ +static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + TCGv_i64 reg; + + if (!sve_access_check(s)) { + return true; + } + + reg = cpu_reg(s, a->rd); + switch (a->esz) { + case 0: + tcg_gen_ext8u_i64(reg, reg); + break; + case 1: + tcg_gen_ext16u_i64(reg, reg); + break; + case 2: + tcg_gen_ext32u_i64(reg, reg); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); + return true; +} + +static bool trans_CLASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, false); +} + +static bool trans_CLASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, true); +} + +/* Compute LAST for a scalar. */ +static TCGv_i64 do_last_scalar(DisasContext *s, int esz, + int pg, int rm, bool before) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ret; + + find_last_active(s, last, esz, pg); + if (before) { + wrap_last_active(s, last, esz); + } else { + incr_last_active(s, last, esz); + } + + ret = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + return ret; +} + +/* Compute LAST for a Vreg. */ +static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + write_fp_dreg(s, a->rd, val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, false); +} + +static bool trans_LASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, true); +} + +/* Compute LAST for a Xreg. 
*/ +static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + tcg_gen_mov_i64(cpu_reg(s, a->rd), val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, false); +} + +static bool trans_LASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, true); +} + +static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int ofs = vec_reg_offset(s, a->rn, 0, a->esz); + TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_REVB(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + gen_helper_sve_revb_h, + gen_helper_sve_revb_s, + gen_helper_sve_revb_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVH(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + NULL, + gen_helper_sve_revh_s, + gen_helper_sve_revh_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVW(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ool(s, a, a->esz == 3 ? gen_helper_sve_revw_d : NULL); +} + +static bool trans_RBIT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_rbit_b, + gen_helper_sve_rbit_h, + gen_helper_sve_rbit_s, + gen_helper_sve_rbit_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_SPLICE(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, a->esz, gen_helper_sve_splice); + } + return true; +} + +/* + *** SVE Integer Compare - Vectors Group + */ + +static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_flags_4 *gen_fn) +{ + TCGv_ptr pd, zn, zm, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, 0)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + zm = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, zm, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZZ(NAME, name) \ +static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ + gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ + 
}; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZZ(CMPEQ, cmpeq) +DO_PPZZ(CMPNE, cmpne) +DO_PPZZ(CMPGT, cmpgt) +DO_PPZZ(CMPGE, cmpge) +DO_PPZZ(CMPHI, cmphi) +DO_PPZZ(CMPHS, cmphs) + +#undef DO_PPZZ + +#define DO_PPZW(NAME, name) \ +static bool trans_##NAME##_ppzw(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ + gen_helper_sve_##name##_ppzw_s, NULL \ + }; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZW(CMPEQ, cmpeq) +DO_PPZW(CMPNE, cmpne) +DO_PPZW(CMPGT, cmpgt) +DO_PPZW(CMPGE, cmpge) +DO_PPZW(CMPHI, cmphi) +DO_PPZW(CMPHS, cmphs) +DO_PPZW(CMPLT, cmplt) +DO_PPZW(CMPLE, cmple) +DO_PPZW(CMPLO, cmplo) +DO_PPZW(CMPLS, cmpls) + +#undef DO_PPZW + +/* + *** SVE Integer Compare - Immediate Groups + */ + +static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, + gen_helper_gvec_flags_3 *gen_fn) +{ + TCGv_ptr pd, zn, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, a->imm)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZI(NAME, name) \ +static bool trans_##NAME##_ppzi(DisasContext *s, arg_rpri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_3 * const fns[4] = { \ + gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ + gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ + }; \ + return do_ppzi_flags(s, a, fns[a->esz]); \ +} + +DO_PPZI(CMPEQ, cmpeq) +DO_PPZI(CMPNE, cmpne) +DO_PPZI(CMPGT, cmpgt) +DO_PPZI(CMPGE, cmpge) +DO_PPZI(CMPHI, cmphi) +DO_PPZI(CMPHS, cmphs) +DO_PPZI(CMPLT, cmplt) +DO_PPZI(CMPLE, cmple) +DO_PPZI(CMPLO, cmplo) +DO_PPZI(CMPLS, cmpls) + +#undef DO_PPZI + +/* + *** SVE Partition Break Group + */ + +static bool do_brk3(DisasContext *s, arg_rprr_s *a, + gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr m = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, m, g, t); + do_pred_flags(t); + } else { + fn(d, n, m, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(m); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool do_brk2(DisasContext *s, arg_rpr_s *a, + gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. 
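+       Note that t carries the descriptor in and, for the flag-setting
+       variants, carries the NZCV result back out to do_pred_flags.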
*/ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, g, t); + do_pred_flags(t); + } else { + fn(d, n, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool trans_BRKPA(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpa, gen_helper_sve_brkpas); +} + +static bool trans_BRKPB(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpb, gen_helper_sve_brkpbs); +} + +static bool trans_BRKA_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_m, gen_helper_sve_brkas_m); +} + +static bool trans_BRKB_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m); +} + +static bool trans_BRKA_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_z, gen_helper_sve_brkas_z); +} + +static bool trans_BRKB_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z); +} + +static bool trans_BRKN(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkn, gen_helper_sve_brkns); +} + +/* + *** SVE Predicate Count Group + */ + +static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) +{ + unsigned psz = pred_full_reg_size(s); + + if (psz <= 8) { + uint64_t psz_mask; + + tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); + if (pn != pg) { + TCGv_i64 g = tcg_temp_new_i64(); + tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_and_i64(val, val, g); + tcg_temp_free_i64(g); + } + + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. 
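+         * E.g. with psz = 2 and esz = 1, the constant folds from
+         * 0x5555555555555555 down to 0x5555.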
+ */ + psz_mask = MAKE_64BIT_MASK(0, psz * 8); + tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); + + tcg_gen_ctpop_i64(val, val); + } else { + TCGv_ptr t_pn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + unsigned desc; + TCGv_i32 t_desc; + + desc = psz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_cntp(val, t_pn, t_pg, t_desc); + tcg_temp_free_ptr(t_pn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(t_desc); + } +} + +static bool trans_CNTP(DisasContext *s, arg_CNTP *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); + } + return true; +} + +static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + if (a->d) { + tcg_gen_sub_i64(reg, reg, val); + } else { + tcg_gen_add_i64(reg, reg, val); + } + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 val = tcg_temp_new_i64(); + GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; + + do_cntp(s, val, a->esz, a->pg, a->pg); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), val, vsz, vsz); + } + return true; +} + +static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_32(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_64(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_temp_new_i64(); + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); + } + return true; +} + +/* + *** SVE Integer Compare Scalars Group + */ + +static bool trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); + TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); + TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); + TCGv_i64 cmp = tcg_temp_new_i64(); + + tcg_gen_setcond_i64(cond, cmp, rn, rm); + tcg_gen_extrl_i64_i32(cpu_NF, cmp); + tcg_temp_free_i64(cmp); + + /* VF = !NF & !CF. */ + tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); + + /* Both NF and VF actually look at bit 31. 
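+     * The setcond result is 0 or 1, so negating it turns 1 into
+     * 0xffffffff and places the truth value in bit 31 for both.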
*/ + tcg_gen_neg_i32(cpu_NF, cpu_NF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + return true; +} + +static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1); + TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1); + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i32 t2, t3; + TCGv_ptr ptr; + unsigned desc, vsz = vec_full_reg_size(s); + TCGCond cond; + + if (!a->sf) { + if (a->u) { + tcg_gen_ext32u_i64(op0, op0); + tcg_gen_ext32u_i64(op1, op1); + } else { + tcg_gen_ext32s_i64(op0, op0); + tcg_gen_ext32s_i64(op1, op1); + } + } + + /* For the helper, compress the different conditions into a computation + * of how many iterations for which the condition is true. + * + * This is slightly complicated by 0 <= UINT64_MAX, which is nominally + * 2**64 iterations, overflowing to 0. Of course, predicate registers + * aren't that large, so any value >= predicate size is sufficient. + */ + tcg_gen_sub_i64(t0, op1, op0); + + /* t0 = MIN(op1 - op0, vsz). */ + tcg_gen_movi_i64(t1, vsz); + tcg_gen_umin_i64(t0, t0, t1); + if (a->eq) { + /* Equality means one more iteration. */ + tcg_gen_addi_i64(t0, t0, 1); + } + + /* t0 = (condition true ? t0 : 0). */ + cond = (a->u + ? (a->eq ? TCG_COND_LEU : TCG_COND_LTU) + : (a->eq ? TCG_COND_LE : TCG_COND_LT)); + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, t0); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); + + desc = (vsz / 8) - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + t3 = tcg_const_i32(desc); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + gen_helper_sve_while(t2, ptr, t2, t3); + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t3); + return true; +} + +/* + *** SVE Integer Wide Immediate - Unpredicated Group + */ + +static bool trans_FDUP(DisasContext *s, arg_FDUP *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + uint64_t imm; + + /* Decode the VFP immediate. 
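+         * E.g. for esz = MO_32, imm8 0x70 expands to 0x3f800000 (1.0f),
+         * which dup_const then replicates to 0x3f8000003f800000.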
*/ + imm = vfp_expand_imm(a->esz, a->imm); + imm = dup_const(a->esz, imm); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm); + } + return true; +} + +static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm)); + } + return true; +} + +static bool trans_ADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_addi(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + a->imm = -a->imm; + return trans_ADD_zzi(s, a, insn); +} + +static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + static const GVecGen2s op[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_b, + .opc = INDEX_op_sub_vec, + .vece = MO_8, + .scalar_first = true }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_h, + .opc = INDEX_op_sub_vec, + .vece = MO_16, + .scalar_first = true }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_s, + .opc = INDEX_op_sub_vec, + .vece = MO_32, + .scalar_first = true }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_d, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64, + .scalar_first = true } + }; + + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, c, &op[a->esz]); + tcg_temp_free_i64(c); + } + return true; +} + +static bool trans_MUL_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_muli(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, uint32_t insn, + bool u, bool d) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_const_i64(a->imm); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, u, d); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_SQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, false); +} + +static bool trans_UQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, false); +} + +static bool trans_SQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, true); +} + +static bool trans_UQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, true); +} + +static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + c, vsz, vsz, 0, 
fn); + tcg_temp_free_i64(c); + } + return true; +} + +#define DO_ZZI(NAME, name) \ +static bool trans_##NAME##_zzi(DisasContext *s, arg_rri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_2i * const fns[4] = { \ + gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ + gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ + }; \ + return do_zzi_ool(s, a, fns[a->esz]); \ +} + +DO_ZZI(SMAX, smax) +DO_ZZI(UMAX, umax) +DO_ZZI(SMIN, smin) +DO_ZZI(UMIN, umin) + +#undef DO_ZZI + +/* + *** SVE Floating Point Arithmetic - Unpredicated Group + */ + +static bool do_zzz_fp(DisasContext *s, arg_rrr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + + +#define DO_FP3(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_3_ptr * const fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + return do_zzz_fp(s, a, fns[a->esz]); \ +} + +DO_FP3(FADD_zzz, fadd) +DO_FP3(FSUB_zzz, fsub) +DO_FP3(FMUL_zzz, fmul) +DO_FP3(FTSMUL, ftsmul) +DO_FP3(FRECPS, recps) +DO_FP3(FRSQRTS, rsqrts) + +#undef DO_FP3 + +/* *** SVE Memory - 32-bit Gather and Unsized Contiguous Group */ diff --git a/target/arm/translate.c b/target/arm/translate.c index 0ff5edf..f405c82 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -9965,7 +9965,8 @@ static bool thumb_insn_is_16bit(DisasContext *s, uint32_t insn) * end up actually treating this as two 16-bit insns, though, * if it's half of a bl/blx pair that might span a page boundary. */ - if (arm_dc_feature(s, ARM_FEATURE_THUMB2)) { + if (arm_dc_feature(s, ARM_FEATURE_THUMB2) || + arm_dc_feature(s, ARM_FEATURE_M)) { /* Thumb2 cores (including all M profile ones) always treat * 32-bit insns as 32-bit. */ @@ -10085,10 +10086,38 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) int conds; int logic_cc; - /* The only 32 bit insn that's allowed for Thumb1 is the combined - * BL/BLX prefix and suffix. + /* + * ARMv6-M supports a limited subset of Thumb2 instructions. + * Other Thumb1 architectures allow only 32-bit + * combined BL/BLX prefix and suffix. */ - if ((insn & 0xf800e800) != 0xf000e800) { + if (arm_dc_feature(s, ARM_FEATURE_M) && + !arm_dc_feature(s, ARM_FEATURE_V7)) { + int i; + bool found = false; + const uint32_t armv6m_insn[] = {0xf3808000 /* msr */, + 0xf3b08040 /* dsb */, + 0xf3b08050 /* dmb */, + 0xf3b08060 /* isb */, + 0xf3e08000 /* mrs */, + 0xf000d000 /* bl */}; + const uint32_t armv6m_mask[] = {0xffe0d000, + 0xfff0d0f0, + 0xfff0d0f0, + 0xfff0d0f0, + 0xffe0d000, + 0xf800d000}; + + for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) { + if ((insn & armv6m_mask[i]) == armv6m_insn[i]) { + found = true; + break; + } + } + if (!found) { + goto illegal_op; + } + } else if ((insn & 0xf800e800) != 0xf000e800) { ARCH(6T2); } @@ -11009,7 +11038,11 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) } break; case 3: /* Special control operations. 
*/ - ARCH(7); + if (!arm_dc_feature(s, ARM_FEATURE_V7) && + !(arm_dc_feature(s, ARM_FEATURE_V6) && + arm_dc_feature(s, ARM_FEATURE_M))) { + goto illegal_op; + } op = (insn >> 4) & 0xf; switch (op) { case 2: /* clrex */ diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index 25e209d..f504dd5 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -426,3 +426,72 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, } clear_tail(d, opr_sz, simd_maxsz(desc)); } + +/* Floating-point trigonometric starting value. + * See the ARM ARM pseudocode function FPTrigSMul. + */ +static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) +{ + float16 result = float16_mul(op1, op1, stat); + if (!float16_is_any_nan(result)) { + result = float16_set_sign(result, op2 & 1); + } + return result; +} + +static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) +{ + float32 result = float32_mul(op1, op1, stat); + if (!float32_is_any_nan(result)) { + result = float32_set_sign(result, op2 & 1); + } + return result; +} + +static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) +{ + float64 result = float64_mul(op1, op1, stat); + if (!float64_is_any_nan(result)) { + result = float64_set_sign(result, op2 & 1); + } + return result; +} + +#define DO_3OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], m[i], stat); \ + } \ +} + +DO_3OP(gvec_fadd_h, float16_add, float16) +DO_3OP(gvec_fadd_s, float32_add, float32) +DO_3OP(gvec_fadd_d, float64_add, float64) + +DO_3OP(gvec_fsub_h, float16_sub, float16) +DO_3OP(gvec_fsub_s, float32_sub, float32) +DO_3OP(gvec_fsub_d, float64_sub, float64) + +DO_3OP(gvec_fmul_h, float16_mul, float16) +DO_3OP(gvec_fmul_s, float32_mul, float32) +DO_3OP(gvec_fmul_d, float64_mul, float64) + +DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) +DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) +DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) + +#ifdef TARGET_AARCH64 + +DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) +DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) +DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) + +DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) +DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) +DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) + +#endif +#undef DO_3OP diff --git a/target/microblaze/mmu.c b/target/microblaze/mmu.c index f4ceaea..fcf86b1 100644 --- a/target/microblaze/mmu.c +++ b/target/microblaze/mmu.c @@ -159,7 +159,6 @@ unsigned int mmu_translate(struct microblaze_mmu *mmu, lu->vaddr = tlb_tag; lu->paddr = tlb_rpn & mmu->c_addr_mask; - lu->paddr = tlb_rpn; lu->size = tlb_size; lu->err = ERR_HIT; lu->idx = i; diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c index 6c64946..78ca265 100644 --- a/target/microblaze/translate.c +++ b/target/microblaze/translate.c @@ -90,7 +90,6 @@ typedef struct DisasContext { uint32_t jmp_pc; int abort_at_next_insn; - int nr_nops; struct TranslationBlock *tb; int singlestep_enabled; } DisasContext; @@ -1576,17 +1575,12 @@ static inline void decode(DisasContext *dc, uint32_t ir) dc->ir = ir; LOG_DIS("%8.8x\t", dc->ir); - if (dc->ir) - dc->nr_nops = 0; - else { + if (ir == 0) { trap_illegal(dc, dc->cpu->env.pvr.regs[2] & PVR2_OPCODE_0x0_ILL_MASK); - - LOG_DIS("nr_nops=%d\t", dc->nr_nops); - dc->nr_nops++; - if (dc->nr_nops > 4) { 
-            cpu_abort(CPU(dc->cpu), "fetching nop sequence\n");
-        }
+        /* Don't decode nop/zero instructions any further.  */
+        return;
     }
+
     /* bit 2 seems to indicate insn type.  */
     dc->type_b = ir & (1 << 29);
@@ -1633,7 +1627,6 @@ void gen_intermediate_code(CPUState *cs, struct TranslationBlock *tb)
     dc->singlestep_enabled = cs->singlestep_enabled;
     dc->cpustate_changed = 0;
     dc->abort_at_next_insn = 0;
-    dc->nr_nops = 0;
     if (pc_start & 3) {
         cpu_abort(cs, "Microblaze: unaligned PC=%x\n", pc_start);
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index e10035a..cfdbccf 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -79,6 +79,7 @@ static S390CPUDef s390_cpu_defs[] = {
     CPUDEF_INIT(0x2964, 13, 2, 47, 0x08000000U, "z13.2", "IBM z13 GA2"),
     CPUDEF_INIT(0x2965, 13, 2, 47, 0x08000000U, "z13s", "IBM z13s GA1"),
     CPUDEF_INIT(0x3906, 14, 1, 47, 0x08000000U, "z14", "IBM z14 GA1"),
+    CPUDEF_INIT(0x3907, 14, 1, 47, 0x08000000U, "z14ZR1", "IBM z14 Model ZR1 GA1"),
 };

 #define QEMU_MAX_CPU_TYPE 0x2827