diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/add-max.ll | 151 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/bf16.ll | 218 |
2 files changed, 168 insertions, 201 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll index b3a7057..c551375 100644 --- a/llvm/test/CodeGen/AMDGPU/add-max.ll +++ b/llvm/test/CodeGen/AMDGPU/add-max.ll @@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { -; SDAG-LABEL: add_max_u32_ssv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_u32_ssv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: v_max_u32_e32 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add i32 %a, %b +; GCN-LABEL: add_max_u32_ssv: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { -; GCN-LABEL: add_max_u32_sss: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_co_i32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_max_u32 s0, s0, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b +; SDAG-LABEL: add_max_u32_sss: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_max_u32_e32 v0, s2, v0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: add_max_u32_sss: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0 +; GISEL-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 4) %ret = bitcast i32 %max to float ret float %ret @@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 100) %ret = bitcast i32 %max to float ret float %ret } -define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { -; SDAG-LABEL: add_max_u32_slv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_u32_slv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_addk_co_i32 s0, 0x64 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: v_max_u32_e32 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add i32 %a, 100 - %max = call i32 @llvm.umax.i32(i32 %add, i32 %b) +define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) { +; GCN-LABEL: add_max_u32_slv: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) + %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } @@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.smax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umin.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.smin.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret } define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) { -; SDAG-LABEL: add_max_v2u16_ssv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_v2u16_ssv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s2, s0, 16 -; GISEL-NEXT: s_lshr_b32 s3, s1, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_add_co_i32 s2, s2, s3 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GISEL-NEXT: v_pk_max_u16 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b +; GCN-LABEL: add_max_v2u16_ssv: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) { ; SDAG-LABEL: add_max_v2u16_sss: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 +; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_pk_max_u16 v0, v0, s2 ; SDAG-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: add_max_v2u16_sss: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s3, s0, 16 -; GISEL-NEXT: s_lshr_b32 s4, s1, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_add_co_i32 s3, s3, s4 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GISEL-NEXT: s_and_b32 s3, s2, 0xffff -; GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GISEL-NEXT: s_lshr_b32 s2, s2, 16 -; GISEL-NEXT: s_max_u32 s0, s0, s3 -; GISEL-NEXT: s_max_u32 s1, s1, s2 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 ; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>) %ret = bitcast <2 x i16> %max to float ret float %ret } define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) { -; SDAG-LABEL: add_max_v2u16_slv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_v2u16_slv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064 -; GISEL-NEXT: s_addk_co_i32 s1, 0x64 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-NEXT: v_pk_max_u16 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, <i16 100, i16 100> +; GCN-LABEL: add_max_v2u16_slv: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 711d57b..30ad46d9 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX1250-NEXT: v_cls_i32_e32 v3, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i64 %x to bfloat @@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3 ; GFX1250-NEXT: v_cls_i32_e32 v6, v3 ; GFX1250-NEXT: v_cls_i32_e32 v7, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5 -; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4 +; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 ; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i64> %x to <2 x bfloat> @@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16: ; %bb.0: ; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5 +; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5 +; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5 ; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3 -; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5 ; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6 -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] -; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14 +; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14 +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX1250TRUE16-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250TRUE16-NEXT: v_min_u32_e32 v7, v10, v8 +; GFX1250TRUE16-NEXT: v_min_u32_e32 v8, v11, v9 +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1] ; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8 -; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1] -; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250TRUE16-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v4 ; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX1250TRUE16-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16: ; %bb.0: ; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX1250FAKE16-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX1250FAKE16-NEXT: v_cls_i32_e32 v6, v5 +; GFX1250FAKE16-NEXT: v_xor_b32_e32 v7, v2, v3 ; GFX1250FAKE16-NEXT: v_cls_i32_e32 v10, v3 -; GFX1250FAKE16-NEXT: v_cls_i32_e32 v9, v5 ; GFX1250FAKE16-NEXT: v_cls_i32_e32 v11, v1 -; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 -; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6 -; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14 +; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] -; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8 +; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v7, v10, v7 +; GFX1250FAKE16-NEXT: v_min_u32_e32 v6, v6, v8 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8 +; GFX1250FAKE16-NEXT: v_min_u32_e32 v9, v11, v9 +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 -; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] -; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8 -; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v9, v[0:1] +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250FAKE16-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6 +; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX1250FAKE16-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v6 ; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 @@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX1250-NEXT: v_cls_i32_e32 v9, v7 ; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7 -; GFX1250-NEXT: v_cls_i32_e32 v12, v7 -; GFX1250-NEXT: v_cls_i32_e32 v13, v5 -; GFX1250-NEXT: v_cls_i32_e32 v14, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8 -; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3 -; GFX1250-NEXT: v_cls_i32_e32 v15, v1 -; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 -; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8 -; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] -; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10 +; GFX1250-NEXT: v_cls_i32_e32 v10, v5 +; GFX1250-NEXT: v_xor_b32_e32 v14, v0, v1 +; GFX1250-NEXT: v_cls_i32_e32 v12, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11 -; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] -; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14 +; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v8, v9, v8 +; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v13 +; GFX1250-NEXT: v_cls_i32_e32 v13, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11 +; GFX1250-NEXT: v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14 +; GFX1250-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_min_u32_e32 v9, v12, v9 +; GFX1250-NEXT: v_min_u32_e32 v11, v13, v14 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v10, v[4:5] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3] +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1] +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 -; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9 +; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10 ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6 +; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v6 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11 ; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v3, v1 ; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5 |
