aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll98
-rw-r--r--llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/addsub64_carry.ll192
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll (renamed from llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll)374
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll322
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll470
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll488
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll201
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddsat.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll472
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll379
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll201
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubsat.ll54
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags_V1.ll18
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag_V1.ll19
-rw-r--r--llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir6
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_predicated_io/predicated_io_generic.ll36
-rw-r--r--llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll48
-rw-r--r--llvm/test/CodeGen/WebAssembly/saturating-truncation.ll87
-rw-r--r--llvm/test/CodeGen/X86/and-mask-variable.ll212
-rw-r--r--llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll36
-rw-r--r--llvm/test/Instrumentation/AllocToken/extralibfuncs.ll4
-rw-r--r--llvm/test/Instrumentation/AllocToken/nonlibcalls.ll4
-rw-r--r--llvm/test/Instrumentation/AllocToken/remark.ll4
-rw-r--r--llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll35
-rw-r--r--llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s95
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll9
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll9
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll9
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll9
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll195
-rw-r--r--llvm/test/Transforms/SimplifyCFG/merge-calls-alloc-token.ll6
-rw-r--r--llvm/test/tools/llvm-profgen/Inputs/coff-profile.exebin1629184 -> 1195520 bytes
-rw-r--r--llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript24
-rw-r--r--llvm/test/tools/llvm-profgen/coff-profile.test100
38 files changed, 2665 insertions, 1634 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index d721b73c..896603d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -70,12 +70,12 @@
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#
-# DEBUG-NEXT: G_ABDS (opcode 65): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_ABDS (opcode [[G_ABDS:[0-9]+]]): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
-# DEBUG-NEXT: G_ABDU (opcode 66): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: G_ABDU (opcode [[G_ABDU:[0-9]+]]): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode [[G_ABDU]] is aliased to [[G_ABDS]]
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 7cc5051..003aa04 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -8759,9 +8759,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8780,20 +8779,19 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB113_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v7, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -8827,10 +8825,9 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8856,11 +8853,11 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -8900,9 +8897,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
@@ -8918,18 +8914,17 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v4, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v3, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -8962,10 +8957,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -8988,7 +8982,6 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -17064,9 +17057,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17085,20 +17077,19 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s4
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
@@ -17131,10 +17122,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17158,11 +17148,11 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
@@ -17201,9 +17191,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
-; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17226,7 +17215,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
@@ -17262,10 +17250,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17286,7 +17273,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index c98fff9..34a4899 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -5804,9 +5804,8 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5839,10 +5838,9 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5880,9 +5878,8 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
@@ -5911,10 +5908,9 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
@@ -11573,9 +11569,8 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11609,10 +11604,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11651,9 +11645,8 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
-; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -11683,10 +11676,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index d326966..b72eba8 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -17,12 +17,9 @@ define %struct.uint96 @v_add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
; CHECK-LABEL: v_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; CHECK-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v0, v5
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, v6
; CHECK-NEXT: s_setpc_b64 s[30:31]
%sum64 = add i64 %val64A, %val64B
%obit = icmp ult i64 %sum64, %val64A
@@ -38,16 +35,14 @@ define <2 x i64> @v_uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_add_co_u32_e64 v4, s[4:5], v0, v4
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
-; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_addc_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -63,16 +58,14 @@ define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
-; CHECK-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4
-; CHECK-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
-; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -87,10 +80,9 @@ define i64 @v_uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -109,7 +101,6 @@ define i64 @v_uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
@@ -147,10 +138,9 @@ define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -167,10 +157,9 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -190,15 +179,13 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s6, s0, s2
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: s_addc_u32 s7, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; CHECK-NEXT: s_mov_b32 s0, s6
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
+; CHECK-NEXT: s_addc_u32 s1, s1, s3
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
-; CHECK-NEXT: s_mov_b32 s1, s7
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
%obit = icmp ult i64 %sum64, %val64A
@@ -212,24 +199,24 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s6, s2, s6
-; CHECK-NEXT: v_mov_b32_e32 v9, s3
-; CHECK-NEXT: s_addc_u32 s7, s3, s7
-; CHECK-NEXT: v_mov_b32_e32 v8, s2
-; CHECK-NEXT: s_add_u32 s4, s0, s4
-; CHECK-NEXT: v_mov_b32_e32 v7, s1
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
-; CHECK-NEXT: s_addc_u32 s5, s1, s5
-; CHECK-NEXT: v_mov_b32_e32 v6, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
-; CHECK-NEXT: v_readfirstlane_b32 s2, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; CHECK-NEXT: v_readfirstlane_b32 s0, v6
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: v_mov_b32_e32 v3, s5
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: s_add_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_add_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
+; CHECK-NEXT: s_addc_u32 s1, s1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v7
+; CHECK-NEXT: v_readfirstlane_b32 s2, v6
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -246,24 +233,24 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s6, s2, s6
-; CHECK-NEXT: v_mov_b32_e32 v9, s3
-; CHECK-NEXT: s_subb_u32 s7, s3, s7
-; CHECK-NEXT: v_mov_b32_e32 v8, s2
-; CHECK-NEXT: s_sub_u32 s4, s0, s4
-; CHECK-NEXT: v_mov_b32_e32 v7, s1
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
-; CHECK-NEXT: s_subb_u32 s5, s1, s5
-; CHECK-NEXT: v_mov_b32_e32 v6, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
-; CHECK-NEXT: v_readfirstlane_b32 s2, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; CHECK-NEXT: v_readfirstlane_b32 s0, v6
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: v_mov_b32_e32 v3, s5
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: s_sub_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_sub_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
+; CHECK-NEXT: s_subb_u32 s1, s1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v7
+; CHECK-NEXT: v_readfirstlane_b32 s2, v6
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -280,15 +267,15 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s2, s0, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_addc_u32 s3, s1, s3
+; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
+; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v5, s3
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v4, s2
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -305,10 +292,11 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -350,15 +338,15 @@ define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s2, s0, -1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_addc_u32 s3, s1, -1
+; CHECK-NEXT: s_sub_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v5, s3
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v4, s2
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -374,15 +362,15 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s2, s0, 1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_addc_u32 s3, s1, 0
+; CHECK-NEXT: s_sub_u32 s0, s0, -1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v5, s3
-; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v4, s2
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
index 2ad6e68..f730199 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
@@ -70,7 +70,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -118,7 +118,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg() {
define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -154,7 +154,7 @@ define void @func_uses_asm_physreg_agpr() {
define void @func_uses_asm_physreg_agpr_tuple() {
; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -168,7 +168,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -180,8 +180,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR1]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR10:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR29:[0-9]+]]
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
@@ -192,7 +192,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -204,8 +204,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR10]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR29]]
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
@@ -316,7 +316,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
@@ -342,7 +342,7 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -354,7 +354,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -378,7 +378,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -390,7 +390,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: [[DEF:%.*]] = call ptr asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -402,7 +402,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR5]] {
; CHECK-NEXT: [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -414,7 +414,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -426,7 +426,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
define amdgpu_kernel void @kernel_uses_asm_clobber() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -438,7 +438,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber() {
define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -450,7 +450,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -462,7 +462,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -474,7 +474,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -486,7 +486,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -498,7 +498,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -510,7 +510,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -522,7 +522,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
define amdgpu_kernel void @vreg_use_exceeds_register_file() {
; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -534,7 +534,7 @@ define amdgpu_kernel void @vreg_use_exceeds_register_file() {
define amdgpu_kernel void @vreg_def_exceeds_register_file() {
; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: [[DEF:%.*]] = call <257 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -546,7 +546,7 @@ define amdgpu_kernel void @vreg_def_exceeds_register_file() {
define amdgpu_kernel void @multiple() {
; CHECK-LABEL: define amdgpu_kernel void @multiple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -558,7 +558,7 @@ define amdgpu_kernel void @multiple() {
define amdgpu_kernel void @earlyclobber_0() {
; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call <8 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -570,7 +570,7 @@ define amdgpu_kernel void @earlyclobber_0() {
define amdgpu_kernel void @earlyclobber_1() {
; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR12:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -582,7 +582,7 @@ define amdgpu_kernel void @earlyclobber_1() {
define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR13:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -594,7 +594,7 @@ define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() {
define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR13]] {
; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i32>, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -606,7 +606,7 @@ define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() {
define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR14:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -618,7 +618,7 @@ define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() {
define amdgpu_kernel void @mixed_physreg_vreg_tuples_0() {
; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -630,7 +630,7 @@ define amdgpu_kernel void @mixed_physreg_vreg_tuples_0() {
define amdgpu_kernel void @mixed_physreg_vreg_tuples_1() {
; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -642,7 +642,7 @@ define amdgpu_kernel void @mixed_physreg_vreg_tuples_1() {
define amdgpu_kernel void @physreg_raises_limit() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_raises_limit(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR16:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -652,10 +652,9 @@ define amdgpu_kernel void @physreg_raises_limit() {
ret void
}
-; FIXME: This should require 9. We cannot allocate an a128 at a0.
define amdgpu_kernel void @physreg_tuple_alignment_raises_limit() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_tuple_alignment_raises_limit(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -667,7 +666,7 @@ define amdgpu_kernel void @physreg_tuple_alignment_raises_limit() {
define amdgpu_kernel void @align3_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align3_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR6]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -679,7 +678,7 @@ define amdgpu_kernel void @align3_virtreg() {
define amdgpu_kernel void @align3_align4_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align3_align4_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -691,7 +690,7 @@ define amdgpu_kernel void @align3_align4_virtreg() {
define amdgpu_kernel void @align2_align4_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align2_align4_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -703,7 +702,7 @@ define amdgpu_kernel void @align2_align4_virtreg() {
define amdgpu_kernel void @kernel_uses_write_register_a55() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_a55(
-; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR17:[0-9]+]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META0:![0-9]+]], i32 0)
; CHECK-NEXT: ret void
;
@@ -713,71 +712,313 @@ define amdgpu_kernel void @kernel_uses_write_register_a55() {
define amdgpu_kernel void @kernel_uses_write_register_v55() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_v55(
-; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META1:![0-9]+]], i32 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !1, i32 0)
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_write_register_a55_57() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_a55_57(
-; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-SAME: ) #[[ATTR18:[0-9]+]] {
; CHECK-NEXT: call void @llvm.write_register.i96(metadata [[META2:![0-9]+]], i96 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !2, i96 0)
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_register_a55(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_register_a55(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR19:[0-9]+]] {
; CHECK-NEXT: [[REG:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0]])
; CHECK-NEXT: store i32 [[REG]], ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i32 @llvm.read_register.i64(metadata !0)
store i32 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_volatile_register_a55(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_volatile_register_a55(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR19]] {
; CHECK-NEXT: [[REG:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META0]])
; CHECK-NEXT: store i32 [[REG]], ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i32 @llvm.read_volatile_register.i64(metadata !0)
store i32 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_register_a56_59(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_register_a56_59(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR20:[0-9]+]] {
; CHECK-NEXT: [[REG:%.*]] = call i128 @llvm.read_register.i128(metadata [[META3:![0-9]+]])
; CHECK-NEXT: store i128 [[REG]], ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i128 @llvm.read_register.i64(metadata !3)
store i128 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_write_register_out_of_bounds_a256() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_out_of_bounds_a256(
-; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META4:![0-9]+]], i32 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !4, i32 0)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_uses() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_uses(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i64 poison)
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ call void asm sideeffect "; use $0", "a"(i128 poison)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_defs() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_defs(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: [[TMP3:%.*]] = call i128 asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call i64 asm sideeffect "; def $0", "=a"()
+ call i32 asm sideeffect "; def $0", "=a"()
+ call i128 asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_use_defs() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_use_defs(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: [[TMP1:%.*]] = call i128 asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ call i128 asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_b() {
+; CHECK-LABEL: define void @callgraph_b(
+; CHECK-SAME: ) #[[ATTR15]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call <4 x i32> asm sideeffect "; def $0", "=a"()
+ call void asm sideeffect "; use $0", "a"(<8 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_c() {
+; CHECK-LABEL: define void @callgraph_c(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call i32 asm sideeffect "; def $0", "=a"()
+ call void asm sideeffect "; use $0", "a"(<2 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_a(i1 %cond) {
+; CHECK-LABEL: define void @callgraph_a(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR15]] {
+; CHECK-NEXT: br i1 [[COND]], label [[A:%.*]], label [[B:%.*]]
+; CHECK: a:
+; CHECK-NEXT: call void @callgraph_b()
+; CHECK-NEXT: ret void
+; CHECK: b:
+; CHECK-NEXT: call void @callgraph_c()
+; CHECK-NEXT: ret void
+;
+ br i1 %cond, label %a, label %b
+
+a:
+ call void @callgraph_b()
+ ret void
+
+b:
+ call void @callgraph_c()
+ ret void
+}
+
+
+define void @kernel_max_callgraph(i1 %cond) {
+; CHECK-LABEL: define void @kernel_max_callgraph(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR15]] {
+; CHECK-NEXT: call void @callgraph_a(i1 [[COND]])
+; CHECK-NEXT: ret void
+;
+ call void @callgraph_a(i1 %cond)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_all_virtregs() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_all_virtregs(
+; CHECK-SAME: ) #[[ATTR21:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a,a,a,a,a,a,a,a"(<32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_all_virtregs_plus_1() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_all_virtregs_plus_1(
+; CHECK-SAME: ) #[[ATTR21]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a,a,a,a,a,a,a,a,a"(<32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, i32 poison)
+ call void @use_most()
+ ret void
+}
+
+define void @recursive() {
+; CHECK-LABEL: define void @recursive(
+; CHECK-SAME: ) #[[ATTR22:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: call void @recursive()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<7 x i32> poison)
+ call void @use_most()
+ call void @recursive()
+ ret void
+}
+
+define void @indirect_0() {
+; CHECK-LABEL: define void @indirect_0(
+; CHECK-SAME: ) #[[ATTR22]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<7 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @indirect_1() {
+; CHECK-LABEL: define void @indirect_1(
+; CHECK-SAME: ) #[[ATTR23:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i32> asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call <3 x i32> asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @knowable_indirect_call(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @knowable_indirect_call(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR22]] {
+; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @indirect_0, ptr @indirect_1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @indirect_1
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: call void @indirect_1()
+; CHECK-NEXT: br label [[TMP6:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: br i1 true, label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK: 4:
+; CHECK-NEXT: call void @indirect_0()
+; CHECK-NEXT: br label [[TMP6]]
+; CHECK: 5:
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ %fptr = select i1 %cond, ptr @indirect_0, ptr @indirect_1
+ call void %fptr()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @calls_poison(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @calls_poison(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void poison()
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void poison()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @calls_null(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @calls_null(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void null()
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void null()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @indirect_unknown(ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @indirect_unknown(
+; CHECK-SAME: ptr [[FPTR:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[FPTR]]()
+; CHECK-NEXT: ret void
+;
+ call void %fptr()
ret void
}
attributes #0 = { "amdgpu-agpr-alloc"="0" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
!0 = !{!"a55"}
!1 = !{!"v55"}
@@ -787,16 +1028,35 @@ attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nounwind "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="1" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="2" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="4" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="6" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="5" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="14" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="256" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="32" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="9" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="64" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="49" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="33" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="13" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="56" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="58" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="56" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="60" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="256" "amdgpu-waves-per-eu"="1,1" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR22]] = { "amdgpu-agpr-alloc"="7" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR23]] = { "amdgpu-agpr-alloc"="3" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR24:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR25:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR26:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR27:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR28:[0-9]+]] = { nocallback nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR29]] = { "amdgpu-agpr-alloc"="0" }
;.
; CHECK: [[META0]] = !{!"a55"}
; CHECK: [[META1]] = !{!"v55"}
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 2ae6fc2..4a6fa4f 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -691,7 +691,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GCN-ISEL-LABEL: name: suaddo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: S_ADD_U64_PSEUDO
+; GCN-ISEL: S_UADDO_PSEUDO
+; GCN-ISEL: S_ADD_CO_PSEUDO
define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; CISI-LABEL: suaddo64:
@@ -700,21 +701,23 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s11, 0xf000
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
-; CISI-NEXT: s_add_u32 s6, s4, s6
-; CISI-NEXT: v_mov_b32_e32 v0, s4
-; CISI-NEXT: s_addc_u32 s7, s5, s7
-; CISI-NEXT: v_mov_b32_e32 v1, s5
-; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT: v_mov_b32_e32 v2, s6
+; CISI-NEXT: s_add_u32 s4, s4, s6
+; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
+; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
+; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
+; CISI-NEXT: v_mov_b32_e32 v0, s4
+; CISI-NEXT: v_mov_b32_e32 v1, s5
+; CISI-NEXT: s_cselect_b64 s[4:5], -1, 0
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
-; CISI-NEXT: v_mov_b32_e32 v3, s7
-; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT: s_waitcnt expcnt(0)
+; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -722,37 +725,37 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
@@ -761,10 +764,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
-; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX1010-NEXT: global_store_byte v2, v3, s[10:11]
@@ -775,11 +780,13 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
@@ -790,11 +797,13 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
@@ -804,12 +813,13 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s6, s4, s6
-; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_add_u32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -819,12 +829,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-LABEL: suaddo64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[12:13], s[14:15]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX1250-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
@@ -841,7 +853,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vuaddo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_ADD_U64_PSEUDO
+; GCN-ISEL: V_ADD_CO_U32_e64
+; GCN-ISEL: V_ADDC_U32_e64
define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vuaddo64:
@@ -854,9 +867,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -876,7 +888,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -894,7 +905,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -909,8 +919,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -923,9 +932,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -938,9 +946,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -955,10 +962,9 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -969,16 +975,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1671,7 +1678,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GCN-ISEL-LABEL: name: susubo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: S_SUB_U64_PSEUDO
+; GCN-ISEL: S_USUBO_PSEUDO
+; GCN-ISEL: S_SUB_CO_PSEUDO
define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; CISI-LABEL: susubo64:
@@ -1680,21 +1688,23 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s11, 0xf000
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
-; CISI-NEXT: s_sub_u32 s6, s4, s6
-; CISI-NEXT: v_mov_b32_e32 v0, s4
-; CISI-NEXT: s_subb_u32 s7, s5, s7
-; CISI-NEXT: v_mov_b32_e32 v1, s5
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT: v_mov_b32_e32 v2, s6
+; CISI-NEXT: s_sub_u32 s4, s4, s6
+; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
+; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
+; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
+; CISI-NEXT: v_mov_b32_e32 v0, s4
+; CISI-NEXT: v_mov_b32_e32 v1, s5
+; CISI-NEXT: s_cselect_b64 s[4:5], -1, 0
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
-; CISI-NEXT: v_mov_b32_e32 v3, s7
-; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT: s_waitcnt expcnt(0)
+; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -1702,37 +1712,37 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
@@ -1741,10 +1751,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
-; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX1010-NEXT: global_store_byte v2, v3, s[10:11]
@@ -1755,11 +1767,13 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
@@ -1770,11 +1784,13 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
@@ -1784,12 +1800,13 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s6, s4, s6
-; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_sub_u32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1799,12 +1816,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-LABEL: susubo64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
@@ -1821,7 +1840,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vusubo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_SUB_U64_PSEUDO
+; GCN-ISEL: V_SUB_CO_U32_e64
+; GCN-ISEL: V_SUBB_U32_e64
define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vusubo64:
@@ -1834,9 +1854,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -1856,7 +1875,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1874,7 +1892,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -1889,8 +1906,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -1903,9 +1919,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -1918,9 +1933,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -1935,10 +1949,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -1949,16 +1962,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 697bcc3..5f6d622 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -206,8 +206,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
-; GCN-IR-NEXT: s_addc_u32 s19, s17, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16
@@ -217,9 +220,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s2, -1
; GCN-IR-NEXT: s_addc_u32 s19, s3, -1
; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s20
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT: s_add_u32 s14, s8, s20
+; GCN-IR-NEXT: s_addc_u32 s15, s9, 0
+; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -227,19 +230,22 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11]
; GCN-IR-NEXT: s_sub_u32 s8, s18, s16
; GCN-IR-NEXT: s_subb_u32 s8, s19, s17
-; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s8, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s16, s16, s14
-; GCN-IR-NEXT: s_subb_u32 s17, s17, s15
-; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
+; GCN-IR-NEXT: s_ashr_i32 s12, s8, 31
+; GCN-IR-NEXT: s_mov_b32 s13, s12
+; GCN-IR-NEXT: s_and_b32 s8, s12, 1
+; GCN-IR-NEXT: s_and_b64 s[20:21], s[12:13], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s16, s16, s20
+; GCN-IR-NEXT: s_subb_u32 s17, s17, s21
+; GCN-IR-NEXT: s_add_u32 s14, s14, 1
+; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
+; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
+; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow7
@@ -389,25 +395,25 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-LABEL: v_test_sdiv:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v3
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v1, v12, vcc
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v13
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v13
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v1, v10, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v11
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v11
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7
-; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11
+; GCN-IR-NEXT: v_min_u32_e32 v9, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v8, v9
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
@@ -416,70 +422,69 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v14, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v13
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v11
; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5]
; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v4, v10
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v11
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[6:7], v14
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v4, v8
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, v4, v9
+; GCN-IR-NEXT: v_addc_u32_e64 v17, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16
+; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-IR-NEXT: .LBB1_6: ; %Flow5
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v13, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v15, v14
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v11, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v13, v12
; GCN-IR-NEXT: v_xor_b32_e32 v3, v4, v0
; GCN-IR-NEXT: v_xor_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
@@ -1293,34 +1298,37 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s2, s2, s4
; GCN-IR-NEXT: s_subb_u32 s3, s3, s4
-; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3]
-; GCN-IR-NEXT: s_add_u32 s10, s14, 0xffffffc5
+; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3]
+; GCN-IR-NEXT: s_add_u32 s10, s16, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec
; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17]
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s10
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s12
-; GCN-IR-NEXT: s_add_u32 s16, s2, -1
-; GCN-IR-NEXT: s_addc_u32 s17, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s10, 58, s14
-; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT: s_add_u32 s14, s2, -1
+; GCN-IR-NEXT: s_addc_u32 s15, s3, -1
+; GCN-IR-NEXT: s_sub_u32 s16, 58, s16
+; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1328,19 +1336,22 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s6, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
-; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s6, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s6, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_and_b32 s6, s10, 1
+; GCN-IR-NEXT: s_and_b64 s[18:19], s[10:11], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
+; GCN-IR-NEXT: s_add_u32 s16, s16, 1
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
; GCN-IR-NEXT: .LBB10_4: ; %Flow6
@@ -1472,17 +1483,17 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-LABEL: v_test_sdiv_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1490,69 +1501,68 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v10
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB11_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, 58, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v15, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-IR-NEXT: .LBB11_6: ; %Flow5
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v11
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 24, %x
ret i64 %result
@@ -1665,17 +1675,17 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-LABEL: v_test_sdiv_pow2_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1684,70 +1694,69 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v10
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v6
+; GCN-IR-NEXT: v_subb_u32_e64 v15, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-IR-NEXT: .LBB12_6: ; %Flow5
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v11
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 32768, %x
ret i64 %result
@@ -1767,20 +1776,20 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-LABEL: v_test_sdiv_pow2_k_den_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v10, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v8
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v8, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4
; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0
; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5
-; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1
-; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v8
+; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1
+; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v6
; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v8
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
@@ -1790,61 +1799,60 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB13_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0
-; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0
+; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 0xffffffcf, v6
+; GCN-IR-NEXT: v_lshr_b64 v[4:5], v[4:5], v7
+; GCN-IR-NEXT: v_addc_u32_e64 v11, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
-; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s12, v6
+; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v3
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v2
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v0, v6, v0
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6
+; GCN-IR-NEXT: v_and_b32_e32 v6, 0x8000, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
+; GCN-IR-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10
+; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v3
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB13_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1
; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0
; GCN-IR-NEXT: .LBB13_6: ; %Flow5
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v10
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v11
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v8
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v9
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %x, 32768
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 465024a..33b0a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -170,35 +170,38 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7]
-; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3]
+; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3]
; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s18
+; GCN-IR-NEXT: s_sub_u32 s12, s10, s16
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s15, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14
-; GCN-IR-NEXT: s_add_u32 s16, s6, -1
-; GCN-IR-NEXT: s_addc_u32 s17, s7, -1
+; GCN-IR-NEXT: s_add_u32 s14, s6, -1
+; GCN-IR-NEXT: s_addc_u32 s15, s7, -1
; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s10, s4, s18
-; GCN-IR-NEXT: s_addc_u32 s11, s5, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT: s_add_u32 s16, s4, s16
+; GCN-IR-NEXT: s_addc_u32 s17, s5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -206,19 +209,22 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s4, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s4, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s4, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
-; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_and_b32 s4, s10, 1
+; GCN-IR-NEXT: s_and_b64 s[18:19], s[10:11], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
+; GCN-IR-NEXT: s_add_u32 s16, s16, 1
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow7
@@ -373,12 +379,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-LABEL: v_test_srem:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v14
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v14
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v4
; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4
; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -386,12 +392,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13
+; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -400,7 +406,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v12
; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5]
; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
@@ -408,54 +414,53 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v6, v12
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, v6, v11
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8
+; GCN-IR-NEXT: v_addc_u32_e64 v17, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
+; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16
+; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
@@ -469,10 +474,10 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v14
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v15
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v13
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %x, %y
ret i64 %result
@@ -1148,35 +1153,38 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9]
; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s14, s12, s20
+; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s14, s12, s18
; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17]
; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec
; GCN-IR-NEXT: s_cselect_b32 s11, 0, s7
; GCN-IR-NEXT: s_cselect_b32 s10, 0, s6
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21]
; GCN-IR-NEXT: s_mov_b64 s[2:3], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s15, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s14
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s16
-; GCN-IR-NEXT: s_add_u32 s18, s8, -1
-; GCN-IR-NEXT: s_addc_u32 s19, s9, -1
+; GCN-IR-NEXT: s_add_u32 s16, s8, -1
+; GCN-IR-NEXT: s_addc_u32 s17, s9, -1
; GCN-IR-NEXT: s_not_b64 s[2:3], s[12:13]
-; GCN-IR-NEXT: s_add_u32 s12, s2, s20
-; GCN-IR-NEXT: s_addc_u32 s13, s3, 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], 0
+; GCN-IR-NEXT: s_add_u32 s18, s2, s18
+; GCN-IR-NEXT: s_addc_u32 s19, s3, 0
+; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
; GCN-IR-NEXT: s_mov_b32 s3, 0
; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1184,19 +1192,22 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s2, s18, s14
-; GCN-IR-NEXT: s_subb_u32 s2, s19, s15
-; GCN-IR-NEXT: s_ashr_i32 s16, s2, 31
-; GCN-IR-NEXT: s_mov_b32 s17, s16
-; GCN-IR-NEXT: s_and_b32 s2, s16, 1
-; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT: s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], s[2:3]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11]
+; GCN-IR-NEXT: s_sub_u32 s2, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s2, s17, s15
+; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31
+; GCN-IR-NEXT: s_mov_b32 s13, s12
+; GCN-IR-NEXT: s_and_b32 s2, s12, 1
+; GCN-IR-NEXT: s_and_b64 s[20:21], s[12:13], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s14, s14, s20
+; GCN-IR-NEXT: s_subb_u32 s15, s15, s21
+; GCN-IR-NEXT: s_add_u32 s18, s18, 1
+; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
+; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
+; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
; GCN-IR-NEXT: .LBB8_4: ; %Flow7
@@ -1461,34 +1472,37 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9]
; GCN-IR-NEXT: s_sub_u32 s4, s2, s8
; GCN-IR-NEXT: s_subb_u32 s5, s3, s8
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[4:5]
-; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc5
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[4:5]
+; GCN-IR-NEXT: s_add_u32 s2, s14, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s3, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[2:3], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 63
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[10:11], exec
; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s3, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-IR-NEXT: s_or_b32 s9, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
+; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
+; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_lshl_b64 s[2:3], 24, s2
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s8
-; GCN-IR-NEXT: s_add_u32 s14, s4, -1
-; GCN-IR-NEXT: s_addc_u32 s15, s5, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s12
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT: s_add_u32 s12, s4, -1
+; GCN-IR-NEXT: s_addc_u32 s13, s5, -1
+; GCN-IR-NEXT: s_sub_u32 s14, 58, s14
+; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1496,19 +1510,22 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshr_b32 s6, s3, 31
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s6, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s6, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s6, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s6, s12, s10
+; GCN-IR-NEXT: s_subb_u32 s6, s13, s11
+; GCN-IR-NEXT: s_ashr_i32 s8, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s9, s8
+; GCN-IR-NEXT: s_and_b32 s6, s8, 1
+; GCN-IR-NEXT: s_and_b64 s[16:17], s[8:9], s[4:5]
+; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
+; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
+; GCN-IR-NEXT: s_add_u32 s14, s14, 1
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
; GCN-IR-NEXT: .LBB10_4: ; %Flow6
@@ -1647,9 +1664,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1663,53 +1680,52 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB11_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, 58, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1838,9 +1854,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1855,54 +1871,53 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v6
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1937,20 +1952,20 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-LABEL: v_test_srem_pow2_k_den_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v10
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
@@ -1961,51 +1976,50 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB13_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6
+; GCN-IR-NEXT: v_addc_u32_e64 v13, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB13_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -2014,10 +2028,10 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v13
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v11
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %x, 32768
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index e1574dc..bb5918b2 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -14,15 +14,16 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_add_u32 s0, s2, s8
-; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_add_u32 s2, s2, s8
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_addc_u32 s3, s3, s9
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -33,15 +34,15 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_addc_u32 s1, s3, s5
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -52,14 +53,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_add_u32 s4, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_addc_u32 s5, s3, s7
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_add_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -71,12 +72,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s4, s2, s6
-; GFX10-NEXT: s_addc_u32 s5, s3, s7
-; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: s_add_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-NEXT: s_addc_u32 s3, s3, s7
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -87,14 +90,16 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s4, s2, s4
-; GFX11-NEXT: s_addc_u32 s5, s3, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_add_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-NEXT: s_addc_u32 s3, s3, s5
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
@@ -436,21 +441,23 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_add_u32 s6, s4, s6
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: s_addc_u32 s7, s5, s7
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_add_u32 s4, s4, s6
+; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
+; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
+; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
-; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -458,37 +465,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_uaddo_i64:
@@ -497,10 +504,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
@@ -510,12 +519,13 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s6, s4, s6
-; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_add_u32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -551,10 +561,10 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
-; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -574,10 +584,9 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
@@ -590,10 +599,9 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -607,12 +615,11 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v2, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_uaddo_i64:
@@ -624,14 +631,12 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v2, s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 9230174..7f89581 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -693,52 +693,47 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_uaddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uaddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uaddsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uaddsat_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 1ed04f8..41199b0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -146,8 +146,11 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s15, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
@@ -157,9 +160,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s6, -1
; GCN-IR-NEXT: s_addc_u32 s15, s7, -1
; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s2, s2, s16
-; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: s_add_u32 s10, s2, s16
+; GCN-IR-NEXT: s_addc_u32 s11, s3, 0
+; GCN-IR-NEXT: s_mov_b64 s[2:3], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -167,19 +170,22 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s4, s14, s12
-; GCN-IR-NEXT: s_subb_u32 s4, s15, s13
-; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s11, s10
-; GCN-IR-NEXT: s_and_b32 s4, s10, 1
-; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s10
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s11
-; GCN-IR-NEXT: s_add_u32 s2, s2, 1
-; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[2:3], 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s2, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s2, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s2, s2, 31
+; GCN-IR-NEXT: s_mov_b32 s3, s2
+; GCN-IR-NEXT: s_and_b32 s4, s2, 1
+; GCN-IR-NEXT: s_and_b64 s[16:17], s[2:3], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s16
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s17
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow7
@@ -313,19 +319,19 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v14, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v15, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[6:7], v14, v15
+; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v8, v9
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[8:9]
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
@@ -333,55 +339,54 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v10
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v0, v14
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[0:1], v[0:1], v10
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v6, v8
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v6, v9
+; GCN-IR-NEXT: v_addc_u32_e64 v13, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
+; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v6
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v10, v0
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v11, v1, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v3
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0
@@ -923,34 +928,37 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
-; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3]
+; GCN-IR-NEXT: s_add_u32 s8, s14, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec
; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s8
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10
-; GCN-IR-NEXT: s_add_u32 s14, s2, -1
-; GCN-IR-NEXT: s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s12
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT: s_add_u32 s12, s2, -1
+; GCN-IR-NEXT: s_addc_u32 s13, s3, -1
+; GCN-IR-NEXT: s_sub_u32 s14, 58, s14
+; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -958,19 +966,22 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s4, s12, s10
+; GCN-IR-NEXT: s_subb_u32 s4, s13, s11
+; GCN-IR-NEXT: s_ashr_i32 s8, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s9, s8
+; GCN-IR-NEXT: s_and_b32 s4, s8, 1
+; GCN-IR-NEXT: s_and_b64 s[16:17], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
+; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
+; GCN-IR-NEXT: s_add_u32 s14, s14, 1
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
; GCN-IR-NEXT: .LBB8_4: ; %Flow6
@@ -1094,12 +1105,12 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v10
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v8
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5]
@@ -1109,55 +1120,54 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB9_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v6
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB9_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1184,13 +1194,13 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
@@ -1198,52 +1208,51 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB10_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB10_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v6
+; GCN-IR-NEXT: v_lshr_b64 v[0:1], v[0:1], v7
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6
+; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v0
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v4
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v6
+; GCN-IR-NEXT: v_and_b32_e32 v6, 0x8000, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB10_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB10_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1290,52 +1299,58 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s8, 59, s12
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s8, 59, s10
; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec
; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
+; GCN-IR-NEXT: s_add_u32 s11, s8, 1
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
-; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc4
-; GCN-IR-NEXT: s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], s11
+; GCN-IR-NEXT: s_add_u32 s10, s10, 0xffffffc4
+; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, 23, s8
-; GCN-IR-NEXT: s_subb_u32 s4, 0, s9
-; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
-; GCN-IR-NEXT: s_and_b32 s4, s10, 1
-; GCN-IR-NEXT: s_and_b32 s10, s10, 24
-; GCN-IR-NEXT: s_sub_u32 s8, s8, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s9, 0
-; GCN-IR-NEXT: s_add_u32 s2, s2, 1
-; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s4, 23, s2
+; GCN-IR-NEXT: s_subb_u32 s4, 0, s3
+; GCN-IR-NEXT: s_ashr_i32 s8, s4, 31
+; GCN-IR-NEXT: s_and_b32 s4, s8, 1
+; GCN-IR-NEXT: s_and_b32 s8, s8, 24
+; GCN-IR-NEXT: s_sub_u32 s2, s2, s8
+; GCN-IR-NEXT: s_subb_u32 s3, s3, 0
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-IR-NEXT: s_or_b32 s12, s12, s13
+; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3
; GCN-IR-NEXT: .LBB11_4: ; %Flow6
@@ -1384,13 +1399,13 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 59, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 59, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
@@ -1398,51 +1413,50 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffc4, v6
+; GCN-IR-NEXT: v_lshr_b64 v[0:1], v[0:1], v7
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6
+; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v0
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v4
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v6
+; GCN-IR-NEXT: v_and_b32_e32 v6, 24, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index b846ce7..cdcc914 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -170,35 +170,38 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7]
-; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3]
+; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3]
; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s18
+; GCN-IR-NEXT: s_sub_u32 s12, s10, s16
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s15, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
+; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14
-; GCN-IR-NEXT: s_add_u32 s16, s6, -1
-; GCN-IR-NEXT: s_addc_u32 s17, s7, -1
+; GCN-IR-NEXT: s_add_u32 s14, s6, -1
+; GCN-IR-NEXT: s_addc_u32 s15, s7, -1
; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s10, s4, s18
-; GCN-IR-NEXT: s_addc_u32 s11, s5, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT: s_add_u32 s16, s4, s16
+; GCN-IR-NEXT: s_addc_u32 s17, s5, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -206,19 +209,22 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s4, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s4, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s4, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
-; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_and_b32 s4, s10, 1
+; GCN-IR-NEXT: s_and_b64 s[18:19], s[10:11], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
+; GCN-IR-NEXT: s_add_u32 s16, s16, 1
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
+; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow7
@@ -362,12 +368,12 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13
+; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -383,54 +389,53 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v6, v12
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, v6, v11
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8
+; GCN-IR-NEXT: v_addc_u32_e64 v15, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
+; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
@@ -948,34 +953,37 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
-; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3]
+; GCN-IR-NEXT: s_add_u32 s8, s14, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec
; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s8
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10
-; GCN-IR-NEXT: s_add_u32 s14, s2, -1
-; GCN-IR-NEXT: s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s12
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT: s_add_u32 s12, s2, -1
+; GCN-IR-NEXT: s_addc_u32 s13, s3, -1
+; GCN-IR-NEXT: s_sub_u32 s14, 58, s14
+; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB6_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -983,19 +991,22 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s4, s12, s10
+; GCN-IR-NEXT: s_subb_u32 s4, s13, s11
+; GCN-IR-NEXT: s_ashr_i32 s8, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s9, s8
+; GCN-IR-NEXT: s_and_b32 s4, s8, 1
+; GCN-IR-NEXT: s_and_b64 s[16:17], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
+; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
+; GCN-IR-NEXT: s_add_u32 s14, s14, 1
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
+; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3
; GCN-IR-NEXT: .LBB6_4: ; %Flow6
@@ -1064,52 +1075,58 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s8, 59, s12
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s8, 59, s10
; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec
; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
+; GCN-IR-NEXT: s_add_u32 s11, s8, 1
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
+; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s10
-; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc4
-; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s11
+; GCN-IR-NEXT: s_add_u32 s12, s10, 0xffffffc4
+; GCN-IR-NEXT: s_addc_u32 s13, 0, -1
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, 23, s10
-; GCN-IR-NEXT: s_subb_u32 s4, 0, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b32 s12, s12, 24
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, 0
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s4, 23, s8
+; GCN-IR-NEXT: s_subb_u32 s4, 0, s9
+; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
+; GCN-IR-NEXT: s_and_b32 s4, s10, 1
+; GCN-IR-NEXT: s_and_b32 s10, s10, 24
+; GCN-IR-NEXT: s_sub_u32 s8, s8, s10
+; GCN-IR-NEXT: s_subb_u32 s9, s9, 0
+; GCN-IR-NEXT: s_add_u32 s12, s12, 1
+; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-IR-NEXT: s_or_b32 s14, s14, s15
+; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15]
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
; GCN-IR-NEXT: .LBB7_4: ; %Flow6
@@ -1241,8 +1258,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1257,54 +1274,53 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB8_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execz .LBB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, 47, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v6
+; GCN-IR-NEXT: v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1
+; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB8_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB8_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1337,8 +1353,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
@@ -1352,51 +1368,50 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB9_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6
+; GCN-IR-NEXT: v_addc_u32_e64 v11, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB9_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 0289dab..d67a7b1 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -14,15 +14,16 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_sub_u32 s0, s2, s8
-; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_sub_u32 s2, s2, s8
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_subb_u32 s3, s3, s9
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -33,15 +34,15 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_subb_u32 s1, s3, s5
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -52,14 +53,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_sub_u32 s4, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_subb_u32 s5, s3, s7
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_sub_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_subb_u32 s4, s3, s7
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -71,12 +72,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_u32 s4, s2, s6
-; GFX10-NEXT: s_subb_u32 s5, s3, s7
-; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: s_sub_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-NEXT: s_subb_u32 s3, s3, s7
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -87,14 +90,16 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s4, s2, s4
-; GFX11-NEXT: s_subb_u32 s5, s3, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_sub_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-NEXT: s_subb_u32 s3, s3, s5
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
@@ -435,21 +440,23 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_u32 s6, s4, s6
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: s_subb_u32 s7, s5, s7
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_sub_u32 s4, s4, s6
+; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
+; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
+; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
-; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -457,37 +464,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_usubo_i64:
@@ -496,10 +503,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
@@ -509,12 +518,13 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s6, s4, s6
-; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_sub_u32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -550,10 +560,10 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
-; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -573,10 +583,9 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
-; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
@@ -589,10 +598,9 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -606,12 +614,11 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
-; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v2, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_usubo_i64:
@@ -623,14 +630,12 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v2, s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 90491a0..3ddb2f0 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -730,52 +730,38 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_usubsat_i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_usubsat_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: v_usubsat_i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
}
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags_V1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags_V1.ll
new file mode 100644
index 0000000..610ce4f
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags_V1.ll
@@ -0,0 +1,18 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+; On Version 1, the only valid flag is DataVolatile (2).
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+
+; CHECK: error: Invalid value for RootDescriptorFlag: 4
+; CHECK-NOT: Root Signature Definitions
+define void @main() #0 {
+entry:
+ ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 1 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"RootCBV", i32 0, i32 1, i32 2, i32 4 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag_V1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag_V1.ll
new file mode 100644
index 0000000..76b60b8
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag_V1.ll
@@ -0,0 +1,19 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Invalid value for Static Sampler Flag: 1
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+ ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 1 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 1 }
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index 02118fb..b503da4 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -72,7 +72,7 @@ define internal void @bar() {
; CHECK-NEXT: [[OFFSET:%.*]] = ashr exact i64 [[TMP2]], 3
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr addrspace(1) [[BEGIN]], i64 [[OFFSET]]
; CHECK-NEXT: [[START:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP3]], i64 -1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt ptr addrspace(1) [[START]], [[BEGIN]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge ptr addrspace(1) [[START]], [[BEGIN]]
; CHECK-NEXT: br i1 [[TMP4]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
; CHECK: while.entry:
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[START]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
index 4b1359e..73b0d3a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+zvfh -global-isel -stop-after=irtranslator \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+zvfhmin -global-isel -stop-after=irtranslator \
; RUN: -verify-machineinstrs < %s | FileCheck -check-prefixes=RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+zvfh -global-isel -stop-after=irtranslator \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+zvfhmin -global-isel -stop-after=irtranslator \
; RUN: -verify-machineinstrs < %s | FileCheck -check-prefixes=RV64 %s
; ==========================================================================
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index 1361d92..2e500d5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -72,12 +72,12 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
-# DEBUG-NEXT: G_ABDS (opcode 65): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_ABDS (opcode [[G_ABDS:[0-9]+]]): 1 type index, 0 imm indices
# DEBUG-NEXT:.. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT:.. imm index coverage check SKIPPED: user-defined predicate detected
#
-# DEBUG-NEXT:G_ABDU (opcode 66): 1 type index, 0 imm indices
-# DEBUG-NEXT:.. opcode 66 is aliased to 65
+# DEBUG-NEXT:G_ABDU (opcode [[G_ABDU:[0-9]+]]): 1 type index, 0 imm indices
+# DEBUG-NEXT:.. opcode [[G_ABDU]] is aliased to [[G_ABDS]]
# DEBUG-NEXT:.. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT:.. imm index coverage check SKIPPED: user-defined predicate detected
#
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_predicated_io/predicated_io_generic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_predicated_io/predicated_io_generic.ll
new file mode 100644
index 0000000..a3127e8
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_predicated_io/predicated_io_generic.ll
@@ -0,0 +1,36 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_predicated_io %s -o - | FileCheck %s
+
+; CHECK-ERROR: LLVM ERROR: OpPredicated[Load/Store]INTEL
+; CHECK-ERROR-SAME: instructions require the following SPIR-V extension: SPV_INTEL_predicated_io
+
+; CHECK-DAG: Capability PredicatedIOINTEL
+; CHECK-DAG: Extension "SPV_INTEL_predicated_io"
+
+; CHECK-DAG: %[[Int32Ty:[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: %[[IntPtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[Int32Ty]]
+; CHECK-DAG: %[[BoolTy:[0-9]+]] = OpTypeBool
+; CHECK-DAG: %[[VoidTy:[0-9]+]] = OpTypeVoid
+; CHECK: %[[LoadPtr:[0-9]+]] = OpFunctionParameter %[[IntPtrTy]]
+; CHECK: %[[StorePtr:[0-9]+]] = OpFunctionParameter %[[IntPtrTy]]
+; CHECK: %[[DefaultVal:[0-9]+]] = OpFunctionParameter %[[Int32Ty]]
+; CHECK: %[[StoreObj:[0-9]+]] = OpFunctionParameter %[[Int32Ty]]
+; CHECK: %[[Predicate:[0-9]+]] = OpFunctionParameter %[[BoolTy]]
+; CHECK: PredicatedLoadINTEL %[[Int32Ty]] %[[LoadPtr]] %[[Predicate]] %[[DefaultVal]]
+; CHECK: PredicatedLoadINTEL %[[Int32Ty]] %[[LoadPtr]] %[[Predicate]] %[[DefaultVal]] None
+; CHECK: PredicatedStoreINTEL %[[StorePtr]] %[[StoreObj]] %[[Predicate]]
+; CHECK: PredicatedStoreINTEL %[[StorePtr]] %[[StoreObj]] %[[Predicate]] None
+
+define spir_func void @foo(ptr addrspace(1) %load_pointer, ptr addrspace(1) %store_pointer, i32 %default_value, i32 %store_object, i1 zeroext %predicate) {
+entry:
+ %1 = call spir_func i32 @_Z27__spirv_PredicatedLoadINTELPU3AS1Kibi(ptr addrspace(1) %load_pointer, i1 %predicate, i32 %default_value)
+ %2 = call spir_func i32 @_Z27__spirv_PredicatedLoadINTELPU3AS1Kibii(ptr addrspace(1) %load_pointer, i1 %predicate, i32 %default_value, i32 0)
+ call spir_func void @_Z28__spirv_PredicatedStoreINTELPU3AS1Kiib(ptr addrspace(1) %store_pointer, i32 %store_object, i1 %predicate)
+ call spir_func void @_Z28__spirv_PredicatedStoreINTELPU3AS1Kiibi(ptr addrspace(1) %store_pointer, i32 %store_object, i1 %predicate, i32 0)
+ ret void
+}
+
+declare spir_func i32 @_Z27__spirv_PredicatedLoadINTELPU3AS1Kibi(ptr addrspace(1), i1, i32)
+declare spir_func i32 @_Z27__spirv_PredicatedLoadINTELPU3AS1Kibii(ptr addrspace(1), i1, i32, i32)
+declare spir_func void @_Z28__spirv_PredicatedStoreINTELPU3AS1Kiib(ptr addrspace(1), i32, i1)
+declare spir_func void @_Z28__spirv_PredicatedStoreINTELPU3AS1Kiibi(ptr addrspace(1), i32, i1, i32)
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index 52f57dc..a8d37be 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -434,7 +434,6 @@ entry:
define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-LABEL: stest_f16i16:
; CHECK: .functype stest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128, v128, v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -474,15 +473,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
-; CHECK-NEXT: local.tee 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 10
-; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.splat
@@ -495,13 +485,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: local.get 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: local.get 10
-; CHECK-NEXT: v128.and
-; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i16x8.narrow_i32x4_s
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -516,7 +500,6 @@ entry:
define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-LABEL: utest_f16i16:
; CHECK: .functype utest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -556,9 +539,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.splat
@@ -571,8 +551,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: # fallthrough-return
entry:
@@ -1861,7 +1839,6 @@ entry:
define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-LABEL: stest_f16i16_mm:
; CHECK: .functype stest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128, v128, v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -1901,15 +1878,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
-; CHECK-NEXT: local.tee 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 10
-; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.splat
@@ -1922,13 +1890,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: local.get 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: local.get 10
-; CHECK-NEXT: v128.and
-; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i16x8.narrow_i32x4_s
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -1941,7 +1903,6 @@ entry:
define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-LABEL: utest_f16i16_mm:
; CHECK: .functype utest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -1981,9 +1942,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.splat
@@ -1996,8 +1954,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: # fallthrough-return
entry:
diff --git a/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
new file mode 100644
index 0000000..f3f3ba9
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) #2
+declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) #2
+
+define <16 x i8> @i16_signed(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_signed:
+; CHECK: .functype i16_signed (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.narrow_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> splat (i16 -128))
+ %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
+ %3 = trunc nsw <16 x i16> %2 to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @i32_signed(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_signed:
+; CHECK: .functype i32_signed (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> splat (i32 -32768))
+ %2 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %1, <8 x i32> splat (i32 32767))
+ %3 = trunc nsw <8 x i32> %2 to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @i32_signed_flipped(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_signed_flipped:
+; CHECK: .functype i32_signed_flipped (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> splat (i32 32767), <8 x i32> %0)
+ %2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> splat (i32 -32768), <8 x i32> %1)
+ %3 = trunc nsw <8 x i32> %2 to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_unsigned:
+; CHECK: .functype i16_unsigned (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> splat (i16 255))
+ %2 = trunc nuw <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_unsigned:
+; CHECK: .functype i32_unsigned (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535))
+ %2 = trunc nuw <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll
new file mode 100644
index 0000000..d89f0db
--- /dev/null
+++ b/llvm/test/CodeGen/X86/and-mask-variable.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-NOBMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
+
+define i32 @mask_pair(i32 %x, i32 %y) nounwind {
+; X86-NOBMI-LABEL: mask_pair:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: shrl %cl, %eax
+; X86-NOBMI-NEXT: shll %cl, %eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI2-LABEL: mask_pair:
+; X86-BMI2: # %bb.0:
+; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax
+; X86-BMI2-NEXT: retl
+;
+; X64-NOBMI-LABEL: mask_pair:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %esi, %ecx
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: shrl %cl, %eax
+; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT: shll %cl, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI2-LABEL: mask_pair:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shrxl %esi, %edi, %eax
+; X64-BMI2-NEXT: shlxl %esi, %eax, %eax
+; X64-BMI2-NEXT: retq
+ %shl = shl nsw i32 -1, %y
+ %and = and i32 %shl, %x
+ ret i32 %and
+}
+
+define i64 @mask_pair_64(i64 %x, i64 %y) nounwind {
+; X86-NOBMI-LABEL: mask_pair_64:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl $-1, %edx
+; X86-NOBMI-NEXT: movl $-1, %eax
+; X86-NOBMI-NEXT: shll %cl, %eax
+; X86-NOBMI-NEXT: testb $32, %cl
+; X86-NOBMI-NEXT: je .LBB1_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: movl %eax, %edx
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB1_2:
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI2-LABEL: mask_pair_64:
+; X86-BMI2: # %bb.0:
+; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT: movl $-1, %edx
+; X86-BMI2-NEXT: shlxl %ecx, %edx, %eax
+; X86-BMI2-NEXT: testb $32, %cl
+; X86-BMI2-NEXT: je .LBB1_2
+; X86-BMI2-NEXT: # %bb.1:
+; X86-BMI2-NEXT: movl %eax, %edx
+; X86-BMI2-NEXT: xorl %eax, %eax
+; X86-BMI2-NEXT: .LBB1_2:
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT: retl
+;
+; X64-NOBMI-LABEL: mask_pair_64:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movq %rsi, %rcx
+; X64-NOBMI-NEXT: movq %rdi, %rax
+; X64-NOBMI-NEXT: shrq %cl, %rax
+; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT: shlq %cl, %rax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI2-LABEL: mask_pair_64:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: retq
+ %shl = shl nsw i64 -1, %y
+ %and = and i64 %shl, %x
+ ret i64 %and
+}
+
+define i128 @mask_pair_128(i128 %x, i128 %y) nounwind {
+; X86-NOBMI-LABEL: mask_pair_128:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: pushl %ebx
+; X86-NOBMI-NEXT: pushl %edi
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: subl $32, %esp
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: movl $0, (%esp)
+; X86-NOBMI-NEXT: movl %ecx, %edx
+; X86-NOBMI-NEXT: shrb $3, %dl
+; X86-NOBMI-NEXT: andb $12, %dl
+; X86-NOBMI-NEXT: negb %dl
+; X86-NOBMI-NEXT: movsbl %dl, %ebx
+; X86-NOBMI-NEXT: movl 24(%esp,%ebx), %edx
+; X86-NOBMI-NEXT: movl 28(%esp,%ebx), %esi
+; X86-NOBMI-NEXT: shldl %cl, %edx, %esi
+; X86-NOBMI-NEXT: movl 16(%esp,%ebx), %edi
+; X86-NOBMI-NEXT: movl 20(%esp,%ebx), %ebx
+; X86-NOBMI-NEXT: shldl %cl, %ebx, %edx
+; X86-NOBMI-NEXT: shldl %cl, %edi, %ebx
+; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT: shll %cl, %edi
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT: movl %esi, 12(%eax)
+; X86-NOBMI-NEXT: movl %edx, 8(%eax)
+; X86-NOBMI-NEXT: movl %ebx, 4(%eax)
+; X86-NOBMI-NEXT: movl %edi, (%eax)
+; X86-NOBMI-NEXT: addl $32, %esp
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: popl %edi
+; X86-NOBMI-NEXT: popl %ebx
+; X86-NOBMI-NEXT: retl $4
+;
+; X86-BMI2-LABEL: mask_pair_128:
+; X86-BMI2: # %bb.0:
+; X86-BMI2-NEXT: pushl %ebx
+; X86-BMI2-NEXT: pushl %edi
+; X86-BMI2-NEXT: pushl %esi
+; X86-BMI2-NEXT: subl $32, %esp
+; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $-1, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT: movl $0, (%esp)
+; X86-BMI2-NEXT: movl %ecx, %edx
+; X86-BMI2-NEXT: shrb $3, %dl
+; X86-BMI2-NEXT: andb $12, %dl
+; X86-BMI2-NEXT: negb %dl
+; X86-BMI2-NEXT: movsbl %dl, %edi
+; X86-BMI2-NEXT: movl 24(%esp,%edi), %edx
+; X86-BMI2-NEXT: movl 28(%esp,%edi), %esi
+; X86-BMI2-NEXT: shldl %cl, %edx, %esi
+; X86-BMI2-NEXT: movl 16(%esp,%edi), %ebx
+; X86-BMI2-NEXT: movl 20(%esp,%edi), %edi
+; X86-BMI2-NEXT: shldl %cl, %edi, %edx
+; X86-BMI2-NEXT: shldl %cl, %ebx, %edi
+; X86-BMI2-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT: movl %esi, 12(%eax)
+; X86-BMI2-NEXT: movl %edx, 8(%eax)
+; X86-BMI2-NEXT: movl %edi, 4(%eax)
+; X86-BMI2-NEXT: movl %ecx, (%eax)
+; X86-BMI2-NEXT: addl $32, %esp
+; X86-BMI2-NEXT: popl %esi
+; X86-BMI2-NEXT: popl %edi
+; X86-BMI2-NEXT: popl %ebx
+; X86-BMI2-NEXT: retl $4
+;
+; X64-NOBMI-LABEL: mask_pair_128:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movq %rdx, %rcx
+; X64-NOBMI-NEXT: movq $-1, %rdx
+; X64-NOBMI-NEXT: movq $-1, %r8
+; X64-NOBMI-NEXT: shlq %cl, %r8
+; X64-NOBMI-NEXT: xorl %eax, %eax
+; X64-NOBMI-NEXT: testb $64, %cl
+; X64-NOBMI-NEXT: cmovneq %r8, %rdx
+; X64-NOBMI-NEXT: cmoveq %r8, %rax
+; X64-NOBMI-NEXT: andq %rdi, %rax
+; X64-NOBMI-NEXT: andq %rsi, %rdx
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI2-LABEL: mask_pair_128:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movq $-1, %rcx
+; X64-BMI2-NEXT: shlxq %rdx, %rcx, %r8
+; X64-BMI2-NEXT: xorl %eax, %eax
+; X64-BMI2-NEXT: testb $64, %dl
+; X64-BMI2-NEXT: cmovneq %r8, %rcx
+; X64-BMI2-NEXT: cmoveq %r8, %rax
+; X64-BMI2-NEXT: andq %rdi, %rax
+; X64-BMI2-NEXT: andq %rsi, %rcx
+; X64-BMI2-NEXT: movq %rcx, %rdx
+; X64-BMI2-NEXT: retq
+ %shl = shl nsw i128 -1, %y
+ %and = and i128 %shl, %x
+ ret i128 %and
+}
diff --git a/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll b/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll
new file mode 100644
index 0000000..92aedfe
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll
@@ -0,0 +1,36 @@
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=1 | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=10 | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=100 | FileCheck %s
+
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=1 -fast-isel=true | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=10 -fast-isel=true | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=100 -fast-isel=true | FileCheck %s
+
+; This test has the same purpose as instr-ref-opt-bisect.ll: to check that we
+; do not hit an assert when opt-bisect changes the optimisation level.
+; It simply exercises different IR.
+
+; CHECK: DBG_VALUE
+
+target triple = "x86_64-pc-windows-msvc"
+
+define i1 @foo(i32 %arg) !dbg !3 {
+entry:
+ #dbg_value(i32 %arg, !4, !DIExpression(), !5)
+ switch i32 %arg, label %bb [
+ i32 810, label %bb
+ ], !dbg !5
+bb:
+ %a = load volatile i1, ptr null, align 1
+ ret i1 false
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1)
+!1 = !DIFile(filename: "instr-ref-opt-bisect2.ll", directory: ".")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "instr-ref-opt-bisect2", file: !1, unit: !0)
+!4 = !DILocalVariable(name: "arg", arg: 2, scope: !3)
+!5 = !DILocation(line: 0, scope: !3)
diff --git a/llvm/test/Instrumentation/AllocToken/extralibfuncs.ll b/llvm/test/Instrumentation/AllocToken/extralibfuncs.ll
index 5f08552..0e382b2 100644
--- a/llvm/test/Instrumentation/AllocToken/extralibfuncs.ll
+++ b/llvm/test/Instrumentation/AllocToken/extralibfuncs.ll
@@ -38,7 +38,7 @@ entry:
ret ptr %ptr1
}
-!0 = !{!"int"}
+!0 = !{!"int", i1 0}
;.
-; CHECK: [[META0]] = !{!"int"}
+; CHECK: [[META0]] = !{!"int", i1 false}
;.
diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
index e023ab6b..19673da 100644
--- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
+++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
@@ -79,7 +79,7 @@ entry:
ret ptr %ptr1
}
-!0 = !{!"int"}
+!0 = !{!"int", i1 0}
;.
-; CHECK: [[META0]] = !{!"int"}
+; CHECK: [[META0]] = !{!"int", i1 false}
;.
diff --git a/llvm/test/Instrumentation/AllocToken/remark.ll b/llvm/test/Instrumentation/AllocToken/remark.ll
index a2404526..f2eaa62 100644
--- a/llvm/test/Instrumentation/AllocToken/remark.ll
+++ b/llvm/test/Instrumentation/AllocToken/remark.ll
@@ -32,7 +32,7 @@ entry:
ret ptr %ptr1
}
-!0 = !{!"int"}
+!0 = !{!"int", i1 0}
;.
-; CHECK: [[META0]] = !{!"int"}
+; CHECK: [[META0]] = !{!"int", i1 false}
;.
diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
new file mode 100644
index 0000000..1f77648
--- /dev/null
+++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+declare ptr @malloc(i64)
+
+define void @test_typehashpointersplit() sanitize_alloc_token {
+; CHECK-LABEL: define void @test_typehashpointersplit(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__alloc_token_malloc(i64 4, i64 0), !alloc_token [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @__alloc_token_malloc(i64 128, i64 0), !alloc_token [[META1:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__alloc_token_malloc(i64 8, i64 1), !alloc_token [[META2:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr @__alloc_token_malloc(i64 64, i64 1), !alloc_token [[META3:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call ptr @malloc(i64 4), !alloc_token !0
+ call ptr @malloc(i64 128), !alloc_token !1
+ call ptr @malloc(i64 8), !alloc_token !2
+ call ptr @malloc(i64 64), !alloc_token !3
+ ret void
+}
+
+!0 = !{!"int", i1 0}
+!1 = !{!"Foo", i1 0}
+!2 = !{!"int*", i1 1}
+!3 = !{!"Foo", i1 1}
+;.
+; CHECK: [[META0]] = !{!"int", i1 false}
+; CHECK: [[META1]] = !{!"Foo", i1 false}
+; CHECK: [[META2]] = !{!"int*", i1 true}
+; CHECK: [[META3]] = !{!"Foo", i1 true}
+;.
diff --git a/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s b/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s
new file mode 100644
index 0000000..f8baf37
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s
@@ -0,0 +1,95 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ERROR
+
+tlbip ALLE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip PAALL
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip PAALLOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip RPALOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip RPAOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index bb3001e..a7d3446 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -91,12 +91,13 @@
@ctz7.table = internal unnamed_addr constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @ctz1(i32 %x) {
+define i32 @ctz1(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -498,3 +499,7 @@ entry:
%conv = zext i8 %0 to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
index d2ecb57..0e5c4f0 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
@@ -20,13 +20,14 @@
@table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
-define i32 @ctz6(ptr nocapture readonly %b) {
+define i32 @ctz6(ptr nocapture readonly %b) !prof !0 {
; CHECK-LABEL: @ctz6(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[TMP4]]
;
@@ -40,3 +41,7 @@ entry:
%1 = load i32, ptr %arrayidx, align 4
ret i32 %1
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
index f63badb..a7732f0 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
@@ -20,13 +20,14 @@
@.str = private constant [3 x i8] c"%u\00", align 1
@test.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @test() {
+define i32 @test() !prof !0 {
; CHECK-LABEL: @test(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @x, align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -43,3 +44,7 @@ entry:
%conv = zext i8 %1 to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
index bbdd9b7c..5f9b4ce 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
@@ -3,12 +3,13 @@
@ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @ctz1(i32 %x) {
+define i32 @ctz1(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -24,3 +25,7 @@ entry:
%conv = zext i8 %0 to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index f5329cf..c225ede5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -580,6 +580,201 @@ exit:
ret double %accum
}
+define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %src, ptr noalias %src.2, ptr noalias %dst) #0 {
+; I64-LABEL: define void @loaded_address_used_by_load_through_blend(
+; I64-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
+; I64-NEXT: [[ENTRY:.*]]:
+; I64-NEXT: br label %[[LOOP_HEADER:.*]]
+; I64: [[LOOP_HEADER]]:
+; I64-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; I64-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; I64-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
+; I64-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
+; I64-NEXT: [[L_SRC:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; I64-NEXT: [[C:%.*]] = fcmp oeq float [[L_SRC]], 0.000000e+00
+; I64-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; I64: [[THEN]]:
+; I64-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV_1]], [[START]]
+; I64-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[IV_MUL]]
+; I64-NEXT: br label %[[LOOP_LATCH]]
+; I64: [[LOOP_LATCH]]:
+; I64-NEXT: [[MERGE_GEP:%.*]] = phi ptr [ [[GEP_SRC_2]], %[[THEN]] ], [ [[SRC_2]], %[[LOOP_HEADER]] ]
+; I64-NEXT: [[L_2:%.*]] = load float, ptr [[MERGE_GEP]], align 4
+; I64-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; I64-NEXT: store float [[L_2]], ptr [[GEP_DST]], align 4
+; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; I64-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
+; I64-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 100
+; I64-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
+; I64: [[EXIT]]:
+; I64-NEXT: ret void
+;
+; I32-LABEL: define void @loaded_address_used_by_load_through_blend(
+; I32-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
+; I32-NEXT: [[ENTRY:.*:]]
+; I32-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
+; I32-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 100)
+; I32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
+; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
+; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; I32: [[VECTOR_PH]]:
+; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
+; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
+; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
+; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
+; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; I32-NEXT: br label %[[VECTOR_BODY:.*]]
+; I32: [[VECTOR_BODY]]:
+; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; I32-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; I32-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
+; I32-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
+; I32-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
+; I32-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 5
+; I32-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
+; I32-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
+; I32-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1
+; I32-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], 1
+; I32-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], 1
+; I32-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], 1
+; I32-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], 1
+; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
+; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
+; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
+; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
+; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
+; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
+; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
+; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
+; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
+; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
+; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
+; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
+; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
+; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; I32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; I32-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
+; I32-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
+; I32-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
+; I32-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
+; I32-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
+; I32-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4
+; I32-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4
+; I32-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4
+; I32-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4
+; I32-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4
+; I32-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4
+; I32-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4
+; I32-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP35]], i32 0
+; I32-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP36]], i32 1
+; I32-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP37]], i32 2
+; I32-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP38]], i32 3
+; I32-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP39]], i32 4
+; I32-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP40]], i32 5
+; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
+; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
+; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
+; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
+; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
+; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
+; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
+; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
+; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
+; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
+; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
+; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
+; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
+; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
+; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
+; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
+; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
+; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
+; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
+; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
+; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
+; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
+; I32-NEXT: [[TMP71:%.*]] = insertelement <8 x ptr> [[TMP70]], ptr [[TMP58]], i32 2
+; I32-NEXT: [[TMP72:%.*]] = insertelement <8 x ptr> [[TMP71]], ptr [[TMP60]], i32 3
+; I32-NEXT: [[TMP73:%.*]] = insertelement <8 x ptr> [[TMP72]], ptr [[TMP62]], i32 4
+; I32-NEXT: [[TMP74:%.*]] = insertelement <8 x ptr> [[TMP73]], ptr [[TMP64]], i32 5
+; I32-NEXT: [[TMP75:%.*]] = insertelement <8 x ptr> [[TMP74]], ptr [[TMP66]], i32 6
+; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
+; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
+; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
+; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
+; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
+; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
+; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
+; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
+; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
+; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
+; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
+; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
+; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
+; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
+; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
+; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
+; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
+; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
+; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; I32-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
+; I32-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; I32-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; I32-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; I32-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
+; I32-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
+; I32-NEXT: store float [[TMP78]], ptr [[TMP93]], align 4
+; I32-NEXT: store float [[TMP80]], ptr [[TMP94]], align 4
+; I32-NEXT: store float [[TMP82]], ptr [[TMP95]], align 4
+; I32-NEXT: store float [[TMP84]], ptr [[TMP96]], align 4
+; I32-NEXT: store float [[TMP86]], ptr [[TMP97]], align 4
+; I32-NEXT: store float [[TMP88]], ptr [[TMP98]], align 4
+; I32-NEXT: store float [[TMP90]], ptr [[TMP99]], align 4
+; I32-NEXT: store float [[TMP92]], ptr [[TMP100]], align 4
+; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; I32-NEXT: [[TMP101:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; I32-NEXT: br i1 [[TMP101]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; I32: [[MIDDLE_BLOCK]]:
+; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; I32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; I32: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.2 = phi i64 [ %start, %entry ], [ %iv.2.next, %loop.latch ]
+ %iv.1 = add i64 %iv, 1
+ %gep.src = getelementptr i8, ptr %src, i64 %iv.1
+ %l.src = load float, ptr %gep.src, align 4
+ %c = fcmp oeq float %l.src, 0.000000e+00
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %iv.mul = mul i64 %iv.1, %start
+ %gep.src.2 = getelementptr i8, ptr %src.2, i64 %iv.mul
+ br label %loop.latch
+
+loop.latch:
+ %merge.gep = phi ptr [ %gep.src.2, %then ], [ %src.2, %loop.header ]
+ %l.2 = load float, ptr %merge.gep, align 4
+ %gep.dst = getelementptr i8, ptr %dst, i64 %iv
+ store float %l.2, ptr %gep.dst, align 4
+ %iv.next = add i64 %iv, 1
+ %iv.2.next = add i64 %iv.2, -1
+ %ec = icmp sgt i64 %iv.2, 100
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="znver3" }
attributes #0 = { "target-cpu"="znver2" }
!0 = distinct !{!0, !1}
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-alloc-token.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-alloc-token.ll
index 9bbe3eb..42d3dcc 100644
--- a/llvm/test/Transforms/SimplifyCFG/merge-calls-alloc-token.ll
+++ b/llvm/test/Transforms/SimplifyCFG/merge-calls-alloc-token.ll
@@ -97,8 +97,8 @@ if.end:
ret ptr %x.0
}
-!0 = !{!"int"}
-!1 = !{!"char[4]"}
+!0 = !{!"int", i1 0}
+!1 = !{!"char[4]", i1 0}
;.
-; CHECK: [[META0]] = !{!"int"}
+; CHECK: [[META0]] = !{!"int", i1 false}
;.
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe
index 309476a..a4c36a3 100644
--- a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe
+++ b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe
Binary files differ
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
index ec5c8ff..29a8803 100644
--- a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
+++ b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
@@ -1,13 +1,13 @@
PERF_RECORD_MMAP2 5752/0: [0x7ff70a1b0000(0x640000) @ 0x1000 00:00 0 0]: r-xp c:\Users\haohaiwe\Desktop\coff-profile.exe
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/P/X/A/0 0x7ff70a1b1400/0x7ff70a1b13a0/M/X/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/-/X/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
+ 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0 0x7ff70a1b1461/0x7ff70a1b1410/P/-/A/0
diff --git a/llvm/test/tools/llvm-profgen/coff-profile.test b/llvm/test/tools/llvm-profgen/coff-profile.test
index 5578f73..6411642 100644
--- a/llvm/test/tools/llvm-profgen/coff-profile.test
+++ b/llvm/test/tools/llvm-profgen/coff-profile.test
@@ -1,37 +1,77 @@
+; RUN: llvm-profgen --format=text --use-dwarf-correlation --perfscript=%S/Inputs/coff-profile.perfscript --binary=%S/Inputs/coff-profile.exe --output=%t
+; RUN: FileCheck %s --input-file %t --check-prefix=DWARF
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/coff-profile.perfscript --binary=%S/Inputs/coff-profile.exe --output=%t
-; RUN: FileCheck %s --input-file %t
+; RUN: FileCheck %s --input-file %t --check-prefix=PROBE
-CHECK: main:31837:0
-CHECK-NEXT: 0: 0
-CHECK-NEXT: 3.1: 0
-CHECK-NEXT: 3.2: 0
-CHECK-NEXT: 8: 0
-CHECK-NEXT: 65501: 0
-CHECK-NEXT: 1: ??$init@HG@MyNameSpace2@@YAXHPEAG@Z:0
-CHECK-NEXT: 1: 0
-CHECK-NEXT: 1.1: 0
-CHECK-NEXT: 1.2: 0
-CHECK-NEXT: 2: 0
-CHECK-NEXT: 65514: 0
-CHECK-NEXT: 4: ?work1@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:3193
-CHECK-NEXT: 0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:3193
-CHECK-NEXT: 1.1: 31
-CHECK-NEXT: 1.2: 31
-CHECK-NEXT: 2: 31
-CHECK-NEXT: 3: 31
-CHECK-NEXT: 65530: 0
-CHECK-NEXT: 5: ?work2@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:28644
-CHECK-NEXT: 0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:28644
-CHECK-NEXT: 1.1: 341
-CHECK-NEXT: 1.2: 341
-CHECK-NEXT: 2: 341
-CHECK-NEXT: 3: 341
-CHECK-NEXT: 65530: 0
-CHECK-NEXT: 7: ?print@MyNameSpace2@@YAXPEAGH@Z:0
-CHECK-NEXT: 1: 0
+DWARF: main:31341:0
+DWARF-NEXT: 0: 0
+DWARF-NEXT: 3: 0
+DWARF-NEXT: 3.1: 0
+DWARF-NEXT: 3.2: 0
+DWARF-NEXT: 8: 0
+DWARF-NEXT: 65501: 0
+DWARF-NEXT: 1: ??$init@HG@MyNameSpace2@@YAXHPEAG@Z:0
+DWARF-NEXT: 1: 0
+DWARF-NEXT: 1.1: 0
+DWARF-NEXT: 1.2: 0
+DWARF-NEXT: 2: 0
+DWARF-NEXT: 65514: 0
+DWARF-NEXT: 4: ?work1@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:3038
+DWARF-NEXT: 0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:3038
+DWARF-NEXT: 1.1: 31
+DWARF-NEXT: 1.2: 31
+DWARF-NEXT: 2: 31
+DWARF-NEXT: 3: 31
+DWARF-NEXT: 5: ?work2@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:28303
+DWARF-NEXT: 0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:28303
+DWARF-NEXT: 1.1: 341
+DWARF-NEXT: 1.2: 341
+DWARF-NEXT: 2: 341
+DWARF-NEXT: 3: 341
+DWARF-NEXT: 7: ?print@MyNameSpace2@@YAXPEAGH@Z:0
+DWARF-NEXT: 1: 0
+
+PROBE: main:1116:0
+PROBE-NEXT: 1: 0
+PROBE-NEXT: 3: 0
+PROBE-NEXT: 4: 0
+PROBE-NEXT: 5: 0
+PROBE-NEXT: 8: 0
+PROBE-NEXT: 9: 0
+PROBE-NEXT: 2: ??$init@HG@MyNameSpace2@@YAXHPEAG@Z:0
+PROBE-NEXT: 1: 0
+PROBE-NEXT: 2: 0
+PROBE-NEXT: 3: 0
+PROBE-NEXT: 4: 0
+PROBE-NEXT: 5: 0
+PROBE-NEXT: 6: 0
+PROBE-NEXT: !CFGChecksum: 107105011060
+PROBE-NEXT: 6: ?work1@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:93
+PROBE-NEXT: 1: 0
+PROBE-NEXT: 2: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:93
+PROBE-NEXT: 1: 0
+PROBE-NEXT: 2: 31
+PROBE-NEXT: 4: 31
+PROBE-NEXT: 5: 31
+PROBE-NEXT: !CFGChecksum: 107105011060
+PROBE-NEXT: !CFGChecksum: 281479271677951
+PROBE-NEXT: 7: ?work2@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:1023
+PROBE-NEXT: 2: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:1023
+PROBE-NEXT: 2: 341
+PROBE-NEXT: 3: 0
+PROBE-NEXT: 4: 341
+PROBE-NEXT: 5: 341
+PROBE-NEXT: 6: 0
+PROBE-NEXT: !CFGChecksum: 107105011060
+PROBE-NEXT: !CFGChecksum: 281479271677951
+PROBE-NEXT: 10: ?print@MyNameSpace2@@YAXPEAGH@Z:0
+PROBE-NEXT: 1: 0
+PROBE-NEXT: 2: 0
+PROBE-NEXT: !CFGChecksum: 281479271677951
+PROBE-NEXT: !CFGChecksum: 1126005794311845
; Original code
-; clang-cl.exe -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -Xclang -fdebug-info-for-profiling -link -debug:dwarf
+; clang-cl.exe -O2 -gdwarf -gline-tables-only -fpseudo-probe-for-profiling coff-profile.cpp -fuse-ld=lld -Xclang -fdebug-info-for-profiling -link -debug:dwarf
#include <stdio.h>