aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPetar Avramovic <Petar.Avramovic@amd.com>2023-12-13 16:42:56 +0100
committerGitHub <noreply@github.com>2023-12-13 16:42:56 +0100
commit6892c175c565e59cf485ada6b1febd41b4666414 (patch)
tree7a8c3e41808e3a977abf81a72a4c89bbabac41f4
parentae2f8167eefbc78c6b6408c9ee3b7c7965ed596a (diff)
downloadllvm-6892c175c565e59cf485ada6b1febd41b4666414.zip
llvm-6892c175c565e59cf485ada6b1febd41b4666414.tar.gz
llvm-6892c175c565e59cf485ada6b1febd41b4666414.tar.bz2
AMDGPU/GlobalISel: add AMDGPUGlobalISelDivergenceLowering pass (#75340)
Add empty AMDGPUGlobalISelDivergenceLowering pass. This pass will implement - selection of divergent i1 phis as lane mask phis, requires lane mask merging in some cases - lower uses of divergent i1 values outside of the cycle using lane mask merging - lowering of all cases of temporal divergence: - lower uses of uniform i1 values outside of the cycle using lane mask merging - lower uses of uniform non-i1 values outside of the cycle using a copy to vgpr inside of the cycle Add very detailed set of regression tests for cases mentioned above. patch 1 from: https://github.com/llvm/llvm-project/pull/73337
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp68
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll336
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir610
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll512
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir914
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll461
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir1004
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll171
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir324
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll37
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir70
14 files changed, 4514 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 1b75607..8931952 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -36,6 +36,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -162,6 +163,9 @@ extern char &SILowerWWMCopiesID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
+extern char &AMDGPUGlobalISelDivergenceLoweringID;
+
void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
new file mode 100644
index 0000000..4cd8b1e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -0,0 +1,68 @@
+//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
+/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.
+/// Handles all cases of temporal divergence.
+/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
+/// currently depends on LCSSA to insert phis with one incoming.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
+ initializeAMDGPUGlobalISelDivergenceLoweringPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU GlobalISel divergence lowering";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+ "AMDGPU GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+ "AMDGPU GlobalISel divergence lowering", false, false)
+
+char AMDGPUGlobalISelDivergenceLowering::ID = 0;
+
+char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
+ AMDGPUGlobalISelDivergenceLowering::ID;
+
+FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
+ return new AMDGPUGlobalISelDivergenceLowering();
+}
+
+bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
+ MachineFunction &MF) {
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ef6372a..0cc048e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -375,6 +375,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
initializeSILowerWWMCopiesPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
@@ -1255,6 +1256,7 @@ bool GCNPassConfig::addLegalizeMachineIR() {
void GCNPassConfig::addPreRegBankSelect() {
bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+ addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}
bool GCNPassConfig::addRegBankSelect() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 53a33f8..2c92e7a 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -55,6 +55,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCtorDtorLowering.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
new file mode 100644
index 0000000..ccf4e84
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -0,0 +1,336 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; Divergent phis that don't require lowering using lane mask merging
+
+; - divergent phi that has divergent incoming value (this makes it divergent)
+; but is reachable through only one path - branch instruction that chooses
+; path is uniform
+
+; - divergent phi that is used only inside the loop and has incoming from
+; previous iteration. After phi-elimination (rewrite lane mask in phi def with
+; lane mask value from previous iteration), phi will hold lane mask valid for
+; current iteration which is fine since it is not used outside of the loop.
+
+; And one more that is tricky (is branch divergent or not ?)
+; "amdgpu-flat-work-group-size"="1,1" aka single lane execution does not stop
+; shader from activating multiple lanes by using some intrinsic (entering wwm
+; and using dpp instructions)
+; - there are cases with single lane execution where branch instructions are not
+; lowered to si_if (or other intrinsic branches) - with intention to use
+; uniform branch after instruction selection?
+; PhiIncomingAnalysis does not recognize G_BRCOND as divergent branch and does
+; not perform lane mask merging
+
+
+
+define amdgpu_ps void @divergent_i1_phi_uniform_branch(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond, ptr addrspace(1) %dummyaddr) {
+; GFX10-LABEL: divergent_i1_phi_uniform_branch:
+; GFX10: ; %bb.0: ; %A
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT: s_branch .LBB0_3
+; GFX10-NEXT: .LBB0_2: ; %dummy
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x7b
+; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT: global_store_dword v[3:4], v5, off
+; GFX10-NEXT: .LBB0_3: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+A:
+ %val_A = icmp uge i32 %tid, 6
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %dummy, label %exit
+
+dummy:
+ store i32 123, ptr addrspace(1) %dummyaddr
+ br label %B
+
+B:
+ %val_B = icmp ult i32 %tid, 1
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+; Fix me - there is no need to merge lane masks here
+define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond) {
+; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
+; GFX10: ; %bb.0: ; %A
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT: s_branch .LBB1_3
+; GFX10-NEXT: .LBB1_2: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT: .LBB1_3: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+A:
+ %val_A = icmp uge i32 %tid, 6
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %B, label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 1
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+
+; Divergent i1 phi that uses value from previous iteration.
+; Used only inside the loop (variable name is bool_counter)
+define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: .LBB2_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB2_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[1:2], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+ %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+ %neg_bool_counter = xor i1 %bool_counter, true
+ %fcounter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %fcounter, %val
+ %counterPlus1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, float %pre_cond_val, ptr %addr, ptr %addr_if, ptr %addr_else) {
+; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x3e8
+; GFX10-NEXT: v_mov_b32_e32 v9, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: s_branch .LBB3_2
+; GFX10-NEXT: .LBB3_1: ; %loop_body
+; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v10, v9
+; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execz .LBB3_6
+; GFX10-NEXT: .LBB3_2: ; %loop_start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9
+; GFX10-NEXT: s_mov_b32 s5, 1
+; GFX10-NEXT: s_cbranch_vccz .LBB3_4
+; GFX10-NEXT: ; %bb.3: ; %else
+; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: flat_store_dword v[6:7], v8
+; GFX10-NEXT: .LBB3_4: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT: s_xor_b32 s5, s5, 1
+; GFX10-NEXT: s_and_b32 s5, s5, 1
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
+; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX10-NEXT: ; %bb.5: ; %if
+; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT: flat_store_dword v[4:5], v8
+; GFX10-NEXT: s_branch .LBB3_1
+; GFX10-NEXT: .LBB3_6: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[2:3], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %pre_cond = fcmp ogt float %pre_cond_val, 1.0
+ br label %loop_start
+
+loop_start:
+ %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop_body ]
+ %bool_counter = phi i1 [ %pre_cond, %entry ], [ %neg_bool_counter, %loop_body ]
+ %cond_break = icmp sgt i32 %counter, 1000
+ br i1 %cond_break, label %if, label %else
+
+if:
+ store i32 1000, ptr %addr_if
+ br label %loop_body
+
+else:
+ store i32 1000, ptr %addr_else
+ br label %loop_body
+
+loop_body:
+ %neg_bool_counter = xor i1 %bool_counter, true
+ %fcounter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %fcounter, %val
+ %counterPlus1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop_start
+
+exit:
+ %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+; There is a divergent, according to machine uniformity info, g_brcond branch
+; here, not lowered to si_if because of "amdgpu-flat-work-group-size"="1,1".
+define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
+; GFX10-LABEL: single_lane_execution_attribute:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_mov_b32 s13, -1
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10-NEXT: s_mov_b32 s2, 1
+; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: ; implicit-def: $vgpr3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
+; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
+; GFX10-NEXT: v_mov_b32_e32 v4, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: .LBB4_2: ; %.preheader
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
+; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
+; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
+; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
+; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: .LBB4_4: ; %Flow
+; GFX10-NEXT: s_and_b32 s2, s2, 1
+; GFX10-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-NEXT: s_cbranch_scc0 .LBB4_6
+; GFX10-NEXT: ; %bb.5: ; %.19
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
+; GFX10-NEXT: .LBB4_6: ; %.22
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
+; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen
+; GFX10-NEXT: s_endpgm
+.entry:
+ %.0 = call i64 @llvm.amdgcn.s.getpc()
+ %.1 = and i64 %.0, -4294967296
+ %.2 = zext i32 %.userdata0 to i64
+ %.3 = or i64 %.1, %.2
+ %.4 = inttoptr i64 %.3 to ptr addrspace(4)
+ %.5 = getelementptr i8, ptr addrspace(4) %.4, i64 16
+ %.6 = load <4 x i32>, ptr addrspace(4) %.5, align 16
+ %.7 = load <4 x i32>, ptr addrspace(4) %.4, align 16
+ %.8 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %.9 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %.8)
+ %.fr11 = freeze i32 %.9
+ %.idx = shl i32 %.fr11, 2
+ %.10 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %.7, i32 %.idx, i32 0, i32 0)
+ %.11 = icmp eq i32 %.10, 0
+ %.12 = and i32 %.fr11, 1
+ %.not = icmp eq i32 %.12, 0
+ br i1 %.not, label %.19, label %.preheader
+
+.preheader: ; preds = %.entry, %.preheader
+ %._96.02 = phi i32 [ %.15, %.preheader ], [ 0, %.entry ]
+ %._50.01 = phi i32 [ %.14, %.preheader ], [ 0, %.entry ]
+ %.idx5 = shl i32 %._96.02, 2
+ %.13 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %.7, i32 %.idx5, i32 0, i32 0)
+ %.14 = add i32 %.13, %._50.01
+ %.15 = add nuw i32 %._96.02, 1
+ %.exitcond.not = icmp eq i32 %.15, %.fr11
+ br i1 %.exitcond.not, label %.preheader._crit_edge, label %.preheader
+
+.preheader._crit_edge: ; preds = %.preheader
+ %.16 = icmp eq i32 %.14, %.10
+ %.17 = or i1 %.11, %.16
+ %.18 = zext i1 %.17 to i32
+ br label %.22
+
+.19: ; preds = %.entry
+ %.20 = zext i1 %.11 to i32
+ %.21 = or i32 %.20, 2
+ br label %.22
+
+.22: ; preds = %.19, %.preheader._crit_edge
+ %._51.0 = phi i32 [ %.18, %.preheader._crit_edge ], [ %.21, %.19 ]
+ %.WorkgroupId.i0 = extractelement <3 x i32> %.WorkgroupId, i64 0
+ %.LocalInvocationId.i0 = extractelement <3 x i32> %.LocalInvocationId, i64 0
+ %.i0 = add i32 %.LocalInvocationId.i0, %.WorkgroupId.i0
+ %.idx6 = shl i32 %.i0, 2
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %._51.0, <4 x i32> %.6, i32 %.idx6, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc()
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1,1" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
new file mode 100644
index 0000000..d314ebe3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -0,0 +1,610 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+--- |
+ define void @divergent_i1_phi_uniform_branch() {ret void}
+ define void @divergent_i1_phi_uniform_branch_simple() {ret void}
+ define void @divergent_i1_phi_used_inside_loop() {ret void}
+ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body() {ret void}
+ define void @_amdgpu_cs_main() #0 {ret void}
+
+ attributes #0 = {"amdgpu-flat-work-group-size"="1,1"}
+...
+
+---
+name: divergent_i1_phi_uniform_branch
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_uniform_branch
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ ; GFX10-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 123
+ ; GFX10-NEXT: G_STORE [[C2]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $sgpr0
+ %5:_(s32) = COPY $vgpr3
+ %6:_(s32) = COPY $vgpr4
+ %7:_(p1) = G_MERGE_VALUES %5(s32), %6(s32)
+ %8:_(s32) = G_CONSTANT i32 6
+ %9:_(s1) = G_ICMP intpred(uge), %3(s32), %8
+ %10:_(s32) = G_CONSTANT i32 0
+ %11:_(s1) = G_ICMP intpred(ne), %4(s32), %10
+ G_BRCOND %11(s1), %bb.2
+ G_BR %bb.1
+
+ bb.1:
+ successors: %bb.3(0x80000000)
+
+ %12:_(s32) = G_CONSTANT i32 123
+ G_STORE %12(s32), %7(p1) :: (store (s32), addrspace 1)
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+
+ %13:_(s1) = G_PHI %14(s1), %bb.3, %9(s1), %bb.0
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.2(0x80000000)
+
+ %15:_(s32) = G_CONSTANT i32 1
+ %14:_(s1) = G_ICMP intpred(ult), %3(s32), %15
+ G_BR %bb.2
+
+ bb.4:
+ %16:_(s32) = G_CONSTANT i32 2
+ %17:_(s32) = G_CONSTANT i32 1
+ %18:_(s32) = G_SELECT %13(s1), %17, %16
+ G_STORE %18(s32), %2(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_uniform_branch_simple
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_uniform_branch_simple
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ ; GFX10-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $sgpr0
+ %5:_(s32) = G_CONSTANT i32 6
+ %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5
+ %7:_(s32) = G_CONSTANT i32 0
+ %8:_(s1) = G_ICMP intpred(ne), %4(s32), %7
+ G_BRCOND %8(s1), %bb.2
+ G_BR %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %9:_(s32) = G_CONSTANT i32 1
+ %10:_(s1) = G_ICMP intpred(ult), %3(s32), %9
+
+ bb.2:
+ %11:_(s1) = G_PHI %6(s1), %bb.0, %10(s1), %bb.1
+ %12:_(s32) = G_CONSTANT i32 2
+ %13:_(s32) = G_CONSTANT i32 1
+ %14:_(s32) = G_SELECT %11(s1), %13, %12
+ G_STORE %14(s32), %2(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_used_inside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_used_inside_loop
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s1) = G_CONSTANT i1 true
+ %5:_(s32) = G_CONSTANT i32 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+ %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+ %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+ %12:_(s1) = G_CONSTANT i1 true
+ %11:_(s1) = G_XOR %10, %12
+ %13:_(s32) = G_UITOFP %8(s32)
+ %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+ %15:_(s32) = G_CONSTANT i32 1
+ %9:_(s32) = G_ADD %8, %15
+ %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+ SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %16:_(s1) = G_PHI %11(s1), %bb.1
+ %17:_(s32) = G_PHI %7(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+ %18:_(s32) = G_FCONSTANT float 0.000000e+00
+ %19:_(s32) = G_FCONSTANT float 1.000000e+00
+ %20:_(s32) = G_SELECT %16(s1), %19, %18
+ G_STORE %20(s32), %3(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: divergent_i1_phi_used_inside_loop_bigger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_used_inside_loop_bigger_loop_body
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI1]](s32), [[C3]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]]
+ ; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+ ; GFX10-NEXT: G_STORE [[C5]](s32), [[MV1]](p0) :: (store (s32))
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+ ; GFX10-NEXT: G_STORE [[C7]](s32), [[MV2]](p0) :: (store (s32))
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C8]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C9]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[XOR1]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C11]], [[C10]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(s32) = COPY $vgpr3
+ %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+ %5:_(s32) = COPY $vgpr4
+ %6:_(s32) = COPY $vgpr5
+ %7:_(p0) = G_MERGE_VALUES %5(s32), %6(s32)
+ %8:_(s32) = COPY $vgpr6
+ %9:_(s32) = COPY $vgpr7
+ %10:_(p0) = G_MERGE_VALUES %8(s32), %9(s32)
+ %11:_(s32) = G_CONSTANT i32 0
+ %12:_(s32) = G_FCONSTANT float 1.000000e+00
+ %13:_(s1) = G_FCMP floatpred(ogt), %1(s32), %12
+
+ bb.1:
+ successors: %bb.4(0x40000000), %bb.2(0x40000000)
+
+ %14:_(s32) = G_PHI %15(s32), %bb.5, %11(s32), %bb.0
+ %16:_(s32) = G_PHI %11(s32), %bb.0, %17(s32), %bb.5
+ %18:_(s1) = G_PHI %13(s1), %bb.0, %19(s1), %bb.5
+ %20:_(s1) = G_CONSTANT i1 true
+ %21:_(s32) = G_CONSTANT i32 1000
+ %22:_(s1) = G_ICMP intpred(sle), %16(s32), %21
+ G_BRCOND %22(s1), %bb.4
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+ %23:_(s1) = G_PHI %24(s1), %bb.4, %20(s1), %bb.1
+ %25:_(s1) = G_CONSTANT i1 true
+ %26:_(s1) = G_XOR %23, %25
+ G_BRCOND %26(s1), %bb.5
+ G_BR %bb.3
+
+ bb.3:
+ successors: %bb.5(0x80000000)
+
+ %27:_(s32) = G_CONSTANT i32 1000
+ G_STORE %27(s32), %7(p0) :: (store (s32))
+ G_BR %bb.5
+
+ bb.4:
+ successors: %bb.2(0x80000000)
+
+ %24:_(s1) = G_CONSTANT i1 false
+ %28:_(s32) = G_CONSTANT i32 1000
+ G_STORE %28(s32), %10(p0) :: (store (s32))
+ G_BR %bb.2
+
+ bb.5:
+ successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+ %29:_(s1) = G_CONSTANT i1 true
+ %19:_(s1) = G_XOR %18, %29
+ %30:_(s32) = G_UITOFP %16(s32)
+ %31:_(s1) = G_FCMP floatpred(ogt), %30(s32), %0
+ %32:_(s32) = G_CONSTANT i32 1
+ %17:_(s32) = G_ADD %16, %32
+ %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %31(s1), %14(s32)
+ SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.6:
+ %33:_(s1) = G_PHI %19(s1), %bb.5
+ %34:_(s32) = G_PHI %15(s32), %bb.5
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+ %35:_(s32) = G_FCONSTANT float 0.000000e+00
+ %36:_(s32) = G_FCONSTANT float 1.000000e+00
+ %37:_(s32) = G_SELECT %33(s1), %36, %35
+ G_STORE %37(s32), %4(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: _amdgpu_cs_main
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: _amdgpu_cs_main
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.getpc)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
+ ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[INT]], [[C]]
+ ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+ ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[ZEXT]]
+ ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[INTTOPTR]](p4) :: (load (<8 x s32>))
+ ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[LOAD]](<8 x s32>)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s128) = G_TRUNC [[BITCAST]](s256)
+ ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[TRUNC]](s128)
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.lo), [[C2]](s32), [[C1]](s32)
+ ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.hi), [[C2]](s32), [[INT1]](s32)
+ ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[INT2]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[FREEZE]], [[C3]](s32)
+ ; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[BITCAST1]](<4 x s32>), [[C1]](s32), [[SHL]], [[C1]], 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_BUFFER_LOAD]](s32), [[C1]]
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FREEZE]], [[C4]]
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
+ ; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.2
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
+ ; GFX10-NEXT: G_BRCOND [[PHI1]](s1), %bb.5
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %34(s32), %bb.3, [[C6]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %36(s32), %bb.3, [[FREEZE]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %38(s32), %bb.3, [[C6]](s32), %bb.1
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[BITCAST1]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
+ ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; GFX10-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD1]](s32), [[C7]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.3
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.3
+ ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
+ ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
+ ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
+ ; GFX10-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
+ ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ADD3]], [[C12]](s32)
+ ; GFX10-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: G_AMDGPU_BUFFER_STORE [[PHI6]](s32), [[UV1]](<4 x s32>), [[C13]](s32), [[SHL1]], [[C13]], 0, 0, 0 :: (store (s32), align 1, addrspace 8)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = COPY $vgpr0
+ %3:_(s32) = G_IMPLICIT_DEF
+ %4:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.getpc)
+ %5:_(s64) = G_CONSTANT i64 -4294967296
+ %6:_(s64) = G_AND %4, %5
+ %7:_(s64) = G_ZEXT %0(s32)
+ %8:_(s64) = G_OR %6, %7
+ %9:_(p4) = G_INTTOPTR %8(s64)
+ %10:_(<8 x s32>) = G_LOAD %9(p4) :: (load (<8 x s32>))
+ %11:_(s256) = G_BITCAST %10(<8 x s32>)
+ %12:_(s128) = G_TRUNC %11(s256)
+ %13:_(<4 x s32>) = G_BITCAST %12(s128)
+ %15:_(s32) = G_CONSTANT i32 0
+ %14:_(s32) = G_CONSTANT i32 -1
+ %16:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.lo), %14(s32), %15(s32)
+ %17:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.hi), %14(s32), %16(s32)
+ %18:_(s32) = G_FREEZE %17
+ %19:_(s32) = G_CONSTANT i32 2
+ %20:_(s32) = G_SHL %18, %19(s32)
+ %21:_(s32) = G_AMDGPU_BUFFER_LOAD %13(<4 x s32>), %15(s32), %20, %15, 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+ %22:_(s1) = G_ICMP intpred(eq), %21(s32), %15
+ %23:_(s32) = G_CONSTANT i32 1
+ %24:_(s32) = G_AND %18, %23
+ %25:_(s1) = G_TRUNC %24(s32)
+ %26:_(s1) = G_CONSTANT i1 true
+ %27:_(s1) = G_XOR %25, %26
+ G_BRCOND %27(s1), %bb.2
+ G_BR %bb.1
+
+ bb.1:
+ successors: %bb.3(0x80000000)
+
+ %28:_(s32) = G_CONSTANT i32 0
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+ %29:_(s32) = G_PHI %30(s32), %bb.4, %3(s32), %bb.0
+ %31:_(s1) = G_PHI %32(s1), %bb.4, %26(s1), %bb.0
+ G_BRCOND %31(s1), %bb.5
+ G_BR %bb.6
+
+ bb.3:
+ successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+ %33:_(s32) = G_PHI %34(s32), %bb.3, %28(s32), %bb.1
+ %35:_(s32) = G_PHI %36(s32), %bb.3, %18(s32), %bb.1
+ %37:_(s32) = G_PHI %38(s32), %bb.3, %28(s32), %bb.1
+ %39:_(s32) = G_CONSTANT i32 0
+ %40:_(s32) = G_AMDGPU_BUFFER_LOAD %13(<4 x s32>), %39(s32), %33, %39, 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+ %38:_(s32) = G_ADD %40, %37
+ %41:_(s32) = G_CONSTANT i32 -1
+ %36:_(s32) = G_ADD %35, %41
+ %42:_(s32) = G_CONSTANT i32 4
+ %34:_(s32) = G_ADD %33, %42
+ %43:_(s1) = G_ICMP intpred(ne), %36(s32), %39
+ G_BRCOND %43(s1), %bb.3
+ G_BR %bb.4
+
+ bb.4:
+ successors: %bb.2(0x80000000)
+
+ %44:_(s32) = G_PHI %38(s32), %bb.3
+ %32:_(s1) = G_CONSTANT i1 false
+ %45:_(s1) = G_ICMP intpred(eq), %44(s32), %21
+ %46:_(s1) = G_OR %22, %45
+ %30:_(s32) = G_ZEXT %46(s1)
+ G_BR %bb.2
+
+ bb.5:
+ successors: %bb.6(0x80000000)
+
+ %47:_(s32) = G_ZEXT %22(s1)
+ %48:_(s32) = G_CONSTANT i32 2
+ %49:_(s32) = G_OR %47, %48
+
+ bb.6:
+ %50:_(s32) = G_PHI %29(s32), %bb.2, %49(s32), %bb.5
+ %51:_(<4 x s32>), %52:_(<4 x s32>) = G_UNMERGE_VALUES %10(<8 x s32>)
+ %53:_(s32) = G_ADD %2, %1
+ %54:_(s32) = G_CONSTANT i32 2
+ %55:_(s32) = G_SHL %53, %54(s32)
+ %56:_(s32) = G_CONSTANT i32 0
+ G_AMDGPU_BUFFER_STORE %50(s32), %52(<4 x s32>), %56(s32), %55, %56, 0, 0, 0 :: (store (s32), align 1, addrspace 8)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
new file mode 100644
index 0000000..34dedfe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -0,0 +1,512 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; This file contains various tests that have divergent i1s used outside of
+; the loop. These are lane masks is sgpr and need to have correct value in
+; corresponding bit at the iteration lane exits the loop.
+; Achieved by merging lane mask with same lane mask from previous iteration
+; and using that merged lane mask outside of the loop.
+
+; Phi used outside of the loop directly (loopfinder will figure out that it
+; needs to merge lane mask across all iterations)
+define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT: .LBB0_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
+; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB0_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[2:3], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %pre.cond = fcmp ogt float %pre.cond.val, 1.0
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+ %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
+ %neg.bool.counter = xor i1 %bool.counter, true
+ %f.counter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %f.counter, %val
+ %counter.plus.1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %select = select i1 %bool.counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, -1
+; GFX10-NEXT: v_mov_b32_e32 v5, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: s_branch .LBB1_2
+; GFX10-NEXT: .LBB1_1: ; %loop.cond
+; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
+; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT: s_cbranch_vccz .LBB1_4
+; GFX10-NEXT: .LBB1_2: ; %loop.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v5
+; GFX10-NEXT: s_mov_b32 s6, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, s5
+; GFX10-NEXT: s_cbranch_execz .LBB1_1
+; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
+; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GFX10-NEXT: global_load_dword v5, v[1:2], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v5
+; GFX10-NEXT: s_branch .LBB1_1
+; GFX10-NEXT: .LBB1_4: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5
+; GFX10-NEXT: flat_store_dword v[3:4], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop.start
+
+loop.start:
+ %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+ %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero, %loop.cond ]
+ br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
+
+is.eq.zero:
+ %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+ %elt.i = load i32, ptr addrspace(1) %a.plus.i
+ %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+ br label %loop.cond
+
+loop.cond:
+ %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
+ %cond = icmp slt i32 %i, 10
+ %i.plus.1 = add i32 %i, 1
+ br i1 %cond, label %exit, label %loop.start
+
+exit:
+ %select = select i1 %all.eq.zero, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+; Non-phi used outside of the loop
+
+define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: .LBB2_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB2_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[2:3], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %pre.cond = fcmp ogt float %pre.cond.val, 1.0
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+ %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
+ %neg.bool.counter = xor i1 %bool.counter, true
+ %f.counter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %f.counter, %val
+ %counter.plus.1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %select = select i1 %neg.bool.counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+;void xor(int num_elts, int* a, int* addr) {
+;for(int i=0; i<num_elts; ++i) {
+; if(a[i]==0)
+; return;
+;}
+;addr[0] = 5
+;return;
+;}
+
+define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB3_6
+; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: s_branch .LBB3_3
+; GFX10-NEXT: .LBB3_2: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_xor_b32 s7, s7, 1
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s5, s6, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execz .LBB3_5
+; GFX10-NEXT: .LBB3_3: ; %loop.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
+; GFX10-NEXT: global_load_dword v6, v[6:7], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB3_2
+; GFX10-NEXT: ; %bb.4: ; %loop.cond
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
+; GFX10-NEXT: v_cmp_lt_i32_e64 s6, v5, v0
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: s_branch .LBB3_2
+; GFX10-NEXT: .LBB3_5: ; %loop.exit.guard
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_and_b32 s5, 1, s7
+; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s5
+; GFX10-NEXT: .LBB3_6: ; %Flow1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s6
+; GFX10-NEXT: s_cbranch_execz .LBB3_8
+; GFX10-NEXT: ; %bb.7: ; %block.after.loop
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: flat_store_dword v[3:4], v0
+; GFX10-NEXT: .LBB3_8: ; %exit
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %start.cond = icmp eq i32 %num.elts, 0
+ br i1 %start.cond, label %loop.start, label %block.after.loop
+
+loop.start:
+ %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+ %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+ %elt.i = load i32, ptr addrspace(1) %a.plus.i
+ %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+ br i1 %elt.i.eq.zero, label %exit, label %loop.cond
+
+loop.cond:
+ %cond = icmp slt i32 %i, %num.elts
+ %i.plus.1 = add i32 %i, 1
+ br i1 %cond, label %block.after.loop, label %loop.start
+
+block.after.loop:
+ store i32 5, ptr %addr
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+;void icmp(int num_elts, int* a, int* addr) {
+;for(;;) {
+; if(a[i]==0)
+; return;
+;}
+;addr[0] = 5
+;return;
+;}
+
+define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr %addr) {
+; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: s_branch .LBB4_2
+; GFX10-NEXT: .LBB4_1: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; GFX10-NEXT: s_and_b32 s4, 1, s6
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT: s_or_b32 s5, s4, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execz .LBB4_6
+; GFX10-NEXT: .LBB4_2: ; %cond.block.0
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_4
+; GFX10-NEXT: ; %bb.3: ; %if.block.0
+; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
+; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
+; GFX10-NEXT: global_store_dword v[8:9], v4, off
+; GFX10-NEXT: .LBB4_4: ; %loop.break.block
+; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
+; GFX10-NEXT: s_mov_b32 s6, 1
+; GFX10-NEXT: ; implicit-def: $vgpr5
+; GFX10-NEXT: s_and_saveexec_b32 s7, s4
+; GFX10-NEXT: s_cbranch_execz .LBB4_1
+; GFX10-NEXT: ; %bb.5: ; %loop.cond
+; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
+; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_branch .LBB4_1
+; GFX10-NEXT: .LBB4_6: ; %cond.block.1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_8
+; GFX10-NEXT: ; %bb.7: ; %if.block.1
+; GFX10-NEXT: global_store_dword v[6:7], v4, off
+; GFX10-NEXT: .LBB4_8: ; %exit
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop.start
+
+loop.start:
+ %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+ br label %cond.block.0
+
+cond.block.0:
+ %cond.0 = icmp eq i32 %v0, %i
+ br i1 %cond.0, label %if.block.0, label %loop.break.block
+
+if.block.0:
+ %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+ store i32 %i, ptr addrspace(1) %a.plus.i
+ br label %loop.break.block
+
+loop.break.block:
+ %cond.1 = icmp eq i32 %v1, %i
+ br i1 %cond.1, label %cond.block.1, label %loop.cond
+
+loop.cond:
+ ; no cond, infinite loop with one break
+ %i.plus.1 = add i32 %i, 1
+ br label %loop.start
+
+cond.block.1:
+ %cond.2 = icmp eq i32 %v0, %i
+ br i1 %cond.2, label %if.block.1, label %exit
+
+if.block.1:
+ store i32 %i, ptr addrspace(1) %c
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+; bool all_eq_zero = true;
+; i32 i = 0;
+; do {
+; if(all_eq_zero)
+; all_eq_zero = (a[i] == 0);
+;
+; i += 1;
+; } while ( i < n )
+
+; *addr = all_eq_zero ? 1.0 : 0.0;
+
+; check that all elements in an array of size n are zero, loop has divergent
+; exit condition based on array size, but zero check does not break out of the
+; loop but instead skips zero check in remaining iterations
+; llpc "freezes" zero check since it is (via phi) used in a conditional branch
+define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, 1
+; GFX10-NEXT: v_mov_b32_e32 v5, s0
+; GFX10-NEXT: s_branch .LBB5_2
+; GFX10-NEXT: .LBB5_1: ; %loop.cond
+; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v5
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1
+; GFX10-NEXT: v_mov_b32_e32 v5, v7
+; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB5_4
+; GFX10-NEXT: .LBB5_2: ; %loop.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6
+; GFX10-NEXT: s_and_saveexec_b32 s2, s1
+; GFX10-NEXT: s_cbranch_execz .LBB5_1
+; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
+; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
+; GFX10-NEXT: global_load_dword v6, v[6:7], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v6
+; GFX10-NEXT: s_branch .LBB5_1
+; GFX10-NEXT: .LBB5_4: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX10-NEXT: flat_store_dword v[3:4], v0
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %loop.start
+
+loop.start:
+ %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+ %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero.fr, %loop.cond ]
+ br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
+
+is.eq.zero:
+ %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+ %elt.i = load i32, ptr addrspace(1) %a.plus.i
+ %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+ br label %loop.cond
+
+loop.cond:
+ %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
+ %eq.zero.fr = freeze i1 %eq.zero
+ %cond = icmp slt i32 %i, %n
+ %i.plus.1 = add i32 %i, 1
+ br i1 %cond, label %exit, label %loop.start
+
+exit:
+ %select = select i1 %eq.zero.fr, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+; Divergent i1 phi from structurize-cfg used outside of the loop
+define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, s0
+; GFX10-NEXT: s_branch .LBB6_2
+; GFX10-NEXT: .LBB6_1: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_and_b32 s1, 1, s3
+; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB6_4
+; GFX10-NEXT: .LBB6_2: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 1
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT: global_load_dword v9, v[9:10], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB6_1
+; GFX10-NEXT: ; %bb.3: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
+; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: global_load_dword v9, v[7:8], off
+; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT: global_store_dword v[7:8], v9, off
+; GFX10-NEXT: s_branch .LBB6_1
+; GFX10-NEXT: .LBB6_4: ; %loop.exit.guard
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_and_saveexec_b32 s0, s1
+; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB6_6
+; GFX10-NEXT: ; %bb.5: ; %break.body
+; GFX10-NEXT: v_mov_b32_e32 v0, 10
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: .LBB6_6: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+ store i32 10, ptr addrspace(1) %a.break
+ br label %exit
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, 100
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
new file mode 100644
index 0000000..9246371
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -0,0 +1,914 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: divergent_i1_phi_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_used_outside_loop
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(s32) = COPY $vgpr3
+ %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+ %5:_(s32) = G_CONSTANT i32 0
+ %6:_(s32) = G_FCONSTANT float 1.000000e+00
+ %7:_(s1) = G_FCMP floatpred(ogt), %1(s32), %6
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %8:_(s32) = G_PHI %9(s32), %bb.1, %5(s32), %bb.0
+ %10:_(s32) = G_PHI %5(s32), %bb.0, %11(s32), %bb.1
+ %12:_(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.1
+ %14:_(s1) = G_CONSTANT i1 true
+ %13:_(s1) = G_XOR %12, %14
+ %15:_(s32) = G_UITOFP %10(s32)
+ %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
+ %17:_(s32) = G_CONSTANT i32 1
+ %11:_(s32) = G_ADD %10, %17
+ %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+ SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %18:_(s1) = G_PHI %12(s1), %bb.1
+ %19:_(s32) = G_PHI %9(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+ %20:_(s32) = G_FCONSTANT float 0.000000e+00
+ %21:_(s32) = G_FCONSTANT float 1.000000e+00
+ %22:_(s32) = G_SELECT %18(s1), %21, %20
+ G_STORE %22(s32), %4(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: divergent_i1_phi_used_outside_loop_larger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_used_outside_loop_larger_loop_body
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %13(s1), %bb.3
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI1]], [[C3]](s64)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]]
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[ADD]](s32), [[C5]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.1
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.3
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C7]], [[C6]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ %0:_(s32) = COPY $vgpr1
+ %1:_(s32) = COPY $vgpr2
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr3
+ %4:_(s32) = COPY $vgpr4
+ %5:_(p0) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = G_CONSTANT i32 -1
+ %7:_(s1) = G_CONSTANT i1 true
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %8:_(s32) = G_PHI %6(s32), %bb.0, %9(s32), %bb.3
+ %10:_(p1) = G_PHI %2(p1), %bb.0, %11(p1), %bb.3
+ %12:sreg_32_xm0_xexec(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.3
+ %14:sreg_32_xm0_xexec(s32) = SI_IF %12(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+
+ %15:_(s32) = G_LOAD %10(p1) :: (load (s32), addrspace 1)
+ %16:_(s32) = G_CONSTANT i32 0
+ %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+
+ bb.3:
+ successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+ %13:_(s1) = G_PHI %17(s1), %bb.2, %12(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+ %18:_(s64) = G_CONSTANT i64 4
+ %11:_(p1) = G_PTR_ADD %10, %18(s64)
+ %19:_(s32) = G_CONSTANT i32 1
+ %9:_(s32) = nsw G_ADD %8, %19
+ %20:_(s32) = G_CONSTANT i32 10
+ %21:_(s1) = G_ICMP intpred(sge), %9(s32), %20
+ G_BRCOND %21(s1), %bb.1
+ G_BR %bb.4
+
+ bb.4:
+ %22:_(s1) = G_PHI %12(s1), %bb.3
+ %23:_(s32) = G_FCONSTANT float 0.000000e+00
+ %24:_(s32) = G_FCONSTANT float 1.000000e+00
+ %25:_(s32) = G_SELECT %22(s1), %24, %23
+ G_STORE %25(s32), %5(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: divergent_i1_xor_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_xor_used_outside_loop
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(s32) = COPY $vgpr3
+ %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+ %5:_(s32) = G_CONSTANT i32 0
+ %6:_(s32) = G_FCONSTANT float 1.000000e+00
+ %7:_(s1) = G_FCMP floatpred(ogt), %1(s32), %6
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %8:_(s32) = G_PHI %9(s32), %bb.1, %5(s32), %bb.0
+ %10:_(s32) = G_PHI %5(s32), %bb.0, %11(s32), %bb.1
+ %12:_(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.1
+ %14:_(s1) = G_CONSTANT i1 true
+ %13:_(s1) = G_XOR %12, %14
+ %15:_(s32) = G_UITOFP %10(s32)
+ %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
+ %17:_(s32) = G_CONSTANT i32 1
+ %11:_(s32) = G_ADD %10, %17
+ %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+ SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %18:_(s1) = G_PHI %13(s1), %bb.1
+ %19:_(s32) = G_PHI %9(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+ %20:_(s32) = G_FCONSTANT float 0.000000e+00
+ %21:_(s32) = G_FCONSTANT float 1.000000e+00
+ %22:_(s32) = G_SELECT %18(s1), %21, %20
+ G_STORE %22(s32), %4(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: divergent_i1_xor_used_outside_loop_larger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_xor_used_outside_loop_larger_loop_body
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, [[C1]](s1), %bb.0
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
+ ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.7(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C7]]
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI2]](s32), [[COPY]]
+ ; GFX10-NEXT: G_BR %bb.7
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; GFX10-NEXT: G_STORE [[C8]](s32), [[MV1]](p0) :: (store (s32))
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: SI_RETURN
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.7:
+ ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[C6]](s1), %bb.4, [[C3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C3]](s1), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+ ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C9]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI5]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.8:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.7
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+ ; GFX10-NEXT: G_BR %bb.2
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s32) = COPY $vgpr3
+ %5:_(s32) = COPY $vgpr4
+ %6:_(p0) = G_MERGE_VALUES %4(s32), %5(s32)
+ %7:_(s32) = G_IMPLICIT_DEF
+ %8:_(s32) = G_CONSTANT i32 0
+ %9:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %8
+ %10:_(s1) = G_CONSTANT i1 true
+ %11:sreg_32_xm0_xexec(s32) = SI_IF %9(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.1
+
+ bb.1:
+ successors: %bb.3(0x80000000)
+
+ %12:_(s32) = G_CONSTANT i32 0
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+ %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+ %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.5
+
+ bb.3:
+ successors: %bb.4(0x40000000), %bb.7(0x40000000)
+
+ %16:_(s32) = G_PHI %12(s32), %bb.1, %17(s32), %bb.7
+ %18:_(s32) = G_PHI %19(s32), %bb.7, %12(s32), %bb.1
+ %20:_(s1) = G_CONSTANT i1 true
+ %21:_(s64) = G_SEXT %18(s32)
+ %22:_(s32) = G_CONSTANT i32 2
+ %23:_(s64) = G_SHL %21, %22(s32)
+ %24:_(p1) = G_PTR_ADD %3, %23(s64)
+ %25:_(s32) = G_LOAD %24(p1) :: (load (s32), addrspace 1)
+ %26:_(s32) = G_CONSTANT i32 0
+ %27:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %25(s32), %26
+ %28:sreg_32_xm0_xexec(s32) = SI_IF %27(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.4:
+ successors: %bb.7(0x80000000)
+
+ %29:_(s1) = G_CONSTANT i1 false
+ %30:_(s32) = G_CONSTANT i32 1
+ %31:_(s32) = G_ADD %18, %30
+ %32:_(s1) = G_ICMP intpred(slt), %18(s32), %0
+ G_BR %bb.7
+
+ bb.5:
+ successors: %bb.6(0x80000000)
+
+ %33:_(s32) = G_CONSTANT i32 5
+ G_STORE %33(s32), %6(p0) :: (store (s32))
+
+ bb.6:
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+ SI_RETURN
+
+ bb.7:
+ successors: %bb.8(0x04000000), %bb.3(0x7c000000)
+
+ %19:_(s32) = G_PHI %31(s32), %bb.4, %7(s32), %bb.3
+ %34:_(s1) = G_PHI %29(s1), %bb.4, %20(s1), %bb.3
+ %35:_(s1) = G_PHI %32(s1), %bb.4, %20(s1), %bb.3
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32)
+ %36:_(s1) = G_CONSTANT i1 true
+ %37:_(s1) = G_XOR %34, %36
+ %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32)
+ SI_LOOP %17(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.8
+
+ bb.8:
+ successors: %bb.2(0x80000000)
+
+ %14:_(s1) = G_PHI %37(s1), %bb.7
+ %38:_(s32) = G_PHI %17(s32), %bb.7
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+ G_BR %bb.2
+...
+
+---
+name: divergent_i1_icmp_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_icmp_used_outside_loop
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+ ; GFX10-NEXT: G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C3]](s1), %bb.5, [[C2]](s1), %bb.4
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.7
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.7:
+ ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.6
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.6
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.8:
+ ; GFX10-NEXT: successors: %bb.9(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: G_STORE [[PHI6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.9:
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(s32) = COPY $vgpr3
+ %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32)
+ %5:_(s32) = COPY $vgpr6
+ %6:_(s32) = COPY $vgpr7
+ %7:_(p1) = G_MERGE_VALUES %5(s32), %6(s32)
+ %8:_(s32) = G_CONSTANT i32 0
+ %9:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %10:_(s32) = G_PHI %11(s32), %bb.6, %8(s32), %bb.0
+ %12:_(s32) = G_PHI %8(s32), %bb.0, %13(s32), %bb.6
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+ %14:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %12
+ %15:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.3
+
+ bb.3:
+ successors: %bb.4(0x80000000)
+
+ %16:_(s64) = G_SEXT %12(s32)
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s64) = G_SHL %16, %17(s32)
+ %19:_(p1) = G_PTR_ADD %4, %18(s64)
+ G_STORE %12(s32), %19(p1) :: (store (s32), addrspace 1)
+
+ bb.4:
+ successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+ %20:_(s1) = G_CONSTANT i1 true
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+ %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12
+ %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.5
+
+ bb.5:
+ successors: %bb.6(0x80000000)
+
+ %23:_(s1) = G_CONSTANT i1 false
+ %24:_(s32) = G_CONSTANT i32 1
+ %25:_(s32) = G_ADD %12, %24
+
+ bb.6:
+ successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+
+ %13:_(s32) = G_PHI %25(s32), %bb.5, %9(s32), %bb.4
+ %26:_(s1) = G_PHI %23(s1), %bb.5, %20(s1), %bb.4
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32)
+ %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32)
+ SI_LOOP %11(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.7
+
+ bb.7:
+ successors: %bb.8(0x40000000), %bb.9(0x40000000)
+
+ %27:_(s32) = G_PHI %11(s32), %bb.6
+ %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6
+ %29:_(s32) = G_PHI %12(s32), %bb.6
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+ %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.8
+
+ bb.8:
+ successors: %bb.9(0x80000000)
+
+ G_STORE %29(s32), %7(p1) :: (store (s32), addrspace 1)
+
+ bb.9:
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+ SI_RETURN
+...
+
+---
+name: divergent_i1_freeze_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_freeze_used_outside_loop
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %14(s1), %bb.3
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[PHI3]]
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI1]](s32), [[COPY]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[FREEZE]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C6]], [[C5]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s32) = COPY $vgpr3
+ %5:_(s32) = COPY $vgpr4
+ %6:_(p0) = G_MERGE_VALUES %4(s32), %5(s32)
+ %7:_(s32) = G_CONSTANT i32 0
+ %8:_(s1) = G_CONSTANT i1 true
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %9:_(s32) = G_PHI %10(s32), %bb.3, %7(s32), %bb.0
+ %11:_(s32) = G_PHI %7(s32), %bb.0, %12(s32), %bb.3
+ %13:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %14(s1), %bb.3
+ %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+
+ %16:_(s64) = G_SEXT %11(s32)
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s64) = G_SHL %16, %17(s32)
+ %19:_(p1) = G_PTR_ADD %3, %18(s64)
+ %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+ %21:_(s32) = G_CONSTANT i32 0
+ %22:_(s1) = G_ICMP intpred(eq), %20(s32), %21
+
+ bb.3:
+ successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+ %23:_(s1) = G_PHI %22(s1), %bb.2, %13(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+ %14:_(s1) = G_FREEZE %23
+ %24:_(s32) = G_CONSTANT i32 1
+ %12:_(s32) = G_ADD %11, %24
+ %25:_(s1) = G_ICMP intpred(slt), %11(s32), %0
+ %10:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %25(s1), %9(s32)
+ SI_LOOP %10(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.4:
+ %26:_(s1) = G_PHI %14(s1), %bb.3
+ %27:_(s32) = G_PHI %10(s32), %bb.3
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+ %28:_(s32) = G_FCONSTANT float 0.000000e+00
+ %29:_(s32) = G_FCONSTANT float 1.000000e+00
+ %30:_(s32) = G_SELECT %26(s1), %29, %28
+ G_STORE %30(s32), %6(p0) :: (store (s32))
+ S_ENDPGM 0
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_1break
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+ ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = COPY $vgpr4
+ %7:_(s32) = COPY $vgpr5
+ %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+ %9:_(s32) = G_CONSTANT i32 0
+ %10:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+ %11:_(s32) = G_PHI %12(s32), %bb.5, %9(s32), %bb.0
+ %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.5
+ %15:_(s1) = G_CONSTANT i1 true
+ %16:_(s64) = G_SEXT %13(s32)
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s64) = G_SHL %16, %17(s32)
+ %19:_(p1) = G_PTR_ADD %5, %18(s64)
+ %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+ %21:_(s32) = G_CONSTANT i32 0
+ %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+ %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+
+ %24:_(s32) = G_CONSTANT i32 10
+ G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1)
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.5(0x80000000)
+
+ %25:_(s1) = G_CONSTANT i1 false
+ %26:_(s32) = G_CONSTANT i32 2
+ %27:_(s64) = G_SHL %16, %26(s32)
+ %28:_(p1) = G_PTR_ADD %2, %27(s64)
+ %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+ %30:_(s32) = G_CONSTANT i32 1
+ %31:_(s32) = G_ADD %29, %30
+ G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+ %32:_(s32) = G_ADD %13, %30
+ %33:_(s32) = G_CONSTANT i32 100
+ %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33
+ G_BR %bb.5
+
+ bb.4:
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+ S_ENDPGM 0
+
+ bb.5:
+ successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+ %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
+ %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
+ %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+ %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+ SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.6:
+ successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+ %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
+ %39:_(s32) = G_PHI %12(s32), %bb.5
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+ %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
new file mode 100644
index 0000000..afd271c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -0,0 +1,461 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; Simples case, if - then, that requires lane mask merging,
+; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
+; will overwrite its own lane bit in lane mask with val_B
+define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: divergent_i1_phi_if_then:
+; GFX10: ; %bb.0: ; %A
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+A:
+ %val_A = icmp uge i32 %tid, 6
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %B, label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 1
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+; if - else
+define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: divergent_i1_phi_if_else:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT: ; %bb.1: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT: ; %bb.3: ; %A
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2
+; GFX10-NEXT: ; %bb.4: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+entry:
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+; if - break;
+
+; counter = 0;
+; do {
+; if (a[counter] == 0)
+; break;
+; if (b[counter] == 0)
+; break;
+; if (c[counter] == 0)
+; break;
+; x[counter++]+=1;
+; } while (counter<100);
+
+; Tests with multiple break conditions. Divergent phis will be used to track
+; if any of the break conditions was reached. We only need to do simple lane
+; mask merging (for current loop iteration only). There is an intrinsic,
+; if_break, that will merge lane masks across all iterations of the loop.
+
+define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s0
+; GFX10-NEXT: s_branch .LBB2_2
+; GFX10-NEXT: .LBB2_1: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB2_4
+; GFX10-NEXT: .LBB2_2: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5]
+; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT: global_load_dword v7, v[7:8], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB2_1
+; GFX10-NEXT: ; %bb.3: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4
+; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v4
+; GFX10-NEXT: global_load_dword v7, v[5:6], off
+; GFX10-NEXT: v_mov_b32_e32 v4, v8
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7
+; GFX10-NEXT: global_store_dword v[5:6], v7, off
+; GFX10-NEXT: s_branch .LBB2_1
+; GFX10-NEXT: .LBB2_4: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %exit, label %loop.body
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, 100
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
+define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; GFX10-LABEL: loop_with_2breaks:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, s0
+; GFX10-NEXT: s_branch .LBB3_3
+; GFX10-NEXT: .LBB3_1: ; %Flow3
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB3_2: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB3_6
+; GFX10-NEXT: .LBB3_3: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT: global_load_dword v9, v[9:10], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB3_2
+; GFX10-NEXT: ; %bb.4: ; %B
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX10-NEXT: global_load_dword v9, v[9:10], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB3_1
+; GFX10-NEXT: ; %bb.5: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT: global_load_dword v9, v[7:8], off
+; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT: global_store_dword v[7:8], v9, off
+; GFX10-NEXT: s_branch .LBB3_1
+; GFX10-NEXT: .LBB3_6: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %exit, label %B
+
+B:
+ %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
+ %b.val = load i32, ptr addrspace(1) %b.plus.counter
+ %b.cond = icmp eq i32 %b.val, 0
+ br i1 %b.cond, label %exit, label %loop.body
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, 100
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
+define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
+; GFX10-LABEL: loop_with_3breaks:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, s0
+; GFX10-NEXT: s_branch .LBB4_4
+; GFX10-NEXT: .LBB4_1: ; %Flow5
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: .LBB4_2: ; %Flow4
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB4_3: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB4_8
+; GFX10-NEXT: .LBB4_4: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9]
+; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
+; GFX10-NEXT: global_load_dword v11, v[11:12], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_3
+; GFX10-NEXT: ; %bb.5: ; %B
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
+; GFX10-NEXT: global_load_dword v11, v[11:12], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_2
+; GFX10-NEXT: ; %bb.6: ; %C
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT: global_load_dword v11, v[11:12], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_1
+; GFX10-NEXT: ; %bb.7: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8
+; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v8
+; GFX10-NEXT: global_load_dword v11, v[9:10], off
+; GFX10-NEXT: v_mov_b32_e32 v8, v12
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11
+; GFX10-NEXT: global_store_dword v[9:10], v11, off
+; GFX10-NEXT: s_branch .LBB4_1
+; GFX10-NEXT: .LBB4_8: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %exit, label %B
+
+B:
+ %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
+ %b.val = load i32, ptr addrspace(1) %b.plus.counter
+ %b.cond = icmp eq i32 %b.val, 0
+ br i1 %b.cond, label %exit, label %C
+
+C:
+ %c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter
+ %c.val = load i32, ptr addrspace(1) %c.plus.counter
+ %c.cond = icmp eq i32 %c.val, 0
+ br i1 %c.cond, label %exit, label %loop.body
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, 100
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
+; Divergent condition if with body, ending with break. This is loop with two
+; exits but structurizer will create phi that will track exit from break
+; and move break.body after the loop. Loop will then have one exit and phi
+; used outside of the loop by condition used to enter the break.body.
+define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
+; GFX10-LABEL: loop_with_div_break_with_body:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, s0
+; GFX10-NEXT: s_branch .LBB5_2
+; GFX10-NEXT: .LBB5_1: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_and_b32 s1, 1, s3
+; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB5_4
+; GFX10-NEXT: .LBB5_2: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 1
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT: global_load_dword v9, v[9:10], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB5_1
+; GFX10-NEXT: ; %bb.3: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: global_load_dword v9, v[7:8], off
+; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT: global_store_dword v[7:8], v9, off
+; GFX10-NEXT: s_branch .LBB5_1
+; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_and_saveexec_b32 s0, s1
+; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB5_6
+; GFX10-NEXT: ; %bb.5: ; %break.body
+; GFX10-NEXT: v_mov_b32_e32 v0, 10
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: .LBB5_6: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+ store i32 10, ptr addrspace(1) %a.break
+ br label %exit
+
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, 100
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
+; Snippet from test generated by the GraphicsFuzz tool, frontend generates ir
+; with irreducible control flow graph. FixIrreducible converts it into natural
+; loop and in the process creates i1 phi with three incoming values.
+
+; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
+; do {
+; if (y < a2) {
+; do {
+; } while (x < a2);
+; }
+; if (x < a3) {
+; return a1;
+; }
+; } while (y < a2);
+; return a0;
+; }
+
+; This test is also interesting because it has phi with three incomings
+;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+;.entry:
+; %.y_lt_a2 = icmp sgt i32 %a2, %y
+; %.x_lt_a2 = icmp sgt i32 %a2, %x
+; %.x_lt_a3 = icmp sgt i32 %a3, %x
+; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
+;
+;.preheader: ; if (y < a2),
+; br label %.inner_loop
+;
+;.inner_loop: ; do while x < a2
+; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
+;
+;.loopexit: ; if x < a3
+; %not.inner_loop = xor i1 %.y_lt_a2, true
+; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
+; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0 ; select retrun value a1 'x < a3' or a0 'loop ends'
+; br i1 %brmerge, label %.exit, label %.preheader
+;
+;.exit:
+; ret i32 %.ret
+;}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
new file mode 100644
index 0000000..9461d55
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -0,0 +1,1004 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: divergent_i1_phi_if_then
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_if_then
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(s32) = G_CONSTANT i32 6
+ %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5
+ %7:_(s32) = G_CONSTANT i32 0
+ %8:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %4(s32), %7
+ %9:sreg_32_xm0_xexec(s32) = SI_IF %8(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %10:_(s32) = G_CONSTANT i32 1
+ %11:_(s1) = G_ICMP intpred(ult), %3(s32), %10
+
+ bb.2:
+ %12:_(s1) = G_PHI %6(s1), %bb.0, %11(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32)
+ %13:_(s32) = G_CONSTANT i32 2
+ %14:_(s32) = G_CONSTANT i32 1
+ %15:_(s32) = G_SELECT %12(s1), %14, %13
+ G_STORE %15(s32), %2(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_if_else
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: divergent_i1_phi_if_else
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %10(s1), %bb.3, [[DEF]](s1), %bb.0
+ ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: G_BR %bb.1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[PHI]](s1), %bb.1, [[ICMP1]](s1), %bb.2
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI1]](s1), [[C3]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(s1) = G_IMPLICIT_DEF
+ %6:_(s32) = G_CONSTANT i32 0
+ %7:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %4(s32), %6
+ %8:sreg_32_xm0_xexec(s32) = SI_IF %7(s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.3
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+ %9:_(s1) = G_PHI %10(s1), %bb.3, %5(s1), %bb.0
+ %11:sreg_32_xm0_xexec(s32) = SI_ELSE %8(s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+
+ %12:_(s32) = G_CONSTANT i32 1
+ %13:_(s1) = G_ICMP intpred(uge), %3(s32), %12
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.1(0x80000000)
+
+ %14:_(s32) = G_CONSTANT i32 2
+ %10:_(s1) = G_ICMP intpred(ult), %3(s32), %14
+ G_BR %bb.1
+
+ bb.4:
+ %15:_(s1) = G_PHI %9(s1), %bb.1, %13(s1), %bb.2
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+ %16:_(s32) = G_CONSTANT i32 1
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s32) = G_SELECT %15(s1), %16, %17
+ G_STORE %18(s32), %2(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_1break
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C5]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C5]]
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C6]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.2, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = G_CONSTANT i32 0
+ %7:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %8:_(s32) = G_PHI %9(s32), %bb.3, %6(s32), %bb.0
+ %10:_(s32) = G_PHI %6(s32), %bb.0, %11(s32), %bb.3
+ %12:_(s1) = G_CONSTANT i1 true
+ %13:_(s64) = G_SEXT %10(s32)
+ %14:_(s32) = G_CONSTANT i32 2
+ %15:_(s64) = G_SHL %13, %14(s32)
+ %16:_(p1) = G_PTR_ADD %5, %15(s64)
+ %17:_(s32) = G_LOAD %16(p1) :: (load (s32), addrspace 1)
+ %18:_(s32) = G_CONSTANT i32 0
+ %19:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %17(s32), %18
+ %20:sreg_32_xm0_xexec(s32) = SI_IF %19(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+
+ %21:_(s32) = G_CONSTANT i32 2
+ %22:_(s64) = G_SHL %13, %21(s32)
+ %23:_(p1) = G_PTR_ADD %2, %22(s64)
+ %24:_(s32) = G_LOAD %23(p1) :: (load (s32), addrspace 1)
+ %25:_(s32) = G_CONSTANT i32 1
+ %26:_(s32) = G_ADD %24, %25
+ G_STORE %26(s32), %23(p1) :: (store (s32), addrspace 1)
+ %27:_(s32) = G_ADD %10, %25
+ %28:_(s32) = G_CONSTANT i32 100
+ %29:_(s1) = G_ICMP intpred(ult), %10(s32), %28
+
+ bb.3:
+ successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+ %11:_(s32) = G_PHI %27(s32), %bb.2, %7(s32), %bb.1
+ %30:_(s1) = G_PHI %29(s1), %bb.2, %12(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32)
+ %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32)
+ SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.4:
+ %31:_(s32) = G_PHI %9(s32), %bb.3
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+ S_ENDPGM 0
+...
+
+---
+name: loop_with_2breaks
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_2breaks
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %34(s1), %bb.5, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C7]](s32)
+ ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64)
+ ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+ ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C9]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C4]](s1), %bb.2
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = COPY $vgpr4
+ %7:_(s32) = COPY $vgpr5
+ %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+ %9:_(s32) = G_CONSTANT i32 0
+ %10:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %11:_(s32) = G_PHI %12(s32), %bb.3, %9(s32), %bb.0
+ %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.3
+ %15:_(s1) = G_CONSTANT i1 true
+ %16:_(s64) = G_SEXT %13(s32)
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s64) = G_SHL %16, %17(s32)
+ %19:_(p1) = G_PTR_ADD %5, %18(s64)
+ %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+ %21:_(s32) = G_CONSTANT i32 0
+ %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+ %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+ %24:_(s1) = G_CONSTANT i1 true
+ %25:_(s32) = G_CONSTANT i32 2
+ %26:_(s64) = G_SHL %16, %25(s32)
+ %27:_(p1) = G_PTR_ADD %8, %26(s64)
+ %28:_(s32) = G_LOAD %27(p1) :: (load (s32), addrspace 1)
+ %29:_(s32) = G_CONSTANT i32 0
+ %30:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %28(s32), %29
+ %31:sreg_32_xm0_xexec(s32) = SI_IF %30(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+ %14:_(s32) = G_PHI %32(s32), %bb.5, %10(s32), %bb.1
+ %33:_(s1) = G_PHI %34(s1), %bb.5, %15(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+ %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32)
+ SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.4:
+ successors: %bb.5(0x80000000)
+
+ %35:_(s32) = G_CONSTANT i32 2
+ %36:_(s64) = G_SHL %16, %35(s32)
+ %37:_(p1) = G_PTR_ADD %2, %36(s64)
+ %38:_(s32) = G_LOAD %37(p1) :: (load (s32), addrspace 1)
+ %39:_(s32) = G_CONSTANT i32 1
+ %40:_(s32) = G_ADD %38, %39
+ G_STORE %40(s32), %37(p1) :: (store (s32), addrspace 1)
+ %41:_(s32) = G_ADD %13, %39
+ %42:_(s32) = G_CONSTANT i32 100
+ %43:_(s1) = G_ICMP intpred(ult), %13(s32), %42
+
+ bb.5:
+ successors: %bb.3(0x80000000)
+
+ %32:_(s32) = G_PHI %41(s32), %bb.4, %10(s32), %bb.2
+ %34:_(s1) = G_PHI %43(s1), %bb.4, %24(s1), %bb.2
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+ G_BR %bb.3
+
+ bb.6:
+ %44:_(s32) = G_PHI %12(s32), %bb.3
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32)
+ S_ENDPGM 0
+...
+
+---
+name: loop_with_3breaks
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_3breaks
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %37(s1), %bb.5, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
+ ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
+ ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD2]](s32), [[C9]]
+ ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI %47(s1), %bb.7, [[C4]](s1), %bb.2
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.7(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C10]](s32)
+ ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL3]](s64)
+ ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[C11]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD3]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C11]]
+ ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+ ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C12]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.7:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s1) = G_PHI [[ICMP3]](s1), %bb.6, [[C7]](s1), %bb.4
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.8:
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = COPY $vgpr4
+ %7:_(s32) = COPY $vgpr5
+ %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+ %9:_(s32) = COPY $vgpr6
+ %10:_(s32) = COPY $vgpr7
+ %11:_(p1) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 0
+ %13:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %14:_(s32) = G_PHI %15(s32), %bb.3, %12(s32), %bb.0
+ %16:_(s32) = G_PHI %12(s32), %bb.0, %17(s32), %bb.3
+ %18:_(s1) = G_CONSTANT i1 true
+ %19:_(s64) = G_SEXT %16(s32)
+ %20:_(s32) = G_CONSTANT i32 2
+ %21:_(s64) = G_SHL %19, %20(s32)
+ %22:_(p1) = G_PTR_ADD %5, %21(s64)
+ %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1)
+ %24:_(s32) = G_CONSTANT i32 0
+ %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24
+ %26:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+ %27:_(s1) = G_CONSTANT i1 true
+ %28:_(s32) = G_CONSTANT i32 2
+ %29:_(s64) = G_SHL %19, %28(s32)
+ %30:_(p1) = G_PTR_ADD %8, %29(s64)
+ %31:_(s32) = G_LOAD %30(p1) :: (load (s32), addrspace 1)
+ %32:_(s32) = G_CONSTANT i32 0
+ %33:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %31(s32), %32
+ %34:sreg_32_xm0_xexec(s32) = SI_IF %33(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.8(0x04000000), %bb.1(0x7c000000)
+
+ %17:_(s32) = G_PHI %35(s32), %bb.5, %13(s32), %bb.1
+ %36:_(s1) = G_PHI %37(s1), %bb.5, %18(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32)
+ %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32)
+ SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.8
+
+ bb.4:
+ successors: %bb.6(0x40000000), %bb.7(0x40000000)
+
+ %38:_(s1) = G_CONSTANT i1 true
+ %39:_(s32) = G_CONSTANT i32 2
+ %40:_(s64) = G_SHL %19, %39(s32)
+ %41:_(p1) = G_PTR_ADD %11, %40(s64)
+ %42:_(s32) = G_LOAD %41(p1) :: (load (s32), addrspace 1)
+ %43:_(s32) = G_CONSTANT i32 0
+ %44:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %42(s32), %43
+ %45:sreg_32_xm0_xexec(s32) = SI_IF %44(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.5:
+ successors: %bb.3(0x80000000)
+
+ %35:_(s32) = G_PHI %46(s32), %bb.7, %13(s32), %bb.2
+ %37:_(s1) = G_PHI %47(s1), %bb.7, %27(s1), %bb.2
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+ G_BR %bb.3
+
+ bb.6:
+ successors: %bb.7(0x80000000)
+
+ %48:_(s32) = G_CONSTANT i32 2
+ %49:_(s64) = G_SHL %19, %48(s32)
+ %50:_(p1) = G_PTR_ADD %2, %49(s64)
+ %51:_(s32) = G_LOAD %50(p1) :: (load (s32), addrspace 1)
+ %52:_(s32) = G_CONSTANT i32 1
+ %53:_(s32) = G_ADD %51, %52
+ G_STORE %53(s32), %50(p1) :: (store (s32), addrspace 1)
+ %54:_(s32) = G_ADD %16, %52
+ %55:_(s32) = G_CONSTANT i32 100
+ %56:_(s1) = G_ICMP intpred(ult), %16(s32), %55
+
+ bb.7:
+ successors: %bb.5(0x80000000)
+
+ %46:_(s32) = G_PHI %54(s32), %bb.6, %13(s32), %bb.4
+ %47:_(s1) = G_PHI %56(s1), %bb.6, %38(s1), %bb.4
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32)
+ G_BR %bb.5
+
+ bb.8:
+ %57:_(s32) = G_PHI %15(s32), %bb.3
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32)
+ S_ENDPGM 0
+...
+
+---
+name: loop_with_div_break_with_body
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_div_break_with_body
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+ ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+ ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(s32) = COPY $vgpr4
+ %7:_(s32) = COPY $vgpr5
+ %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+ %9:_(s32) = G_CONSTANT i32 0
+ %10:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+ %11:_(s32) = G_PHI %12(s32), %bb.5, %9(s32), %bb.0
+ %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.5
+ %15:_(s1) = G_CONSTANT i1 true
+ %16:_(s64) = G_SEXT %13(s32)
+ %17:_(s32) = G_CONSTANT i32 2
+ %18:_(s64) = G_SHL %16, %17(s32)
+ %19:_(p1) = G_PTR_ADD %5, %18(s64)
+ %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+ %21:_(s32) = G_CONSTANT i32 0
+ %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+ %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+
+ %24:_(s32) = G_CONSTANT i32 10
+ G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1)
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.5(0x80000000)
+
+ %25:_(s1) = G_CONSTANT i1 false
+ %26:_(s32) = G_CONSTANT i32 2
+ %27:_(s64) = G_SHL %16, %26(s32)
+ %28:_(p1) = G_PTR_ADD %2, %27(s64)
+ %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+ %30:_(s32) = G_CONSTANT i32 1
+ %31:_(s32) = G_ADD %29, %30
+ G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+ %32:_(s32) = G_ADD %13, %30
+ %33:_(s32) = G_CONSTANT i32 100
+ %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33
+ G_BR %bb.5
+
+ bb.4:
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+ S_ENDPGM 0
+
+ bb.5:
+ successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+ %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
+ %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
+ %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+ %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+ SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.6:
+ successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+ %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
+ %39:_(s32) = G_PHI %12(s32), %bb.5
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+ %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+...
+
+---
+name: irreducible_cfg
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: irreducible_cfg
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.7(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]]
+ ; GFX10-NEXT: G_BR %bb.7
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY4]](s32), [[COPY]]
+ ; GFX10-NEXT: G_BR %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI1]](s1), %17(s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.3(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.5(0x04000000), %bb.7(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
+ ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[COPY3]], [[COPY2]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
+ ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+ ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.7:
+ ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.0, [[PHI]](s1), %bb.2, [[C2]](s1), %bb.4
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI8]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.1
+ bb.0:
+ successors: %bb.7(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(s32) = COPY $vgpr3
+ %4:_(s32) = COPY $vgpr4
+ %5:_(s32) = COPY $vgpr5
+ %6:_(s32) = G_CONSTANT i32 0
+ %7:_(s1) = G_IMPLICIT_DEF
+ %8:_(s1) = G_ICMP intpred(sgt), %4(s32), %1
+ G_BR %bb.7
+
+ bb.1:
+ successors: %bb.3(0x80000000)
+
+ %9:_(s32) = G_CONSTANT i32 0
+ %10:_(s1) = G_ICMP intpred(sle), %4(s32), %0
+ G_BR %bb.3
+
+ bb.2:
+ successors: %bb.4(0x40000000), %bb.7(0x40000000)
+
+ %11:_(s1) = G_PHI %12(s1), %bb.6, %7(s1), %bb.7
+ %13:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+ %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32)
+ SI_LOOP %16(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.6(0x04000000), %bb.3(0x7c000000)
+
+ %18:_(s32) = G_PHI %9(s32), %bb.1, %19(s32), %bb.3
+ %19:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %10(s1), %18(s32)
+ SI_LOOP %19(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.4:
+ successors: %bb.5(0x04000000), %bb.7(0x7c000000)
+
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
+ %20:_(s1) = G_ICMP intpred(sgt), %5(s32), %0
+ %21:_(s1) = G_CONSTANT i1 true
+ %22:_(s1) = G_XOR %8, %21
+ %23:_(s1) = G_OR %20, %22
+ %24:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %23(s1), %25(s32)
+ SI_LOOP %24(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.5
+
+ bb.5:
+ %26:_(s1) = G_PHI %20(s1), %bb.4
+ %27:_(s32) = G_PHI %24(s32), %bb.4
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+ %28:_(s32) = G_SELECT %26(s1), %3, %2
+ %29:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %28(s32)
+ $sgpr0 = COPY %29(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+
+ bb.6:
+ successors: %bb.2(0x80000000)
+
+ %30:_(s32) = G_PHI %19(s32), %bb.3
+ %12:_(s1) = G_CONSTANT i1 false
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+ G_BR %bb.2
+
+ bb.7:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %25:_(s32) = G_PHI %24(s32), %bb.4, %25(s32), %bb.2, %6(s32), %bb.0
+ %17:_(s32) = G_PHI %6(s32), %bb.4, %16(s32), %bb.2, %6(s32), %bb.0
+ %31:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %11(s1), %bb.2, %21(s1), %bb.4
+ %14:_(s1) = G_CONSTANT i1 true
+ %15:sreg_32_xm0_xexec(s32) = SI_IF %31(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
new file mode 100644
index 0000000..54881f5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i1_phi:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: .LBB0_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v3
+; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
+; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB0_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[1:2], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+ %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+ %neg_bool_counter = xor i1 %bool_counter, true
+ %fcounter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %fcounter, %val
+ %counterPlus1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %select = select i1 %bool_counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i1_non_phi:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: .LBB1_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB1_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[1:2], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+ %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+ %neg_bool_counter = xor i1 %bool_counter, true
+ %fcounter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %fcounter, %val
+ %counterPlus1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+ store float %select, ptr %addr
+ ret void
+}
+
+; This is temporal divergent uniform i1 structurize-cfg phi.
+; Loop has uniform condition if with body with break at the end.
+define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr addrspace(1) inreg %a, ptr addrspace(1) inreg %a.break) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s1
+; GFX10-NEXT: v_mov_b32_e32 v3, s0
+; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: s_branch .LBB2_3
+; GFX10-NEXT: .LBB2_1: ; %loop.body
+; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v5
+; GFX10-NEXT: v_cmp_lt_u32_e64 s0, v5, v2
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: global_load_dword v8, v[6:7], off
+; GFX10-NEXT: v_mov_b32_e32 v5, v9
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
+; GFX10-NEXT: global_store_dword v[6:7], v8, off
+; GFX10-NEXT: .LBB2_2: ; %Flow
+; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
+; GFX10-NEXT: s_or_b32 s4, s0, s4
+; GFX10-NEXT: s_and_b32 s0, 1, s1
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execz .LBB2_5
+; GFX10-NEXT: .LBB2_3: ; %A
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v3, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
+; GFX10-NEXT: global_load_dword v8, v[8:9], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT: s_mov_b32 s0, -1
+; GFX10-NEXT: s_mov_b32 s1, 1
+; GFX10-NEXT: ; implicit-def: $vgpr5
+; GFX10-NEXT: s_branch .LBB2_2
+; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s1, s0
+; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT: s_cbranch_execz .LBB2_7
+; GFX10-NEXT: ; %bb.6: ; %break.body
+; GFX10-NEXT: v_mov_b32_e32 v0, 10
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: .LBB2_7: ; %exit
+; GFX10-NEXT: s_endpgm
+entry:
+ br label %A
+
+A:
+ %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+ %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
+ %a.val = load i32, ptr addrspace(1) %a.plus.counter
+ %a.cond = icmp eq i32 %a.val, 0
+ br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+ store i32 10, ptr addrspace(1) %a.break
+ br label %exit
+
+loop.body:
+ %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
+ %x.val = load i32, ptr addrspace(1) %x.plus.counter
+ %x.val.plus.1 = add i32 %x.val, 1
+ store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
+ %counter.plus.1 = add i32 %counter, 1
+ %x.cond = icmp ult i32 %counter, %x.size
+ br i1 %x.cond, label %exit, label %A
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
new file mode 100644
index 0000000..9c2d083
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -0,0 +1,324 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: temporal_divergent_i1_phi
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: temporal_divergent_i1_phi
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s1) = G_CONSTANT i1 true
+ %5:_(s32) = G_CONSTANT i32 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+ %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+ %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+ %12:_(s1) = G_CONSTANT i1 true
+ %11:_(s1) = G_XOR %10, %12
+ %13:_(s32) = G_UITOFP %8(s32)
+ %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+ %15:_(s32) = G_CONSTANT i32 1
+ %9:_(s32) = G_ADD %8, %15
+ %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+ SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %16:_(s1) = G_PHI %10(s1), %bb.1
+ %17:_(s32) = G_PHI %7(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+ %18:_(s32) = G_FCONSTANT float 0.000000e+00
+ %19:_(s32) = G_FCONSTANT float 1.000000e+00
+ %20:_(s32) = G_SELECT %16(s1), %19, %18
+ G_STORE %20(s32), %3(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: temporal_divergent_i1_non_phi
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: temporal_divergent_i1_non_phi
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s1) = G_CONSTANT i1 true
+ %5:_(s32) = G_CONSTANT i32 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+ %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+ %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+ %12:_(s1) = G_CONSTANT i1 true
+ %11:_(s1) = G_XOR %10, %12
+ %13:_(s32) = G_UITOFP %8(s32)
+ %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+ %15:_(s32) = G_CONSTANT i32 1
+ %9:_(s32) = G_ADD %8, %15
+ %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+ SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %16:_(s1) = G_PHI %11(s1), %bb.1
+ %17:_(s32) = G_PHI %7(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+ %18:_(s32) = G_FCONSTANT float 0.000000e+00
+ %19:_(s32) = G_FCONSTANT float 1.000000e+00
+ %20:_(s32) = G_SELECT %16(s1), %19, %18
+ G_STORE %20(s32), %3(p0) :: (store (s32))
+ SI_RETURN
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: loop_with_1break
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.3
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+ ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_BR %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+ ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+ ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY2]]
+ ; GFX10-NEXT: G_BR %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $sgpr0
+ %5:_(s32) = COPY $sgpr1
+ %6:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %7:_(s32) = COPY $sgpr2
+ %8:_(s32) = COPY $sgpr3
+ %9:_(p1) = G_MERGE_VALUES %7(s32), %8(s32)
+ %10:_(s32) = G_CONSTANT i32 0
+ %11:_(s32) = G_IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.3(0x50000000), %bb.5(0x30000000)
+
+ %12:_(s32) = G_PHI %13(s32), %bb.5, %10(s32), %bb.0
+ %14:_(s32) = G_PHI %10(s32), %bb.0, %15(s32), %bb.5
+ %16:_(s1) = G_CONSTANT i1 true
+ %17:_(s64) = G_SEXT %14(s32)
+ %18:_(s32) = G_CONSTANT i32 2
+ %19:_(s64) = G_SHL %17, %18(s32)
+ %20:_(p1) = G_PTR_ADD %6, %19(s64)
+ %21:_(s32) = G_LOAD %20(p1) :: (load (s32), addrspace 1)
+ %22:_(s32) = G_CONSTANT i32 0
+ %23:_(s1) = G_ICMP intpred(ne), %21(s32), %22
+ G_BRCOND %23(s1), %bb.3
+ G_BR %bb.5
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+
+ %24:_(s32) = G_CONSTANT i32 10
+ G_STORE %24(s32), %9(p1) :: (store (s32), addrspace 1)
+ G_BR %bb.4
+
+ bb.3:
+ successors: %bb.5(0x80000000)
+
+ %25:_(s1) = G_CONSTANT i1 false
+ %26:_(s32) = G_CONSTANT i32 2
+ %27:_(s64) = G_SHL %17, %26(s32)
+ %28:_(p1) = G_PTR_ADD %2, %27(s64)
+ %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+ %30:_(s32) = G_CONSTANT i32 1
+ %31:_(s32) = G_ADD %29, %30
+ G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+ %32:_(s32) = G_ADD %14, %30
+ %33:_(s1) = G_ICMP intpred(ult), %14(s32), %3
+ G_BR %bb.5
+
+ bb.4:
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+ S_ENDPGM 0
+
+ bb.5:
+ successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+ %15:_(s32) = G_PHI %32(s32), %bb.3, %11(s32), %bb.1
+ %35:_(s1) = G_PHI %25(s1), %bb.3, %16(s1), %bb.1
+ %36:_(s1) = G_PHI %33(s1), %bb.3, %16(s1), %bb.1
+ %13:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %12(s32)
+ SI_LOOP %13(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.6
+
+ bb.6:
+ successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+ %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5
+ %38:_(s32) = G_PHI %13(s32), %bb.5
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+ %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
new file mode 100644
index 0000000..d065f22
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+define void @temporal_divergent_i32(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i32:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, -1
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB0_1: ; %loop
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB0_1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: flat_store_dword v[1:2], v3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+ %f.counter = uitofp i32 %counter to float
+ %cond = fcmp ogt float %f.counter, %val
+ %counter.plus.1 = add i32 %counter, 1
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ store i32 %counter, ptr %addr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
new file mode 100644
index 0000000..4cc6802
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -0,0 +1,70 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: temporal_divergent_i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ ; GFX10-LABEL: name: temporal_divergent_i32
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[ADD]](s32)
+ ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: G_BR %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
+ ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[MV]](p0) :: (store (s32))
+ ; GFX10-NEXT: SI_RETURN
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(s32) = G_CONSTANT i32 0
+ %5:_(s32) = G_CONSTANT i32 -1
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %6:_(s32) = G_PHI %7(s32), %bb.1, %4(s32), %bb.0
+ %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+ %10:_(s32) = G_CONSTANT i32 1
+ %9:_(s32) = G_ADD %8, %10
+ %11:_(s32) = G_UITOFP %9(s32)
+ %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0
+ %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32)
+ SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+ G_BR %bb.2
+
+ bb.2:
+ %13:_(s32) = G_PHI %9(s32), %bb.1
+ %14:_(s32) = G_PHI %7(s32), %bb.1
+ G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+ G_STORE %13(s32), %3(p0) :: (store (s32))
+ SI_RETURN
+...