diff options
| author | Aaditya <Aaditya.AlokDeshpande@amd.com> | 2026-04-28 11:29:50 +0530 |
|---|---|---|
| committer | Aaditya <Aaditya.AlokDeshpande@amd.com> | 2026-04-29 13:55:25 +0530 |
| commit | 8508e574f2174b56dae929ca55f822bf643e21c2 (patch) | |
| tree | c48b2a0573de7c7795dbe4552f46bb36513e5f8a | |
| parent | 922d95aefd7a09661e546a1790285b20fe70719a (diff) | |
| download | llvm-users/easyonaadit/amdgpu/i16-wave-reduce-min-max.tar.gz llvm-users/easyonaadit/amdgpu/i16-wave-reduce-min-max.tar.bz2 llvm-users/easyonaadit/amdgpu/i16-wave-reduce-min-max.zip | |
[AMDGPU] Support Wave Reduction for i16 types - 1 users/easyonaadit/amdgpu/i16-wave-reduce-min-max
Supported Ops: `min`, `umin`, `max`, `umax`.
Supports only the iterative strategy; DPP is yet
to be supported.
Supports only Fake-16 versions of the lowering.
True-16 support is yet to be added.
Assisted by - Claude-sonnet:4.6
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 101 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 5 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 630 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 630 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 616 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 629 |
9 files changed, 2069 insertions, 549 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 0782818b1c9d..c54855ea3936 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -1458,6 +1458,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr16: case Vgpr16: case UniInVgprS16: + case Sgpr16ToVgprDst: return LLT::scalar(16); case Sgpr32: case Sgpr32_WF: @@ -1710,6 +1711,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Vgpr32AExt: case Vgpr32SExt: case Vgpr32ZExt: + case Sgpr16ToVgprDst: case Sgpr32ToVgprDst: case Sgpr64ToVgprDst: return VgprRB; @@ -1870,6 +1872,7 @@ bool RegBankLegalizeHelper::applyMappingDst( B.buildTrunc(Reg, NewDst); break; } + case Sgpr16ToVgprDst: case Sgpr32ToVgprDst: case Sgpr64ToVgprDst: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 77802f5f4349..b83b39502c4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -820,6 +820,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}) + .Any({{DivS16, DivS32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}}) .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}}) .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) @@ -1697,6 +1698,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax, amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {IntrId, Sgpr32ZExt}}) + .Div(S16, 
{{Sgpr16ToVgprDst}, {IntrId, Vgpr16}}) .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}) .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}}) .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 35bbefb54d29..64774618d247 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -234,6 +234,7 @@ enum RegBankLLTMappingApplyID { // Dst only modifiers: dst was assigned VGPR by RegBankSelect but the // instruction result must be in SGPR. Replace dst with SGPR, then copy the // result back to the original VGPR. + Sgpr16ToVgprDst, Sgpr32ToVgprDst, Sgpr64ToVgprDst, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f08e12a7fbf3..f7528846c7c1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5627,6 +5627,22 @@ static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB) { static uint64_t getIdentityValueForWaveReduction(unsigned Opc) { switch (Opc) { + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_opsel_e64: + case AMDGPU::V_MIN_U16_fake16_e64: + return 0xffff; + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_opsel_e64: + case AMDGPU::V_MIN_I16_fake16_e64: + return 0x7fff; + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_opsel_e64: + case AMDGPU::V_MAX_U16_fake16_e64: + return 0x0; + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_opsel_e64: + case AMDGPU::V_MAX_I16_fake16_e64: + return 0x8000; case AMDGPU::S_MIN_U32: return std::numeric_limits<uint32_t>::max(); case AMDGPU::S_MIN_I32: @@ -5677,6 +5693,17 @@ static uint64_t getIdentityValueForWaveReduction(unsigned Opc) { } } +static bool is16bitWaveReduceOperation(unsigned Opc) { + return Opc == AMDGPU::V_MIN_U16_opsel_e64 || + Opc == AMDGPU::V_MIN_U16_fake16_e64 || Opc == AMDGPU::V_MIN_U16_e64 || + Opc == 
AMDGPU::V_MIN_I16_opsel_e64 || + Opc == AMDGPU::V_MIN_I16_fake16_e64 || Opc == AMDGPU::V_MIN_I16_e64 || + Opc == AMDGPU::V_MAX_U16_opsel_e64 || + Opc == AMDGPU::V_MAX_U16_fake16_e64 || Opc == AMDGPU::V_MAX_U16_e64 || + Opc == AMDGPU::V_MAX_I16_opsel_e64 || + Opc == AMDGPU::V_MAX_I16_fake16_e64 || Opc == AMDGPU::V_MAX_I16_e64; +} + static bool is32bitWaveReduceOperation(unsigned Opc) { return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 || Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 || @@ -5818,6 +5845,18 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, }; if (isSGPR) { switch (Opc) { + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_opsel_e64: + case AMDGPU::V_MIN_U16_fake16_e64: + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_opsel_e64: + case AMDGPU::V_MIN_I16_fake16_e64: + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_opsel_e64: + case AMDGPU::V_MAX_U16_fake16_e64: + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_opsel_e64: + case AMDGPU::V_MAX_I16_fake16_e64: case AMDGPU::S_MIN_U32: case AMDGPU::S_MIN_I32: case AMDGPU::V_MIN_F32_e64: @@ -6047,6 +6086,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); bool is32BitOpc = is32bitWaveReduceOperation(Opc); + bool is16BitOpc = is16bitWaveReduceOperation(Opc); bool isFPOp = isFloatingPointWaveReduceOperation(Opc); bool NeedsMovDPP = !is32BitOpc; // Create virtual registers required for lowering. @@ -6086,8 +6126,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, ? 0x0 // +0.0 for double sub reduction : getIdentityValueForWaveReduction(Opc); BuildMI(BB, I, DL, - TII->get(is32BitOpc ? AMDGPU::S_MOV_B32 - : AMDGPU::S_MOV_B64_IMM_PSEUDO), + TII->get(is32BitOpc || is16BitOpc ? 
AMDGPU::S_MOV_B32 + : AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg) .addImm(IdentityValue); // clang-format off @@ -6113,12 +6153,35 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) .addReg(ActiveBitsReg); - if (is32BitOpc) { + if (is32BitOpc || is16BitOpc) { BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) .addReg(SrcReg) .addReg(FF1Reg); - if (isFPOp) { + if (is16BitOpc) { + Register LaneValVgpr = MRI.createVirtualRegister(SrcRegClass); + Register VgprResultReg = MRI.createVirtualRegister(SrcRegClass); + bool isGFX10 = ST.getGeneration() == AMDGPUSubtarget::GFX10; + // Get the Lane Value in VGPR to avoid the Constant Bus Restriction + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), LaneValVgpr) + .addReg(LaneValueReg); + auto OpInstr = + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), VgprResultReg); + if (isGFX10) + OpInstr.addImm(SISrcMods::NONE); // src0 modifier + OpInstr.addReg(AccumulatorReg); // src0 + if (isGFX10) + OpInstr.addImm(SISrcMods::NONE); // src1 modifier + OpInstr.addReg(LaneValVgpr); // src1 + if (isGFX10) { + OpInstr.addImm(0); // omod + OpInstr.addImm(0); // opsel + } + NewAccumulator = + BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(VgprResultReg); + } else if (isFPOp) { Register LaneValVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); @@ -6139,7 +6202,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addReg(DstVreg); } else { NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) + .addReg(AccumulatorReg) .addReg(LaneValueReg); } } else { @@ -6672,6 +6735,34 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); switch (MI.getOpcode()) { + 
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U16: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() == AMDGPUSubtarget::GFX10 + ? AMDGPU::V_MIN_U16_opsel_e64 + : ST.hasTrue16BitInsts() + ? AMDGPU::V_MIN_U16_fake16_e64 + : AMDGPU::V_MIN_U16_e64); + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I16: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() == AMDGPUSubtarget::GFX10 + ? AMDGPU::V_MIN_I16_opsel_e64 + : ST.hasTrue16BitInsts() + ? AMDGPU::V_MIN_I16_fake16_e64 + : AMDGPU::V_MIN_I16_e64); + case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U16: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() == AMDGPUSubtarget::GFX10 + ? AMDGPU::V_MAX_U16_opsel_e64 + : ST.hasTrue16BitInsts() + ? AMDGPU::V_MAX_U16_fake16_e64 + : AMDGPU::V_MAX_U16_e64); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I16: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() == AMDGPUSubtarget::GFX10 + ? AMDGPU::V_MAX_I16_opsel_e64 + : ST.hasTrue16BitInsts() + ? AMDGPU::V_MAX_I16_fake16_e64 + : AMDGPU::V_MAX_I16_e64); case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 3f89ebc55fc8..42a99233132c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -403,6 +403,11 @@ defvar Operations = [ WaveReduceOp<"fadd", "F64", f64, SGPR_64, VSrc_b64>, WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>, + + WaveReduceOp<"umin", "U16", i16, SGPR_32, VSrc_b16>, + WaveReduceOp<"min", "I16", i16, SGPR_32, VSrc_b16>, + WaveReduceOp<"umax", "U16", i16, SGPR_32, VSrc_b16>, + WaveReduceOp<"max", "I16", i16, SGPR_32, VSrc_b16> ]; foreach Op = Operations in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll 
index fda3c218a26f..a9621efdb15f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -7,10 +7,368 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: uniform_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_i16: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_i16: +; GFX10GISEL: 
; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; 
GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.max.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: divergent_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_max_i16_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_max_i16_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_max_i16_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_max_i16_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: 
divergent_value_i16: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_max_i16 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_i16: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x8000 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_max_i16 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_i16: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x8000 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, 
v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_max_i16 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_i16: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x8000 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_max_i16 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x8000 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_max_i16 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: 
s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x8000 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_max_i16 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x8000 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_max_i16 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x8000 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_max_i16 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.max.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: @@ -130,13 +488,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; 
GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -150,13 +508,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s4, 1 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -171,13 +529,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -189,13 +547,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s4, 1 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 
s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -209,13 +567,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -227,13 +585,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s4, 1 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -247,13 +605,13 @@ define 
amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s2, 1 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_max_i32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -265,13 +623,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s2, 1 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_max_i32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -286,14 +644,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s4, 1 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -306,14 +664,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s4, 1 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -327,14 +685,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s2, 1 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; 
GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -347,14 +705,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s2, 1 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,20 +2502,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; 
GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2172,30 +2530,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB6_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s6, 1 -; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB6_6: ; %endif +; GFX8GISEL-NEXT: .LBB8_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2218,20 +2576,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; 
GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2245,30 +2603,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB6_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s6, 1 -; 
GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB6_6: ; %endif +; GFX9GISEL-NEXT: .LBB8_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2290,20 +2648,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2317,30 +2675,30 @@ define amdgpu_kernel void 
@divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064GISEL-NEXT: .LBB6_6: ; %endif +; GFX1064GISEL-NEXT: .LBB8_6: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2362,20 +2720,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032DAGISEL-NEXT: 
s_cbranch_execz .LBB8_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2389,30 +2747,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s1, 1 -; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB8_4: ; =>This 
Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB6_6: ; %endif +; GFX1032GISEL-NEXT: .LBB8_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2436,21 +2794,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2466,31 +2824,31 @@ define amdgpu_kernel void @divergent_cfg(ptr 
addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s6, 1 -; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164GISEL-NEXT: .LBB6_6: ; %endif +; GFX1164GISEL-NEXT: .LBB8_6: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2514,21 +2872,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz 
.LBB6_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2544,31 +2902,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 -; GFX1132GISEL-NEXT: 
.LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB6_6: ; %endif +; GFX1132GISEL-NEXT: .LBB8_6: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2707,7 +3065,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2718,7 +3076,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2732,7 +3090,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: s_brev_b32 s5, 1 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: 
Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2743,7 +3101,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2757,7 +3115,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX9DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2768,7 +3126,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2782,7 +3140,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: s_brev_b32 s5, 1 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2793,7 +3151,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { 
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2807,7 +3165,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2818,7 +3176,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2831,7 +3189,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: s_brev_b32 s5, 1 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2842,7 +3200,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2855,7 +3213,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2866,7 +3224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2879,7 +3237,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2890,7 +3248,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 
v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2903,7 +3261,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s0, 0 ; GFX1164DAGISEL-NEXT: s_brev_b32 s1, 1 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2915,7 +3273,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2928,7 +3286,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1164GISEL-NEXT: s_brev_b32 s1, 1 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2940,7 +3298,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2953,7 +3311,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2964,7 +3322,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -2976,7 +3334,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2987,7 +3345,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132GISEL-NEXT: ; 
%bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -3028,24 +3386,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB9_2: ; %Flow +; GFX8GISEL-NEXT: .LBB11_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB9_4: ; %endif +; GFX8GISEL-NEXT: .LBB11_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3080,24 +3438,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB9_2: ; %Flow +; GFX9GISEL-NEXT: .LBB11_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: 
s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB9_4: ; %endif +; GFX9GISEL-NEXT: .LBB11_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3132,24 +3490,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064GISEL-NEXT: .LBB9_4: ; %endif +; GFX1064GISEL-NEXT: .LBB11_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3184,24 +3542,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB9_4: ; %endif +; GFX1032GISEL-NEXT: .LBB11_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3240,17 +3598,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow +; 
GFX1164GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3258,7 +3616,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164GISEL-NEXT: .LBB9_4: ; %endif +; GFX1164GISEL-NEXT: .LBB11_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -3295,23 +3653,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; 
GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB9_4: ; %endif +; GFX1132GISEL-NEXT: .LBB11_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index c01c06972515..17ed1c15348f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -7,10 +7,368 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 < %s | FileCheck 
-check-prefixes=GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: uniform_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: 
uniform_value_i16: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_i16: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; 
GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.min.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: divergent_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_min_i16_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; 
GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_min_i16_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_min_i16_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: 
v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_min_i16_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_i16: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_min_i16 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_i16: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_movk_i32 s6, 0x7fff +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_min_i16 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: 
global_store_short v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_i16: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_movk_i32 s5, 0x7fff +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_min_i16 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_i16: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_movk_i32 s5, 0x7fff +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_min_i16 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_movk_i32 s2, 0x7fff +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_min_i16 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_movk_i32 s2, 0x7fff +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_min_i16 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_movk_i32 s1, 0x7fff +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; 
GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_min_i16 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_movk_i32 s1, 0x7fff +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_min_i16 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.min.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: @@ -130,13 +488,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; 
GFX8DAGISEL-NEXT: s_brev_b32 s4, -2 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -150,13 +508,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s4, -2 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -171,13 +529,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s4, -2 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; 
GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -189,13 +547,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s4, -2 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -209,13 +567,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s4, -2 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -227,13 +585,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s4, -2 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This 
Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -247,13 +605,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s2, -2 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_min_i32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -265,13 +623,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s2, -2 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_min_i32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -286,14 +644,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s4, -2 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -306,14 +664,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s4, -2 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -327,14 +685,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, 
exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s2, -2 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -347,14 +705,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s2, -2 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,20 +2502,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: 
s_brev_b32 s6, -2 -; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2172,30 +2530,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB6_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s6, -2 -; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX8GISEL-NEXT: 
s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB6_6: ; %endif +; GFX8GISEL-NEXT: .LBB8_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2218,20 +2576,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2245,30 +2603,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: 
$vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB6_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s6, -2 -; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB6_6: ; %endif +; GFX9GISEL-NEXT: .LBB8_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2290,20 +2648,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: 
s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2317,30 +2675,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s6, -2 -; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064GISEL-NEXT: .LBB6_6: ; %endif +; GFX1064GISEL-NEXT: 
.LBB8_6: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2362,20 +2720,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2 -; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2389,30 +2747,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow +; 
GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s1, -2 -; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB6_6: ; %endif +; GFX1032GISEL-NEXT: .LBB8_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2436,21 +2794,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: 
s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2466,31 +2824,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s6, -2 -; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s6 -; 
GFX1164GISEL-NEXT: .LBB6_6: ; %endif +; GFX1164GISEL-NEXT: .LBB8_6: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2514,21 +2872,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 -; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2544,31 +2902,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s1, -2 -; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB6_6: ; %endif +; GFX1132GISEL-NEXT: .LBB8_6: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2707,7 +3065,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX8DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2718,7 +3076,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; 
GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2732,7 +3090,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: s_brev_b32 s5, -2 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2743,7 +3101,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2757,7 +3115,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX9DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2768,7 +3126,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2782,7 +3140,7 
@@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: s_brev_b32 s5, -2 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2793,7 +3151,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2807,7 +3165,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2818,7 +3176,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2831,7 +3189,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: s_brev_b32 s5, -2 ; GFX1064GISEL-NEXT: 
s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2842,7 +3200,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2855,7 +3213,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1032DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2866,7 +3224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2879,7 +3237,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1032GISEL-NEXT: s_brev_b32 s5, -2 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner 
Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2890,7 +3248,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2903,7 +3261,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s0, -1 ; GFX1164DAGISEL-NEXT: s_brev_b32 s1, -2 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2915,7 +3273,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2928,7 +3286,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1164GISEL-NEXT: s_brev_b32 s1, -2 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2940,7 +3298,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2953,7 +3311,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2964,7 +3322,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -2976,7 +3334,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132GISEL-NEXT: s_brev_b32 s1, -2 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2987,7 +3345,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -3028,24 +3386,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB9_2: ; %Flow +; GFX8GISEL-NEXT: .LBB11_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB9_4: ; %endif +; GFX8GISEL-NEXT: .LBB11_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3080,24 +3438,24 @@ define amdgpu_kernel void 
@divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB9_2: ; %Flow +; GFX9GISEL-NEXT: .LBB11_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB9_4: ; %endif +; GFX9GISEL-NEXT: .LBB11_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3132,24 +3490,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064GISEL-NEXT: .LBB9_4: ; %endif +; GFX1064GISEL-NEXT: .LBB11_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3184,24 +3542,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB9_4: ; %endif +; GFX1032GISEL-NEXT: .LBB11_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], 
s[0:1] @@ -3240,17 +3598,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3258,7 +3616,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164GISEL-NEXT: .LBB9_4: ; %endif +; GFX1164GISEL-NEXT: .LBB11_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -3295,23 +3653,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow +; 
GFX1132GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB9_4: ; %endif +; GFX1132GISEL-NEXT: .LBB11_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 81ee4574e582..29a0c5fb953b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -7,11 +7,355 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck 
-check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: uniform_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_i16: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_i16: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; 
GFX11GISEL-LABEL: uniform_value_i16: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_clause 0x1 +; GFX11GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.umax.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: divergent_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_max_u16_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX8GISEL-LABEL: divergent_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_max_u16_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_max_u16_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; 
GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_max_u16_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_i16: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_max_u16 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_i16: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_max_u16 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_i16: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_max_u16 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_i16: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_max_u16 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner 
Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_max_u16 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_max_u16 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; 
GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_max_u16 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_max_u16 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.umax.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: @@ -131,13 +475,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; 
GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -151,13 +495,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -172,13 +516,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; 
GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,13 +534,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -210,13 +554,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -228,13 +572,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -248,13 +592,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -266,13 +610,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; 
GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -287,14 +631,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -307,14 +651,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -328,14 +672,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -348,14 +692,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2097,20 +2441,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; 
%if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2125,30 +2469,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB6_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 
; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB6_6: ; %endif +; GFX8GISEL-NEXT: .LBB8_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2171,20 +2515,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2198,30 +2542,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword 
s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB6_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB6_6: ; %endif +; GFX9GISEL-NEXT: .LBB8_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2243,20 +2587,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: 
s_max_u32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2270,30 +2614,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6 -; 
GFX1064GISEL-NEXT: .LBB6_6: ; %endif +; GFX1064GISEL-NEXT: .LBB8_6: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2315,20 +2659,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2342,30 +2686,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 
-; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB6_6: ; %endif +; GFX1032GISEL-NEXT: .LBB8_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2389,21 +2733,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_u32 s6, 
s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2419,31 +2763,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: 
v_mov_b32_e32 v1, s6 -; GFX1164GISEL-NEXT: .LBB6_6: ; %endif +; GFX1164GISEL-NEXT: .LBB8_6: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2467,21 +2811,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2497,31 +2841,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; 
GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB6_6: ; %endif +; GFX1132GISEL-NEXT: .LBB8_6: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2799,7 +3143,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2810,7 +3154,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; 
GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2823,7 +3167,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2834,7 +3178,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2847,7 +3191,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2858,7 +3202,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; 
GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2871,7 +3215,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2882,7 +3226,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2895,7 +3239,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2906,7 +3250,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2918,7 +3262,7 @@ define void 
@divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2929,7 +3273,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2941,7 +3285,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2952,7 +3296,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2964,7 +3308,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2975,7 +3319,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2987,7 +3331,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2999,7 +3343,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3011,7 +3355,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -3023,7 +3367,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3035,7 +3379,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -3046,7 +3390,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -3057,7 +3401,7 @@ define void 
@divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -3068,7 +3412,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -3109,24 +3453,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB11_2: ; %Flow +; GFX8GISEL-NEXT: .LBB13_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: 
s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB11_4: ; %endif +; GFX8GISEL-NEXT: .LBB13_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3161,24 +3505,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB11_2: ; %Flow +; GFX9GISEL-NEXT: .LBB13_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB11_4: ; %endif +; GFX9GISEL-NEXT: .LBB13_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3213,24 +3557,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; 
GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB13_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064GISEL-NEXT: .LBB11_4: ; %endif +; GFX1064GISEL-NEXT: .LBB13_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3265,24 +3609,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB13_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB11_4: ; %endif +; GFX1032GISEL-NEXT: .LBB13_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3321,17 +3665,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB13_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3339,7 +3683,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164GISEL-NEXT: .LBB11_4: ; %endif +; GFX1164GISEL-NEXT: .LBB13_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -3376,23 +3720,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB13_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB13_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB11_4: ; %endif +; GFX1132GISEL-NEXT: .LBB13_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 0a57975a1fa2..6df3ef92c246 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -7,11 +7,368 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s +define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: uniform_value_i16: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: 
flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_i16: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_i16: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.umin.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) { +; GFX8DAGISEL-LABEL: divergent_value_i16: +; 
GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_min_u16_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_i16: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_min_u16_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_short v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_i16: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: 
v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_min_u16_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_i16: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_min_u16_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_i16: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_min_u16 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; 
GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_i16: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0xffff +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_min_u16 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_i16: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0xffff +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_min_u16 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_i16: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; 
GFX1032GISEL-NEXT: s_mov_b32 s5, 0xffff +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_min_u16 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_i16: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0xffff +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_min_u16 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_i16: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0xffff +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_min_u16 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_i16: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0xffff +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_min_u16 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_i16: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0xffff +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; 
GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_min_u16 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call i16 @llvm.amdgcn.wave.reduce.umin.i16(i16 %in, i32 1) + store i16 %result, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: @@ -131,13 +488,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -151,13 +508,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_min_u32 s4, s4, s6 ; 
GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -172,13 +529,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,13 +547,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -210,13 +567,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -228,13 +585,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -248,13 +605,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: 
s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -266,13 +623,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -287,14 +644,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -307,14 +664,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; 
GFX1164GISEL-NEXT: s_mov_b32 s4, -1 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -328,14 +685,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -348,14 +705,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2097,20 +2454,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2125,30 +2482,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; 
GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB6_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 -; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB6_6: ; %endif +; GFX8GISEL-NEXT: .LBB8_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2171,20 +2528,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_min_u32 
s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2198,30 +2555,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB6_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB6_6: ; %endif +; GFX9GISEL-NEXT: .LBB8_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2243,20 +2600,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2270,30 +2627,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; 
GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064GISEL-NEXT: .LBB6_6: ; %endif +; GFX1064GISEL-NEXT: .LBB8_6: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2315,20 +2672,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; 
GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2342,30 +2699,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s1, -1 -; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB6_6: ; %endif +; GFX1032GISEL-NEXT: .LBB8_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ 
-2389,21 +2746,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2419,31 +2776,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s2, s2 -; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164GISEL-NEXT: .LBB6_6: ; %endif +; GFX1164GISEL-NEXT: .LBB8_6: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2467,21 +2824,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 
.LBB6_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2497,31 +2854,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s1, -1 -; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB6_6: ; %endif +; GFX1132GISEL-NEXT: .LBB8_6: ; %endif ; 
GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -2659,7 +3016,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2670,7 +3027,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2683,7 +3040,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2694,7 +3051,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2707,7 +3064,7 @@ 
define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2718,7 +3075,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2731,7 +3088,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2742,7 +3099,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2755,7 +3112,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 
s[4:5], -1 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2766,7 +3123,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2778,7 +3135,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2789,7 +3146,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2801,7 +3158,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; 
GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2812,7 +3169,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2824,7 +3181,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -2835,7 +3192,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -2847,7 +3204,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: 
.LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2859,7 +3216,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2871,7 +3228,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -2883,7 +3240,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2895,7 +3252,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop 
Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2906,7 +3263,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -2917,7 +3274,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -2928,7 +3285,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -2969,24 +3326,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB9_2: ; %Flow +; GFX8GISEL-NEXT: .LBB11_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB9_4: ; %endif +; GFX8GISEL-NEXT: .LBB11_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3021,24 +3378,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB9_2: ; %Flow +; GFX9GISEL-NEXT: .LBB11_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; 
GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB9_4: ; %endif +; GFX9GISEL-NEXT: .LBB11_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3073,24 +3430,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064GISEL-NEXT: .LBB9_4: ; %endif +; GFX1064GISEL-NEXT: .LBB11_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3125,24 +3482,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: 
s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB9_4: ; %endif +; GFX1032GISEL-NEXT: .LBB11_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3181,17 +3538,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: 
s_cbranch_execz .LBB9_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3199,7 +3556,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164GISEL-NEXT: .LBB9_4: ; %endif +; GFX1164GISEL-NEXT: .LBB11_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -3236,23 +3593,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB9_4: ; %endif +; GFX1132GISEL-NEXT: .LBB11_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
