| | | |
|---|---|---|
| author | Carl Ritson <carl.ritson@amd.com> | 2025-10-28 16:09:28 +0900 |
| committer | GitHub <noreply@github.com> | 2025-10-28 16:09:28 +0900 |
| commit | 385c12134aa6b3215d92ce6034d99fd7aec45dd7 | |
| tree | 20f4ef7ca29136ea9ad002fbb2e708f08243f719 | |
| parent | c1779f33bdada6e478e882cc23a647ef9abaad96 | |
[AMDGPU] Rework GFX11 VALU Mask Write Hazard (#138663)
Apply additional counter waits to cover VALU writes to SGPRs. Rework
expiry detection and apply wait coalescing to mitigate the overhead of
the additional waits.
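
The hazard shape this patch addresses can be sketched in a few instructions. The following is a hand-written illustration for wave64 GFX11 code, modelled on the patterns in the updated tests below; the specific registers and values are illustrative, not taken from the patch:

```
v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc  ; (1) VALU reads VCC as its mask source
v_cmp_eq_u64_e32    vcc, v[0:1], v[4:5]   ; (2) VALU writes the mask register
s_waitcnt_depctr    0xfffd                ; inserted wait: va_vcc(0)
s_or_b64            s[0:1], vcc, s[0:1]   ; (3) the following read is now safe
```

Previously the recognizer only handled an SALU write at step (2) and always emitted sa_sdst(0); the rework also covers VALU writes, choosing va_vcc(0) for writes to VCC and va_sdst(0) for writes to other SGPRs.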
12 files changed, 741 insertions(+), 124 deletions(-)
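
Most of the test churn below is new or retuned s_waitcnt_depctr immediates. Assuming the GFX11 DEPCTR field layout (sa_sdst in bit 0, va_vcc in bit 1, va_sdst in bits 11:9; an all-ones field means no wait), the values that appear in the updated checks decode as:

```
s_waitcnt_depctr 0xfffe   ; sa_sdst(0): wait on SALU writes to SGPRs
s_waitcnt_depctr 0xfffd   ; va_vcc(0):  wait on VALU writes to VCC
s_waitcnt_depctr 0xf1ff   ; va_sdst(0): wait on VALU writes to other SGPRs
s_waitcnt_depctr 0xfffc   ; sa_sdst(0) and va_vcc(0) coalesced into one wait
```

Coalescing takes the minimum of each field across adjacent waits in a region with no intervening SGPR reads, which is how a neighbouring pair such as 0xfffe and 0xfffd folds into a single 0xfffc.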
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a911e7e..52cc4ca 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     return false;
 
   assert(!ST.hasExtendedWaitCounts());
-  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+  if (!ST.isWave64())
+    return false;
+
+  const bool IsSALU = SIInstrInfo::isSALU(*MI);
+  const bool IsVALU = SIInstrInfo::isVALU(*MI);
+  if (!IsSALU && !IsVALU)
     return false;
 
   // The hazard sequence is three instructions:
   //   1. VALU reads SGPR as mask
-  //   2. SALU writes SGPR
-  //   3. SALU reads SGPR
-  // The hazard can expire if the distance between 2 and 3 is sufficient.
-  // In practice this happens <10% of the time, hence this always assumes
-  // the hazard exists if 1 and 2 are present to avoid searching.
+  //   2. VALU/SALU writes SGPR
+  //   3. VALU/SALU reads SGPR
+  // The hazard can expire if the distance between 2 and 3 is sufficient,
+  // or (2) is VALU and (3) is SALU.
+  // In practice this happens <10% of the time, hence always assume the hazard
+  // exists if (1) and (2) are present to avoid searching all SGPR reads.
 
-  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
-  if (!SDSTOp || !SDSTOp->isReg())
-    return false;
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IgnoreableSGPR = [](const Register Reg) {
+    switch (Reg) {
+    case AMDGPU::EXEC:
+    case AMDGPU::EXEC_LO:
+    case AMDGPU::EXEC_HI:
+    case AMDGPU::M0:
+    case AMDGPU::SGPR_NULL:
+    case AMDGPU::SGPR_NULL64:
+    case AMDGPU::SCC:
+      return true;
+    default:
+      return false;
+    }
+  };
+  auto IsVCC = [](const Register Reg) {
+    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+  };
+
+  struct StateType {
+    SmallSet<Register, 2> HazardSGPRs;
+
+    static unsigned getHashValue(const StateType &State) {
+      return hash_combine_range(State.HazardSGPRs);
+    }
+    static bool isEqual(const StateType &LHS, const StateType &RHS) {
+      return LHS.HazardSGPRs == RHS.HazardSGPRs;
+    }
+  };
+
+  SmallVector<const MachineInstr *> WaitInstrs;
+  bool HasSGPRRead = false;
+  StateType InitialState;
+
+  // Look for SGPR write.
+  MachineOperand *HazardDef = nullptr;
+  for (MachineOperand &Op : MI->operands()) {
+    if (!Op.isReg())
+      continue;
+    if (Op.isDef() && HazardDef)
+      continue;
+
+    Register Reg = Op.getReg();
+    if (IgnoreableSGPR(Reg))
+      continue;
+    if (!IsVCC(Reg)) {
+      if (Op.isImplicit())
+        continue;
+      if (!TRI->isSGPRReg(MRI, Reg))
+        continue;
+    }
+    // Also check for SGPR reads.
+    if (Op.isUse()) {
+      HasSGPRRead = true;
+      continue;
+    }
+
+    assert(!HazardDef);
+    HazardDef = &Op;
+  }
 
-  const Register HazardReg = SDSTOp->getReg();
-  if (HazardReg == AMDGPU::EXEC ||
-      HazardReg == AMDGPU::EXEC_LO ||
-      HazardReg == AMDGPU::EXEC_HI ||
-      HazardReg == AMDGPU::M0)
+  if (!HazardDef)
     return false;
 
-  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+  // Setup to track writes to individual SGPRs
+  const Register HazardReg = HazardDef->getReg();
+  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+    InitialState.HazardSGPRs.insert(HazardReg);
+  } else {
+    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+  }
+
+  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+    if (State.HazardSGPRs.empty())
+      return HazardExpired;
+
     switch (I.getOpcode()) {
     case AMDGPU::V_ADDC_U32_e32:
     case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     case AMDGPU::V_SUBB_U32_e32:
     case AMDGPU::V_SUBB_U32_dpp:
     case AMDGPU::V_SUBBREV_U32_e32:
-    case AMDGPU::V_SUBBREV_U32_dpp:
+    case AMDGPU::V_SUBBREV_U32_dpp: {
       // These implicitly read VCC as mask source.
-      return HazardReg == AMDGPU::VCC ||
-             HazardReg == AMDGPU::VCC_LO ||
-             HazardReg == AMDGPU::VCC_HI;
+      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+    }
     case AMDGPU::V_ADDC_U32_e64:
     case AMDGPU::V_ADDC_U32_e64_dpp:
     case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
       // Only check mask register overlaps.
       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
       assert(SSRCOp);
-      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+      return Result ? HazardFound : NoHazardFound;
     }
     default:
-      return false;
+      return NoHazardFound;
     }
   };
 
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
-    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
-    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
-        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
-      return true;
-
-    // VALU access to any SGPR or literal constant other than HazardReg
-    // mitigates hazard. No need to check HazardReg here as this will
-    // only be called when !IsHazardFn.
-    if (!SIInstrInfo::isVALU(I))
-      return false;
-    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
-      const MachineOperand &Op = I.getOperand(OpNo);
-      if (Op.isReg()) {
-        Register OpReg = Op.getReg();
-        // Only consider uses
-        if (!Op.isUse())
+  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+      AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+      0);
+  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+    switch (I.getOpcode()) {
+    case AMDGPU::S_WAITCNT_DEPCTR:
+      // Record mergable waits within region of instructions free of SGPR reads.
+      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+        WaitInstrs.push_back(&I);
+      break;
+    default:
+      // Update tracking of SGPR reads and writes.
+      for (auto &Op : I.operands()) {
+        if (!Op.isReg())
           continue;
-        // Ignore EXEC
-        if (OpReg == AMDGPU::EXEC ||
-            OpReg == AMDGPU::EXEC_LO ||
-            OpReg == AMDGPU::EXEC_HI)
+
+        Register Reg = Op.getReg();
+        if (IgnoreableSGPR(Reg))
          continue;
-        // Ignore all implicit uses except VCC
-        if (Op.isImplicit()) {
-          if (OpReg == AMDGPU::VCC ||
-              OpReg == AMDGPU::VCC_LO ||
-              OpReg == AMDGPU::VCC_HI)
-            return true;
+        if (!IsVCC(Reg)) {
+          if (Op.isImplicit())
+            continue;
+          if (!TRI->isSGPRReg(MRI, Reg))
+            continue;
+        }
+        if (Op.isUse()) {
+          HasSGPRRead = true;
           continue;
         }
-        if (TRI.isSGPRReg(MRI, OpReg))
-          return true;
-      } else {
-        const MCInstrDesc &InstDesc = I.getDesc();
-        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
-        if (!TII.isInlineConstant(Op, OpInfo))
-          return true;
+
+        // Stop tracking any SGPRs with writes on the basis that they will
+        // already have an appropriate wait inserted afterwards.
+        SmallVector<Register, 2> Found;
+        for (Register SGPR : State.HazardSGPRs) {
+          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+            Found.push_back(SGPR);
+        }
+        for (Register SGPR : Found)
+          State.HazardSGPRs.erase(SGPR);
       }
+      break;
     }
-    return false;
   };
 
   // Check for hazard
-  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
-      std::numeric_limits<int>::max())
+  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+                            MI->getParent(),
+                            std::next(MI->getReverseIterator())))
     return false;
 
-  auto NextMI = std::next(MI->getIterator());
+  // Compute counter mask
+  unsigned DepCtr =
+      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+  // Try to merge previous waits into this one for regions with no SGPR reads.
+  if (!WaitInstrs.empty()) {
+    // Note: WaitInstrs contains const pointers, so walk backward from MI to
+    // obtain a mutable pointer to each instruction to be merged.
+    // This is expected to be a very short walk within the same block.
+    SmallVector<MachineInstr *> ToErase;
+    unsigned Found = 0;
+    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+                                             End = MI->getParent()->rend();
+         Found < WaitInstrs.size() && It != End; ++It) {
+      MachineInstr *WaitMI = &*It;
+      // Find next wait instruction.
+      if (std::as_const(WaitMI) != WaitInstrs[Found])
+        continue;
+      Found++;
+      unsigned WaitMask = WaitMI->getOperand(0).getImm();
+      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+      ToErase.push_back(WaitMI);
+    }
+    assert(Found == WaitInstrs.size());
+    for (MachineInstr *WaitMI : ToErase)
+      WaitMI->eraseFromParent();
+  }
 
-  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+  // Add s_waitcnt_depctr after SGPR write.
+  auto NextMI = std::next(MI->getIterator());
   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
-                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+                   .addImm(DepCtr);
 
   // SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI); diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9db6d70..6a95881 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -3154,17 +3154,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -3172,6 +3174,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -3184,8 +3187,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -3237,8 +3241,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: 
v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -5856,6 +5861,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[10:11] @@ -6384,6 +6390,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -6983,6 +6990,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX1164_ITERATIVE-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -7669,17 +7677,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7687,6 +7697,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -7699,8 +7710,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -7748,8 +7760,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, v10, s8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7761,6 +7774,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -7775,8 +7789,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -12770,6 +12785,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12826,6 +12842,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) 
%result, ptr addrs ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13804,9 +13821,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h @@ -13815,6 +13833,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13863,6 +13882,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xf1ff ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] @@ -13873,6 +13893,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6167a84..08a4f0c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -957,6 +957,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -2640,17 +2641,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -2658,6 +2661,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -2670,13 +2674,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2701,6 +2707,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; 
GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -3288,23 +3295,26 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 @@ -3312,10 +3322,11 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffc ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -4337,6 +4348,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -6043,17 +6055,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp 
v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6061,6 +6075,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -6073,13 +6088,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6104,6 +6121,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -6724,6 +6742,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; 
GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -7441,6 +7460,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -8071,6 +8091,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -8787,6 +8808,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -9417,6 +9439,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -10133,6 +10156,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -10763,6 +10787,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -11041,6 +11066,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -11870,16 +11896,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 @@ -11887,8 +11916,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -11898,6 +11928,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 @@ -11912,14 +11943,17 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -11944,6 +11978,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -12577,6 +12612,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -12855,6 +12891,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -13684,16 +13721,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 @@ -13701,8 +13741,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -13712,6 +13753,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 @@ -13726,14 +13768,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -13758,6 +13803,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -14391,6 +14437,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -14665,6 +14712,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -15489,16 +15537,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, 
v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -15506,8 +15557,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -15517,6 +15569,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -15531,14 +15584,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15563,6 +15619,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; 
GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -16190,6 +16247,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -16465,6 +16523,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -17287,16 +17346,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17304,8 +17366,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -17315,6 +17378,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17329,14 +17393,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17361,6 +17428,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 0f59304..62f09de 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -515,6 +515,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
; GFX1164-NEXT: v_readlane_b32 s14, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s13, 32
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 5959f76..f6149cf 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1387,7 +1387,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s2, s0
; GFX11-NEXT: s_addc_u32 s1, s3, s1
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
index b7ee9f7..65832f8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
@@ -519,6 +519,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -649,6 +650,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -771,6 +773,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -885,6 +888,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1443,6 +1447,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1573,6 +1578,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1695,6 +1701,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1809,6 +1816,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2367,6 +2375,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2497,6 +2506,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2619,6 +2629,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2733,6 +2744,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3291,6 +3303,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3421,6 +3434,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3543,6 +3557,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3657,6 +3672,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 4581efc..ff4b658 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -963,14 +963,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1996,14 +1996,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3029,14 +3029,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4248,10 +4248,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -5511,10 +5512,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6774,10 +6776,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd570d9..15d4b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -963,14 +963,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1996,14 +1996,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3029,14 +3029,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4248,10 +4248,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -5511,10 +5512,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6774,10 +6776,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 0a2e7af..8103140 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1185,6 +1185,7 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 {
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB21_1
; GFX11-NEXT: s_endpgm
@@ -1595,6 +1596,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1,
; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xf1ff
; GFX11-NEXT: .LBB26_2: ; %bb
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
index 4205393..97c2bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
@@ -13,6 +13,7 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_nor:
@@ -57,8 +58,8 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_or_two_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index b21c781..5461532 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1239,6 +1239,7 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB11_6
; GFX11-NEXT: ; %bb.1: ; %bb
@@ -2066,6 +2067,7 @@ define amdgpu_ps void @scc_use_after_kill_inst(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB17_6
; GFX11-NEXT: ; %bb.1: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 1eabe62..e1d3ebc 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -38,6 +38,22 @@
  define amdgpu_gs void @mask_hazard_no_hazard1() { ret void }
  define amdgpu_gs void @mask_hazard_no_hazard2() { ret void }
  define amdgpu_gs void @mask_hazard_no_hazard3() { ret void }
+  define amdgpu_gs void @mask_hazard_cancel_hazard1() { ret void }
+  define amdgpu_gs void @mask_hazard_cancel_hazard2() { ret void }
+  define amdgpu_gs void @mask_hazard_cancel_hazard3() { ret void }
+  define amdgpu_gs void @mask_hazard_cancel_hazard4() { ret void }
+  define amdgpu_gs void @mask_hazard_partial_cancel1() { ret void }
+  define amdgpu_gs void @mask_hazard_partial_cancel2() { ret void }
+  define amdgpu_gs void @mask_hazard_partial_cancel3() { ret void }
+  define amdgpu_gs void @mask_hazard_partial_cancel4() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_readlane1() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_readlane2() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_readlane3() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_readfirstlane() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_vcmp_vcc() { ret void }
+  define amdgpu_gs void @mask_hazard_valu_vcmp_sgpr() { ret void }
+  define amdgpu_gs void @mask_hazard_combine1() { ret void }
+  define amdgpu_gs void @mask_hazard_combine2() { ret void }
...
---
@@ -487,8 +503,8 @@ body: |
; GFX11-LABEL: name: mask_hazard_subreg3
; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX11-NEXT: $sgpr2 = S_MOV_B32 0
-; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr3 = S_MOV_B32 0
+; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: mask_hazard_subreg3
@@ -655,3 +671,373 @@ body: |
    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
    S_ENDPGM 0
...
+
+---
+name: mask_hazard_cancel_hazard1
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cancel_hazard1
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+    ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GCN-NEXT: $vcc = S_MOV_B64 1
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc_lo = S_MOV_B32 0
+    $vcc_hi = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 $vcc_lo
+    $vcc = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard2
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cancel_hazard2
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GCN-NEXT: $vcc = S_MOV_B64 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GCN-NEXT: $vcc = S_MOV_B64 1
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc = S_MOV_B64 0
+    $sgpr0 = S_MOV_B32 $vcc_lo
+    $vcc = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard3
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cancel_hazard3
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $sgpr4 = S_MOV_B32 $sgpr0
+    $sgpr0_sgpr1 = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard4
+body: |
+  bb.0:
+    ; GCN-LABEL: name: mask_hazard_cancel_hazard4
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $sgpr1 = S_MOV_B32 0
+    $sgpr4 = S_MOV_B32 $sgpr0
+    $sgpr0_sgpr1 = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel1
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_partial_cancel1
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GFX11-NEXT: $vcc = S_MOV_B64 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_partial_cancel1
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
+    ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GFX12-NEXT: $vcc = S_MOV_B64 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc_lo = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 $vcc_lo
+    $vcc = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel2
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_partial_cancel2
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GFX11-NEXT: $vcc = S_MOV_B64 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_partial_cancel2
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
+    ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+    ; GFX12-NEXT: $vcc = S_MOV_B64 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vcc_hi = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 $vcc_lo
+    $vcc = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel3
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_partial_cancel3
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+    ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_partial_cancel3
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+    ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $sgpr3 = S_MOV_B32 $sgpr0
+    $sgpr0_sgpr1 = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel4
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_partial_cancel4
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+    ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_partial_cancel4
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+    ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+    ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+    ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr1 = S_MOV_B32 0
+    $sgpr3 = S_MOV_B32 $sgpr1
+    $sgpr0_sgpr1 = S_MOV_B64 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane1
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_readlane1
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_readlane1
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane2
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_readlane2
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_readlane2
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane3
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_readlane3
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_readlane3
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2 = V_READLANE_B32 $vgpr3, 0
+    $sgpr3 = V_READLANE_B32 $vgpr3, 1
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readfirstlane
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_readfirstlane
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_readfirstlane
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_vcc
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_vcmp_vcc
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65533
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_vcmp_vcc
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_sgpr
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_valu_vcmp_sgpr
+    ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_valu_vcmp_sgpr
+    ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine1
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_combine1
+    ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+    ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61948
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_combine1
+    ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+    ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $sgpr1 = S_MOV_B32 0
+    $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine2
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: mask_hazard_combine2
+    ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 65532
+    ; GFX11-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+    ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT_DEPCTR 61950
+    ; GFX11-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: mask_hazard_combine2
+    ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GFX12-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+    ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    ; GFX12-NEXT: S_ENDPGM 0
+    $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+    $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+    V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $sgpr1 = S_MOV_B32 $sgpr4
+    $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+    S_ENDPGM 0
+...
