Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.td                      |  10
-rw-r--r--   llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |   3
-rw-r--r--   llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp        | 238
-rw-r--r--   llvm/lib/Target/AMDGPU/GCNSubtarget.h                 |   5
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp          |   3
-rw-r--r--   llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp             |   6
-rw-r--r--   llvm/lib/Target/AMDGPU/SOPInstructions.td             |   7
7 files changed, 204 insertions, 68 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1c8383c..54d94b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1466,6 +1466,13 @@ def FeatureClusters : SubtargetFeature< "clusters",
   "Has clusters of workgroups support"
 >;
 
+def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
+  "waits-before-system-scope-stores",
+  "RequiresWaitsBeforeSystemScopeStores",
+  "true",
+  "Target requires waits for loads and atomics before system scope stores"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -2060,7 +2067,8 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureMaxHardClauseLength32,
    Feature1_5xVGPRs,
    FeatureMemoryAtomicFAddF32DenormalSupport,
-   FeatureBVHDualAndBVH8Insts
+   FeatureBVHDualAndBVH8Insts,
+   FeatureWaitsBeforeSystemScopeStores,
   ]>;
 
 def FeatureISAVersion12_50 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5580e4c..09338c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -9028,6 +9028,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
 
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
+  // Parse a dummy operand as a placeholder for the SWZ operand. This enforces
+  // agreement between MCInstrDesc.getNumOperands and MCInst.getNumOperands.
+  Inst.addOperand(MCOperand::createImm(0));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a911e7e..52cc4ca 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     return false;
 
   assert(!ST.hasExtendedWaitCounts());
 
-  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+  if (!ST.isWave64())
+    return false;
+
+  const bool IsSALU = SIInstrInfo::isSALU(*MI);
+  const bool IsVALU = SIInstrInfo::isVALU(*MI);
+  if (!IsSALU && !IsVALU)
     return false;
 
   // The hazard sequence is three instructions:
   //   1. VALU reads SGPR as mask
-  //   2. SALU writes SGPR
-  //   3. SALU reads SGPR
-  // The hazard can expire if the distance between 2 and 3 is sufficient.
-  // In practice this happens <10% of the time, hence this always assumes
-  // the hazard exists if 1 and 2 are present to avoid searching.
+  //   2. VALU/SALU writes SGPR
+  //   3. VALU/SALU reads SGPR
+  // The hazard can expire if the distance between 2 and 3 is sufficient,
+  // or (2) is VALU and (3) is SALU.
+  // In practice this happens <10% of the time, hence always assume the hazard
+  // exists if (1) and (2) are present to avoid searching all SGPR reads.
 
-  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
-  if (!SDSTOp || !SDSTOp->isReg())
-    return false;
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IgnoreableSGPR = [](const Register Reg) {
+    switch (Reg) {
+    case AMDGPU::EXEC:
+    case AMDGPU::EXEC_LO:
+    case AMDGPU::EXEC_HI:
+    case AMDGPU::M0:
+    case AMDGPU::SGPR_NULL:
+    case AMDGPU::SGPR_NULL64:
+    case AMDGPU::SCC:
+      return true;
+    default:
+      return false;
+    }
+  };
+  auto IsVCC = [](const Register Reg) {
+    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+  };
+
+  struct StateType {
+    SmallSet<Register, 2> HazardSGPRs;
+
+    static unsigned getHashValue(const StateType &State) {
+      return hash_combine_range(State.HazardSGPRs);
+    }
+    static bool isEqual(const StateType &LHS, const StateType &RHS) {
+      return LHS.HazardSGPRs == RHS.HazardSGPRs;
+    }
+  };
+
+  SmallVector<const MachineInstr *> WaitInstrs;
+  bool HasSGPRRead = false;
+  StateType InitialState;
+
+  // Look for SGPR write.
+  MachineOperand *HazardDef = nullptr;
+  for (MachineOperand &Op : MI->operands()) {
+    if (!Op.isReg())
+      continue;
+    if (Op.isDef() && HazardDef)
+      continue;
+
+    Register Reg = Op.getReg();
+    if (IgnoreableSGPR(Reg))
+      continue;
+    if (!IsVCC(Reg)) {
+      if (Op.isImplicit())
+        continue;
+      if (!TRI->isSGPRReg(MRI, Reg))
+        continue;
+    }
+    // Also check for SGPR reads.
+    if (Op.isUse()) {
+      HasSGPRRead = true;
+      continue;
+    }
+
+    assert(!HazardDef);
+    HazardDef = &Op;
+  }
 
-  const Register HazardReg = SDSTOp->getReg();
-  if (HazardReg == AMDGPU::EXEC ||
-      HazardReg == AMDGPU::EXEC_LO ||
-      HazardReg == AMDGPU::EXEC_HI ||
-      HazardReg == AMDGPU::M0)
+  if (!HazardDef)
     return false;
 
-  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+  // Setup to track writes to individual SGPRs
+  const Register HazardReg = HazardDef->getReg();
+  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+    InitialState.HazardSGPRs.insert(HazardReg);
+  } else {
+    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+  }
+
+  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+    if (State.HazardSGPRs.empty())
+      return HazardExpired;
+
     switch (I.getOpcode()) {
     case AMDGPU::V_ADDC_U32_e32:
     case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     case AMDGPU::V_SUBB_U32_e32:
     case AMDGPU::V_SUBB_U32_dpp:
     case AMDGPU::V_SUBBREV_U32_e32:
-    case AMDGPU::V_SUBBREV_U32_dpp:
+    case AMDGPU::V_SUBBREV_U32_dpp: {
       // These implicitly read VCC as mask source.
-      return HazardReg == AMDGPU::VCC ||
-             HazardReg == AMDGPU::VCC_LO ||
-             HazardReg == AMDGPU::VCC_HI;
+      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+    }
     case AMDGPU::V_ADDC_U32_e64:
     case AMDGPU::V_ADDC_U32_e64_dpp:
     case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
       // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
       assert(SSRCOp);
-      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+      return Result ? HazardFound : NoHazardFound;
     }
     default:
-      return false;
+      return NoHazardFound;
     }
   };
 
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
-    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
-    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
-        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
-      return true;
-
-    // VALU access to any SGPR or literal constant other than HazardReg
-    // mitigates hazard. No need to check HazardReg here as this will
-    // only be called when !IsHazardFn.
-    if (!SIInstrInfo::isVALU(I))
-      return false;
-    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
-      const MachineOperand &Op = I.getOperand(OpNo);
-      if (Op.isReg()) {
-        Register OpReg = Op.getReg();
-        // Only consider uses
-        if (!Op.isUse())
+  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+      AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+      0);
+  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+    switch (I.getOpcode()) {
+    case AMDGPU::S_WAITCNT_DEPCTR:
+      // Record mergable waits within region of instructions free of SGPR reads.
+      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+        WaitInstrs.push_back(&I);
+      break;
+    default:
+      // Update tracking of SGPR reads and writes.
+      for (auto &Op : I.operands()) {
+        if (!Op.isReg())
           continue;
-        // Ignore EXEC
-        if (OpReg == AMDGPU::EXEC ||
-            OpReg == AMDGPU::EXEC_LO ||
-            OpReg == AMDGPU::EXEC_HI)
+
+        Register Reg = Op.getReg();
+        if (IgnoreableSGPR(Reg))
           continue;
-        // Ignore all implicit uses except VCC
-        if (Op.isImplicit()) {
-          if (OpReg == AMDGPU::VCC ||
-              OpReg == AMDGPU::VCC_LO ||
-              OpReg == AMDGPU::VCC_HI)
-            return true;
+        if (!IsVCC(Reg)) {
+          if (Op.isImplicit())
+            continue;
+          if (!TRI->isSGPRReg(MRI, Reg))
+            continue;
+        }
+        if (Op.isUse()) {
+          HasSGPRRead = true;
           continue;
         }
-        if (TRI.isSGPRReg(MRI, OpReg))
-          return true;
-      } else {
-        const MCInstrDesc &InstDesc = I.getDesc();
-        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
-        if (!TII.isInlineConstant(Op, OpInfo))
-          return true;
+
+        // Stop tracking any SGPRs with writes on the basis that they will
+        // already have an appropriate wait inserted afterwards.
+        SmallVector<Register, 2> Found;
+        for (Register SGPR : State.HazardSGPRs) {
+          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+            Found.push_back(SGPR);
+        }
+        for (Register SGPR : Found)
+          State.HazardSGPRs.erase(SGPR);
       }
+      break;
     }
-    return false;
   };
 
   // Check for hazard
-  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
-      std::numeric_limits<int>::max())
+  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+                            MI->getParent(),
+                            std::next(MI->getReverseIterator())))
     return false;
 
-  auto NextMI = std::next(MI->getIterator());
+  // Compute counter mask
+  unsigned DepCtr =
+      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
+             : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+  // Try to merge previous waits into this one for regions with no SGPR reads.
+  if (!WaitInstrs.empty()) {
+    // Note: WaitInstrs contains const pointers, so walk backward from MI to
+    // obtain a mutable pointer to each instruction to be merged.
+    // This is expected to be a very short walk within the same block.
+    SmallVector<MachineInstr *> ToErase;
+    unsigned Found = 0;
+    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+                                             End = MI->getParent()->rend();
+         Found < WaitInstrs.size() && It != End; ++It) {
+      MachineInstr *WaitMI = &*It;
+      // Find next wait instruction.
+      if (std::as_const(WaitMI) != WaitInstrs[Found])
+        continue;
+      Found++;
+      unsigned WaitMask = WaitMI->getOperand(0).getImm();
+      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+      ToErase.push_back(WaitMI);
+    }
+    assert(Found == WaitInstrs.size());
+    for (MachineInstr *WaitMI : ToErase)
+      WaitMI->eraseFromParent();
+  }
 
-  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+  // Add s_waitcnt_depctr after SGPR write.
+  auto NextMI = std::next(MI->getIterator());
   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
-                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+                   .addImm(DepCtr);
 
   // SALU write may be s_getpc in a bundle.
   updateGetPCBundle(NewMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ac660d5..f377b8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -290,6 +290,7 @@ protected:
   bool Has45BitNumRecordsBufferResource = false;
 
   bool HasClusters = false;
+  bool RequiresWaitsBeforeSystemScopeStores = false;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable = false;
@@ -1861,6 +1862,10 @@ public:
   bool has45BitNumRecordsBufferResource() const {
     return Has45BitNumRecordsBufferResource;
   }
+
+  bool requiresWaitsBeforeSystemScopeStores() const {
+    return RequiresWaitsBeforeSystemScopeStores;
+  }
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index a177a42..6ab8d552 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2673,7 +2673,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
   const unsigned Scope = CPol->getImm() & CPol::SCOPE;
 
   // GFX12.0 only: Extra waits needed before system scope stores.
-  if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+  if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
+      Scope == CPol::SCOPE_SYS)
     Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
 
   return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ebd2e7e..d80a6f3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1874,9 +1874,13 @@ void SIRegisterInfo::buildSpillLoadStore(
     }
 
     bool IsSrcDstDef = SrcDstRegState & RegState::Define;
+    bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
     if (NeedSuperRegImpOperand &&
-        (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
+        (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+      if (PartialReloadCopy)
+        MIB.addReg(ValueReg, RegState::Implicit);
+    }
 
     // The epilog restore of a wwm-scratch register can cause undesired
     // optimization during machine-cp post PrologEpilogInserter if the same
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b6..1931e0b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -838,9 +838,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
   let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
 }
 
-let Defs = [SCC] in {
-def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
-} // End Defs = [SCC]
+let isCommutable = 1, Defs = [SCC] in
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32",
+  [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))]
+>;
 
 let SubtargetPredicate = isGFX8GFX9 in {
 def S_RFE_RESTORE_B64 : SOP2_Pseudo <
