Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Analysis/CaptureTracking.cpp | 6
-rw-r--r-- | llvm/lib/Analysis/LazyValueInfo.cpp | 28
-rw-r--r-- | llvm/lib/IR/Metadata.cpp | 35
-rw-r--r-- | llvm/lib/IR/Verifier.cpp | 25
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 179
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 28
-rw-r--r-- | llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 10
-rw-r--r-- | llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4
-rw-r--r-- | llvm/lib/Transforms/IPO/GlobalOpt.cpp | 16
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 31
-rw-r--r-- | llvm/lib/Transforms/Utils/Local.cpp | 6
12 files changed, 263 insertions, 158 deletions
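A common thread in the CaptureTracking, Metadata, Verifier, and Local changes below is the new !captures store metadata, which lets a store declare which components of the stored pointer (address, provenance, and so on) may actually be captured. The following is a minimal sketch, assuming the MDNode::fromCaptureComponents helper added in Metadata.cpp and the new LLVMContext::MD_captures kind used by these hunks; the function name markStoreCapturesAddressOnly is illustrative only and not part of the patch.

#include <cassert>

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"

using namespace llvm;

// Tag a store so capture analysis may treat the stored pointer as capturing
// only its address, not its provenance.
static void markStoreCapturesAddressOnly(StoreInst &SI) {
  // The verifier only accepts !captures on stores of pointer-typed values.
  assert(SI.getValueOperand()->getType()->isPointerTy());
  MDNode *MD = MDNode::fromCaptureComponents(SI.getContext(),
                                             CaptureComponents::Address);
  SI.setMetadata(LLVMContext::MD_captures, MD);
}

In the first hunk below, DetermineUseCaptureKind reads such an attachment via MDNode::toCaptureComponents instead of conservatively returning CaptureComponents::All for the stored value operand.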
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index a0fe7f9..22229d9 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -320,8 +320,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { return CaptureComponents::None; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. + if (U.getOperandNo() == 0) + return MDNode::toCaptureComponents( + I->getMetadata(LLVMContext::MD_captures)); + // Volatile stores make the address observable. - if (U.getOperandNo() == 0 || cast<StoreInst>(I)->isVolatile()) + if (cast<StoreInst>(I)->isVolatile()) return CaptureComponents::All; return CaptureComponents::None; case Instruction::AtomicRMW: { diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 6fb2807..0e5bc48 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1632,19 +1632,25 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, *getValueFromCondition(Usr->getOperand(0), Condition, isTrueDest, /*UseBlockValue*/ false); - if (!OpLatticeVal.isConstantRange()) - return OpLatticeVal; + if (OpLatticeVal.isConstantRange()) { + const unsigned ResultBitWidth = + Usr->getType()->getScalarSizeInBits(); + if (auto *Trunc = dyn_cast<TruncInst>(Usr)) + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().truncate( + ResultBitWidth, Trunc->getNoWrapKind())); - const unsigned ResultBitWidth = - Usr->getType()->getScalarSizeInBits(); - if (auto *Trunc = dyn_cast<TruncInst>(Usr)) return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().truncate( - ResultBitWidth, Trunc->getNoWrapKind())); - - return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().castOp( - cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + OpLatticeVal.getConstantRange().castOp( + cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + } + if (OpLatticeVal.isConstant()) { + Constant *C = OpLatticeVal.getConstant(); + if (auto *CastC = ConstantFoldCastOperand( + cast<CastInst>(Usr)->getOpcode(), C, Usr->getType(), DL)) + return ValueLatticeElement::get(CastC); + } + return ValueLatticeElement::getOverdefined(); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. 
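The LazyValueInfo hunk above adds a fallback for the case where the operand's inferred lattice value is a single constant rather than a constant range: the constant is folded through the cast with ConstantFoldCastOperand, and anything else becomes overdefined. Previously the non-range case simply returned the operand's lattice value unchanged. A minimal sketch of that fallback follows; the helper name castConstantLatticeValue is illustrative and does not exist in the patch.

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Fold a single-constant lattice value through a cast; give up (overdefined)
// if the operand is not a foldable constant.
static ValueLatticeElement
castConstantLatticeValue(const ValueLatticeElement &Op,
                         Instruction::CastOps Opcode, Type *DestTy,
                         const DataLayout &DL) {
  if (Op.isConstant())
    if (Constant *Folded =
            ConstantFoldCastOperand(Opcode, Op.getConstant(), DestTy, DL))
      return ValueLatticeElement::get(Folded);
  return ValueLatticeElement::getOverdefined();
}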
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 9cfb0ff..1add0c7 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include <cassert> #include <cstddef> #include <cstdint> @@ -1435,6 +1436,40 @@ MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { return B; } +CaptureComponents MDNode::toCaptureComponents(const MDNode *MD) { + if (!MD) + return CaptureComponents::All; + + CaptureComponents CC = CaptureComponents::None; + for (Metadata *Op : MD->operands()) { + CaptureComponents Component = + StringSwitch<CaptureComponents>(cast<MDString>(Op)->getString()) + .Case("address", CaptureComponents::Address) + .Case("address_is_null", CaptureComponents::AddressIsNull) + .Case("provenance", CaptureComponents::Provenance) + .Case("read_provenance", CaptureComponents::ReadProvenance); + CC |= Component; + } + return CC; +} + +MDNode *MDNode::fromCaptureComponents(LLVMContext &Ctx, CaptureComponents CC) { + assert(!capturesNothing(CC) && "Can't encode captures(none)"); + if (capturesAll(CC)) + return nullptr; + + SmallVector<Metadata *> Components; + if (capturesAddressIsNullOnly(CC)) + Components.push_back(MDString::get(Ctx, "address_is_null")); + else if (capturesAddress(CC)) + Components.push_back(MDString::get(Ctx, "address")); + if (capturesReadProvenanceOnly(CC)) + Components.push_back(MDString::get(Ctx, "read_provenance")); + else if (capturesFullProvenance(CC)) + Components.push_back(MDString::get(Ctx, "provenance")); + return MDNode::get(Ctx, Components); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c03d6f..6b3cd27 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -542,6 +542,7 @@ private: void visitAliasScopeMetadata(const MDNode *MD); void visitAliasScopeListMetadata(const MDNode *MD); void visitAccessGroupMetadata(const MDNode *MD); + void visitCapturesMetadata(Instruction &I, const MDNode *Captures); template <class Ty> bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -5373,6 +5374,27 @@ void Verifier::visitAccessGroupMetadata(const MDNode *MD) { } } +void Verifier::visitCapturesMetadata(Instruction &I, const MDNode *Captures) { + static const char *ValidArgs[] = {"address_is_null", "address", + "read_provenance", "provenance"}; + + auto *SI = dyn_cast<StoreInst>(&I); + Check(SI, "!captures metadata can only be applied to store instructions", &I); + Check(SI->getValueOperand()->getType()->isPointerTy(), + "!captures metadata can only be applied to store with value operand of " + "pointer type", + &I); + Check(Captures->getNumOperands() != 0, "!captures metadata cannot be empty", + &I); + + for (Metadata *Op : Captures->operands()) { + auto *Str = dyn_cast<MDString>(Op); + Check(Str, "!captures metadata must be a list of strings", &I); + Check(is_contained(ValidArgs, Str->getString()), + "invalid entry in !captures metadata", &I, Str); + } +} + /// verifyInstruction - Verify that an instruction is well formed. 
/// void Verifier::visitInstruction(Instruction &I) { @@ -5600,6 +5622,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation)) visitAnnotationMetadata(Annotation); + if (MDNode *Captures = I.getMetadata(LLVMContext::MD_captures)) + visitCapturesMetadata(I, Captures); + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f291191..3f9a1f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -495,13 +495,6 @@ public: bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - bool isForceEmitWaitcnt() const { - for (auto T : inst_counter_types()) - if (ForceEmitWaitcnt[T]) - return true; - return false; - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -570,10 +563,6 @@ public: return VmemReadMapping[getVmemType(Inst)]; } - bool hasXcnt() const { return ST->hasWaitXCnt(); } - - bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; - bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -591,7 +580,6 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(FIRST_LDS_VGPR, T, CurrScore); } - if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(SCC, T, CurrScore); PendingSCCWrite = &Inst; } @@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - !MI.getOperand(1).isUndef(); -} - /// \returns true if the callee inserts an s_waitcnt 0 on function entry. static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { // Currently all conventions wait, but this may not always be the case. @@ -1871,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, assert(!MI.isMetaInstruction()); AMDGPU::Waitcnt Wait; + const unsigned Opc = MI.getOpcode(); // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to // verify that nothing was relying on this. The number of buffer invalidates // being handled here should not be expanded. 
- if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || - MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || - MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { + if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || + Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || + Opc == AMDGPU::BUFFER_GL1_INV) { Wait.LoadCnt = 0; } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || + Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || + Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } @@ -1902,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // send a message to explicitly release all VGPRs before the stores have // completed, but it is only safe to do this if there are no outstanding // scratch stores. - else if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { if (!WCG->isOptNone() && (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && @@ -1912,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { @@ -1938,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Wait for any pending GDS instruction to complete before any // "Always GDS" instruction. - if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) + if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { @@ -1964,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait); } } - } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + } else if (Opc == AMDGPU::S_BARRIER_WAIT) { ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. @@ -2061,7 +2039,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } - if (hasXcnt() && Op.isDef()) + if (ST->hasWaitXCnt() && Op.isDef()) ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } @@ -2079,18 +2057,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. 
- if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { + if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && + !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.DsCnt = 0; - } + if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.DsCnt = 0; } // Verify that the wait is actually needed. @@ -2165,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } // XCnt may be already consumed by a load wait. - if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; - if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory dependency - // with another VMEM instruction in flight. - if (Wait.XCnt != ~0u && isVmemAccess(*It)) - Wait.XCnt = ~0u; + // Since the translation for VMEM addresses occur in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2185,75 +2162,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, return Modified; } -// This is a flat memory operation. Check to see if it has memory tokens other -// than LDS. Other address spaces supported by flat memory operations involve -// global memory. -bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // All flat instructions use the VMEM counter except prefetch. - if (!TII->usesVM_CNT(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access VMEM. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves VMEM. - // Flat operations only supported FLAT, LOCAL (LDS), or address spaces - // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION - // (GDS) address space is not supported by flat operations. Therefore, simply - // return true unless only the LDS address space is found. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - assert(AS != AMDGPUAS::REGION_ADDRESS); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - return true; - } - - return false; -} - -// This is a flat memory operation. Check to see if it has memory tokens for -// either LDS or FLAT. -bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. - if (!TII->usesLGKM_CNT(MI)) - return false; - - // If in tgsplit mode then there can be no use of LDS. 
- if (ST->isTgSplitEnabled()) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access LDS. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves LDS. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) - return true; - } - - return false; -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { - return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); } -static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { - auto Opc = Inst.getOpcode(); - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; -} - // Return true if the next instruction is S_ENDPGM, following fallthrough // blocks if necessary. bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, @@ -2331,7 +2244,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - if (isGFX12CacheInvOrWBInst(Inst)) { + if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); return; @@ -2341,14 +2254,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); } - if (mayAccessLDSThroughFlat(Inst)) { + if (TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } @@ -2394,7 +2307,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); - } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { + } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { @@ -2413,7 +2326,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!hasXcnt()) + if (!ST->hasWaitXCnt()) return; if (IsVMEMAccess) @@ -2478,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); if (!OldEventsHasSCCWrite) { PendingSCCWrite = Other.PendingSCCWrite; - } else { - if (PendingSCCWrite != Other.PendingSCCWrite) - PendingSCCWrite = nullptr; + } else if (PendingSCCWrite != Other.PendingSCCWrite) { + PendingSCCWrite = nullptr; } } } @@ -2516,12 +2428,6 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } -bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { - return Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; -} - // Generate s_waitcnt instructions where needed. 
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2578,7 +2484,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. - bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); + bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst); // Don't examine operands unless we need to track vccz correctness. if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { @@ -2701,7 +2607,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush( bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { if (SIInstrInfo::isFLAT(MI)) - return mayAccessVMEMThroughFlat(MI); + return TII->mayAccessVMEMThroughFlat(MI); return SIInstrInfo::isVMEM(MI); } @@ -2724,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) - HasVMemLoad = true; - if (MI.mayStore()) - HasVMemStore = true; + HasVMemLoad |= MI.mayLoad(); + HasVMemStore |= MI.mayStore(); } + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044ea86..56435a5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { }); } +bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // All flat instructions use the VMEM counter except prefetch. + if (!usesVM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. + // Flat operations only supported FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!usesLGKM_CNT(MI)) + return false; + + // If in tgsplit mode then there can be no use of LDS. + if (ST.isTgSplitEnabled()) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves LDS. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) + return true; + } + + return false; +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. 
There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c2252af..754f52a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -688,6 +688,12 @@ public: /// to not hit scratch. bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + /// \returns true for FLAT instructions that can access VMEM. + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; + + /// \returns true for FLAT instructions that can access LDS. + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: @@ -748,6 +754,18 @@ public: return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; } + static bool isSBarrierSCCWrite(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; + } + + static bool isCBranchVCCZRead(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -1010,6 +1028,11 @@ public: Opcode == AMDGPU::DS_GWS_BARRIER; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 70b6c7e..1e6b04f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3793,6 +3793,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: @@ -4018,6 +4023,11 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ff4a040..5407868 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -524,7 +524,7 @@ class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11, let Inst{30-20} = imm11; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTIICC<bits<3> funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2), @@ -537,7 +537,7 @@ class QCISELECTIICC<bits<3> funct3, string opcodestr> let rs2 = simm1; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTICC<bits<3> funct3, string opcodestr> : RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2), diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index f88d51f..99c4982 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV, /// FastCC. 
static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Fast); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Fast); } static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, @@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F, return false; for (User *U : F.users()) { - CallBase &CB = cast<CallBase>(*U); - Function *CallerFunc = CB.getParent()->getParent(); + CallBase *CB = dyn_cast<CallBase>(U); + if (!CB || CB->getCalledOperand() != &F) + continue; + Function *CallerFunc = CB->getParent()->getParent(); BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); - if (!isColdCallSite(CB, CallerBFI)) + if (!isColdCallSite(*CB, CallerBFI)) return false; if (!llvm::is_contained(AllCallsCold, CallerFunc)) return false; @@ -1779,7 +1783,9 @@ isValidCandidateForColdCC(Function &F, static void changeCallSitesToColdCC(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Cold); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Cold); } // This function iterates over all the call instructions in the input Function diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 8fbaf68..ff063f9 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // - or: pick -1 // - select's condition: if the true value is constant, choose it by making // the condition true. + // - phi: pick the common constant across operands // - default: pick 0 // // Note that this transform is intentionally done here rather than @@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid // duplicating logic for binops at least. auto getUndefReplacement = [&](Type *Ty) { - Value *BestValue = nullptr; + auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * { + // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be + // removed. 
+ Constant *BestValue = nullptr; + for (Value *V : PN.incoming_values()) { + if (match(V, m_Freeze(m_Undef()))) + continue; + + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; + + if (!isGuaranteedNotToBeUndefOrPoison(C)) + return nullptr; + + if (BestValue && BestValue != C) + return nullptr; + + BestValue = C; + } + return BestValue; + }; + Value *NullValue = Constant::getNullValue(Ty); - for (const auto *U : I.users()) { + Value *BestValue = nullptr; + for (auto *U : I.users()) { Value *V = NullValue; if (match(U, m_Or(m_Value(), m_Value()))) V = ConstantInt::getAllOnesValue(Ty); @@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) { if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT)) V = NullValue; + } else if (auto *PHI = dyn_cast<PHINode>(U)) { + if (Value *MaybeV = pickCommonConstantFromPHI(*PHI)) + V = MaybeV; } if (!BestValue) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 123881e..21b2652 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, // Preserve !nosanitize if both K and J have it. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_captures: + K->setMetadata( + Kind, MDNode::fromCaptureComponents( + K->getContext(), MDNode::toCaptureComponents(JMD) | + MDNode::toCaptureComponents(KMD))); + break; } } // Set !invariant.group from J if J has it. If both instructions have it |
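When two instructions are merged, combineMetadata (last hunk above) now unions their !captures attachments instead of dropping one of them. Below is a small sketch of that rule, assuming the MDNode helpers added in Metadata.cpp; the helper name mergeCapturesAttachments is illustrative only.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"

using namespace llvm;

// Union of two !captures attachments. A missing node means "may capture
// everything", so toCaptureComponents(nullptr) yields CaptureComponents::All,
// and fromCaptureComponents returns nullptr when the union covers everything,
// i.e. the merged instruction keeps no !captures metadata in that case.
static MDNode *mergeCapturesAttachments(LLVMContext &Ctx, MDNode *A,
                                        MDNode *B) {
  CaptureComponents CC =
      MDNode::toCaptureComponents(A) | MDNode::toCaptureComponents(B);
  return MDNode::fromCaptureComponents(Ctx, CC);
}

As with the other kinds handled by combineMetadata, the merged attachment has to remain correct for both original instructions, so the capture components can only grow toward "captures everything", never shrink.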