Diffstat (limited to 'llvm/lib')
 llvm/lib/Analysis/CaptureTracking.cpp                    |   6
 llvm/lib/Analysis/LazyValueInfo.cpp                      |  28
 llvm/lib/IR/Metadata.cpp                                 |  35
 llvm/lib/IR/Verifier.cpp                                 |  25
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp              | 179
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                   |  53
 llvm/lib/Target/AMDGPU/SIInstrInfo.h                     |  28
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp                 |  10
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td              |   4
 llvm/lib/Transforms/IPO/GlobalOpt.cpp                    |  16
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp |  31
 llvm/lib/Transforms/Utils/Local.cpp                      |   6
 12 files changed, 263 insertions(+), 158 deletions(-)
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index a0fe7f9..22229d9 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -320,8 +320,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) {
return CaptureComponents::None;
case Instruction::Store:
// Stored the pointer - conservatively assume it may be captured.
+ if (U.getOperandNo() == 0)
+ return MDNode::toCaptureComponents(
+ I->getMetadata(LLVMContext::MD_captures));
+
// Volatile stores make the address observable.
- if (U.getOperandNo() == 0 || cast<StoreInst>(I)->isVolatile())
+ if (cast<StoreInst>(I)->isVolatile())
return CaptureComponents::All;
return CaptureComponents::None;
case Instruction::AtomicRMW: {
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 6fb2807..0e5bc48 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1632,19 +1632,25 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
*getValueFromCondition(Usr->getOperand(0), Condition,
isTrueDest, /*UseBlockValue*/ false);
- if (!OpLatticeVal.isConstantRange())
- return OpLatticeVal;
+ if (OpLatticeVal.isConstantRange()) {
+ const unsigned ResultBitWidth =
+ Usr->getType()->getScalarSizeInBits();
+ if (auto *Trunc = dyn_cast<TruncInst>(Usr))
+ return ValueLatticeElement::getRange(
+ OpLatticeVal.getConstantRange().truncate(
+ ResultBitWidth, Trunc->getNoWrapKind()));
- const unsigned ResultBitWidth =
- Usr->getType()->getScalarSizeInBits();
- if (auto *Trunc = dyn_cast<TruncInst>(Usr))
return ValueLatticeElement::getRange(
- OpLatticeVal.getConstantRange().truncate(
- ResultBitWidth, Trunc->getNoWrapKind()));
-
- return ValueLatticeElement::getRange(
- OpLatticeVal.getConstantRange().castOp(
- cast<CastInst>(Usr)->getOpcode(), ResultBitWidth));
+ OpLatticeVal.getConstantRange().castOp(
+ cast<CastInst>(Usr)->getOpcode(), ResultBitWidth));
+ }
+ if (OpLatticeVal.isConstant()) {
+ Constant *C = OpLatticeVal.getConstant();
+ if (auto *CastC = ConstantFoldCastOperand(
+ cast<CastInst>(Usr)->getOpcode(), C, Usr->getType(), DL))
+ return ValueLatticeElement::get(CastC);
+ }
+ return ValueLatticeElement::getOverdefined();
} else {
// If one of Val's operand has an inferred value, we may be able to
// infer the value of Val.
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 9cfb0ff..1add0c7 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -48,6 +48,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ModRef.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -1435,6 +1436,40 @@ MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) {
return B;
}
+CaptureComponents MDNode::toCaptureComponents(const MDNode *MD) {
+ if (!MD)
+ return CaptureComponents::All;
+
+ CaptureComponents CC = CaptureComponents::None;
+ for (Metadata *Op : MD->operands()) {
+ CaptureComponents Component =
+ StringSwitch<CaptureComponents>(cast<MDString>(Op)->getString())
+ .Case("address", CaptureComponents::Address)
+ .Case("address_is_null", CaptureComponents::AddressIsNull)
+ .Case("provenance", CaptureComponents::Provenance)
+ .Case("read_provenance", CaptureComponents::ReadProvenance);
+ CC |= Component;
+ }
+ return CC;
+}
+
+MDNode *MDNode::fromCaptureComponents(LLVMContext &Ctx, CaptureComponents CC) {
+ assert(!capturesNothing(CC) && "Can't encode captures(none)");
+ if (capturesAll(CC))
+ return nullptr;
+
+ SmallVector<Metadata *> Components;
+ if (capturesAddressIsNullOnly(CC))
+ Components.push_back(MDString::get(Ctx, "address_is_null"));
+ else if (capturesAddress(CC))
+ Components.push_back(MDString::get(Ctx, "address"));
+ if (capturesReadProvenanceOnly(CC))
+ Components.push_back(MDString::get(Ctx, "read_provenance"));
+ else if (capturesFullProvenance(CC))
+ Components.push_back(MDString::get(Ctx, "provenance"));
+ return MDNode::get(Ctx, Components);
+}
+
//===----------------------------------------------------------------------===//
// NamedMDNode implementation.
//
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 8c03d6f..6b3cd27 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -542,6 +542,7 @@ private:
void visitAliasScopeMetadata(const MDNode *MD);
void visitAliasScopeListMetadata(const MDNode *MD);
void visitAccessGroupMetadata(const MDNode *MD);
+ void visitCapturesMetadata(Instruction &I, const MDNode *Captures);
template <class Ty> bool isValidMetadataArray(const MDTuple &N);
#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N);
@@ -5373,6 +5374,27 @@ void Verifier::visitAccessGroupMetadata(const MDNode *MD) {
}
}
+void Verifier::visitCapturesMetadata(Instruction &I, const MDNode *Captures) {
+ static const char *ValidArgs[] = {"address_is_null", "address",
+ "read_provenance", "provenance"};
+
+ auto *SI = dyn_cast<StoreInst>(&I);
+ Check(SI, "!captures metadata can only be applied to store instructions", &I);
+ Check(SI->getValueOperand()->getType()->isPointerTy(),
+ "!captures metadata can only be applied to store with value operand of "
+ "pointer type",
+ &I);
+ Check(Captures->getNumOperands() != 0, "!captures metadata cannot be empty",
+ &I);
+
+ for (Metadata *Op : Captures->operands()) {
+ auto *Str = dyn_cast<MDString>(Op);
+ Check(Str, "!captures metadata must be a list of strings", &I);
+ Check(is_contained(ValidArgs, Str->getString()),
+ "invalid entry in !captures metadata", &I, Str);
+ }
+}
+
/// verifyInstruction - Verify that an instruction is well formed.
///
void Verifier::visitInstruction(Instruction &I) {
@@ -5600,6 +5622,9 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation))
visitAnnotationMetadata(Annotation);
+ if (MDNode *Captures = I.getMetadata(LLVMContext::MD_captures))
+ visitCapturesMetadata(I, Captures);
+
if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
visitMDNode(*N, AreDebugLocsAllowed::Yes);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f291191..3f9a1f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -495,13 +495,6 @@ public:
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
bool run(MachineFunction &MF);
- bool isForceEmitWaitcnt() const {
- for (auto T : inst_counter_types())
- if (ForceEmitWaitcnt[T])
- return true;
- return false;
- }
-
void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
@@ -570,10 +563,6 @@ public:
return VmemReadMapping[getVmemType(Inst)];
}
- bool hasXcnt() const { return ST->hasWaitXCnt(); }
-
- bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
- bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -591,7 +580,6 @@ public:
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
- static bool asynchronouslyWritesSCC(unsigned Opcode);
};
// This objects maintains the current score brackets of each wait counter, and
@@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
- if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+ if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
setRegScore(SCC, T, CurrScore);
PendingSCCWrite = &Inst;
}
@@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
return Modified;
}
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
// Currently all conventions wait, but this may not always be the case.
@@ -1871,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
assert(!MI.isMetaInstruction());
AMDGPU::Waitcnt Wait;
+ const unsigned Opc = MI.getOpcode();
// FIXME: This should have already been handled by the memory legalizer.
// Removing this currently doesn't affect any lit tests, but we need to
// verify that nothing was relying on this. The number of buffer invalidates
// being handled here should not be expanded.
- if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
- MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
- MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
+ if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
+ Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
+ Opc == AMDGPU::BUFFER_GL1_INV) {
Wait.LoadCnt = 0;
}
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
- MI.getOpcode() == AMDGPU::SI_RETURN ||
- MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
+ Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
+ Opc == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
@@ -1902,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// send a message to explicitly release all VGPRs before the stores have
// completed, but it is only safe to do this if there are no outstanding
// scratch stores.
- else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() &&
(MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
@@ -1912,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
- else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
- MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
ST->hasLegacyGeometry() &&
((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
@@ -1938,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
- if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
+ if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
@@ -1964,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait);
}
}
- } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+ } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
@@ -2061,7 +2039,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
- if (hasXcnt() && Op.isDef())
+ if (ST->hasWaitXCnt() && Op.isDef())
ScoreBrackets.determineWait(X_CNT, Interval, Wait);
}
}
@@ -2079,18 +2057,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
//
// In all other cases, ensure safety by ensuring that there are no outstanding
// memory operations.
- if (MI.getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
+ if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
+ !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
- if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- Wait.DsCnt = 0;
- }
+ if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ Wait.DsCnt = 0;
}
// Verify that the wait is actually needed.
@@ -2165,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
// XCnt may be already consumed by a load wait.
- if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
- !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
- Wait.XCnt = ~0u;
+ if (Wait.XCnt != ~0u) {
+ if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+ Wait.XCnt = ~0u;
- if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
- !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
- Wait.XCnt = ~0u;
+ if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+ Wait.XCnt = ~0u;
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory dependency
- // with another VMEM instruction in flight.
- if (Wait.XCnt != ~0u && isVmemAccess(*It))
- Wait.XCnt = ~0u;
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (isVmemAccess(*It))
+ Wait.XCnt = ~0u;
+ }
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
@@ -2185,75 +2162,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
return Modified;
}
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // All flat instructions use the VMEM counter except prefetch.
- if (!TII->usesVM_CNT(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access VMEM.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves VMEM.
- // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
- // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
- // (GDS) address space is not supported by flat operations. Therefore, simply
- // return true unless only the LDS address space is found.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- assert(AS != AMDGPUAS::REGION_ADDRESS);
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- return true;
- }
-
- return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
- if (!TII->usesLGKM_CNT(MI))
- return false;
-
- // If in tgsplit mode then there can be no use of LDS.
- if (ST->isTgSplitEnabled())
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access LDS.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves LDS.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
- return true;
- }
-
- return false;
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
- return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+ return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
}
-static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
- auto Opc = Inst.getOpcode();
- return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
- Opc == AMDGPU::GLOBAL_WBINV;
-}
-
// Return true if the next instruction is S_ENDPGM, following fallthrough
// blocks if necessary.
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -2331,7 +2244,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
- if (isGFX12CacheInvOrWBInst(Inst)) {
+ if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
return;
@@ -2341,14 +2254,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
int FlatASCount = 0;
- if (mayAccessVMEMThroughFlat(Inst)) {
+ if (TII->mayAccessVMEMThroughFlat(Inst)) {
++FlatASCount;
IsVMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
}
- if (mayAccessLDSThroughFlat(Inst)) {
+ if (TII->mayAccessLDSThroughFlat(Inst)) {
++FlatASCount;
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
@@ -2394,7 +2307,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
- } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+ } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
@@ -2413,7 +2326,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
}
- if (!hasXcnt())
+ if (!ST->hasWaitXCnt())
return;
if (IsVMEMAccess)
@@ -2478,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
if (!OldEventsHasSCCWrite) {
PendingSCCWrite = Other.PendingSCCWrite;
- } else {
- if (PendingSCCWrite != Other.PendingSCCWrite)
- PendingSCCWrite = nullptr;
+ } else if (PendingSCCWrite != Other.PendingSCCWrite) {
+ PendingSCCWrite = nullptr;
}
}
}
@@ -2516,12 +2428,6 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
-bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
- return Opcode == AMDGPU::S_BARRIER_LEAVE ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
-}
-
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
@@ -2578,7 +2484,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
- bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+ bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
// Don't examine operands unless we need to track vccz correctness.
if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
@@ -2701,7 +2607,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
if (SIInstrInfo::isFLAT(MI))
- return mayAccessVMEMThroughFlat(MI);
+ return TII->mayAccessVMEMThroughFlat(MI);
return SIInstrInfo::isVMEM(MI);
}
@@ -2724,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
for (MachineBasicBlock *MBB : ML->blocks()) {
for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {
- if (MI.mayLoad())
- HasVMemLoad = true;
- if (MI.mayStore())
- HasVMemStore = true;
+ HasVMemLoad |= MI.mayLoad();
+ HasVMemStore |= MI.mayStore();
}
+
for (const MachineOperand &Op : MI.all_uses()) {
if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea86..56435a5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
});
}
+bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
+ assert(isFLAT(MI));
+
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!usesVM_CNT(MI))
+ return false;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access VMEM.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves VMEM.
+ // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
+ // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
+ // (GDS) address space is not supported by flat operations. Therefore, simply
+ // return true unless only the LDS address space is found.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ assert(AS != AMDGPUAS::REGION_ADDRESS);
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ return true;
+ }
+
+ return false;
+}
+
+bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+ assert(isFLAT(MI));
+
+ // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
+ if (!usesLGKM_CNT(MI))
+ return false;
+
+ // If in tgsplit mode then there can be no use of LDS.
+ if (ST.isTgSplitEnabled())
+ return false;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access LDS.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves LDS.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+ return true;
+ }
+
+ return false;
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c2252af..754f52a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -688,6 +688,12 @@ public:
/// to not hit scratch.
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ /// \returns true for FLAT instructions that can access VMEM.
+ bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+ /// \returns true for FLAT instructions that can access LDS.
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
@@ -748,6 +754,18 @@ public:
return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}
+ static bool isSBarrierSCCWrite(unsigned Opcode) {
+ return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+ }
+
+ static bool isCBranchVCCZRead(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -1010,6 +1028,16 @@ public:
Opcode == AMDGPU::DS_GWS_BARRIER;
}
+ static bool isGFX12CacheInvOrWBInst(unsigned Opc) {
+ return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+ Opc == AMDGPU::GLOBAL_WBINV;
+ }
+
static bool isF16PseudoScalarTrans(unsigned Opcode) {
return Opcode == AMDGPU::V_S_EXP_F16_e64 ||
Opcode == AMDGPU::V_S_LOG_F16_e64 ||
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 70b6c7e..1e6b04f8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3793,6 +3793,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
return false;
// Operands 1 and 2 are commutable, if we switch the opcode.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+ case RISCV::QC_SELECTIEQ:
+ case RISCV::QC_SELECTINE:
+ case RISCV::QC_SELECTIIEQ:
+ case RISCV::QC_SELECTIINE:
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
case RISCV::QC_MVEQ:
case RISCV::QC_MVNE:
case RISCV::QC_MVLT:
@@ -4018,6 +4023,11 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1,
OpIdx2);
}
+ case RISCV::QC_SELECTIEQ:
+ case RISCV::QC_SELECTINE:
+ case RISCV::QC_SELECTIIEQ:
+ case RISCV::QC_SELECTIINE:
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
case RISCV::QC_MVEQ:
case RISCV::QC_MVNE:
case RISCV::QC_MVLT:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index ff4a040..5407868 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -524,7 +524,7 @@ class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11,
let Inst{30-20} = imm11;
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in
class QCISELECTIICC<bits<3> funct3, string opcodestr>
: RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2),
@@ -537,7 +537,7 @@ class QCISELECTIICC<bits<3> funct3, string opcodestr>
let rs2 = simm1;
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in
class QCISELECTICC<bits<3> funct3, string opcodestr>
: RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2),
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index f88d51f..99c4982 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV,
/// FastCC.
static void ChangeCalleesToFastCall(Function *F) {
for (User *U : F->users())
- cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
+ if (auto *Call = dyn_cast<CallBase>(U))
+ if (Call->getCalledOperand() == F)
+ Call->setCallingConv(CallingConv::Fast);
}
static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
@@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F,
return false;
for (User *U : F.users()) {
- CallBase &CB = cast<CallBase>(*U);
- Function *CallerFunc = CB.getParent()->getParent();
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB || CB->getCalledOperand() != &F)
+ continue;
+ Function *CallerFunc = CB->getParent()->getParent();
BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
- if (!isColdCallSite(CB, CallerBFI))
+ if (!isColdCallSite(*CB, CallerBFI))
return false;
if (!llvm::is_contained(AllCallsCold, CallerFunc))
return false;
@@ -1779,7 +1783,9 @@ isValidCandidateForColdCC(Function &F,
static void changeCallSitesToColdCC(Function *F) {
for (User *U : F->users())
- cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
+ if (auto *Call = dyn_cast<CallBase>(U))
+ if (Call->getCalledOperand() == F)
+ Call->setCallingConv(CallingConv::Cold);
}
// This function iterates over all the call instructions in the input Function
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 8fbaf68..ff063f9 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
// - or: pick -1
// - select's condition: if the true value is constant, choose it by making
// the condition true.
+ // - phi: pick the common constant across operands
// - default: pick 0
//
// Note that this transform is intentionally done here rather than
@@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
// TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid
// duplicating logic for binops at least.
auto getUndefReplacement = [&](Type *Ty) {
- Value *BestValue = nullptr;
+ auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * {
+ // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be
+ // removed.
+ Constant *BestValue = nullptr;
+ for (Value *V : PN.incoming_values()) {
+ if (match(V, m_Freeze(m_Undef())))
+ continue;
+
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ if (!isGuaranteedNotToBeUndefOrPoison(C))
+ return nullptr;
+
+ if (BestValue && BestValue != C)
+ return nullptr;
+
+ BestValue = C;
+ }
+ return BestValue;
+ };
+
Value *NullValue = Constant::getNullValue(Ty);
- for (const auto *U : I.users()) {
+ Value *BestValue = nullptr;
+ for (auto *U : I.users()) {
Value *V = NullValue;
if (match(U, m_Or(m_Value(), m_Value())))
V = ConstantInt::getAllOnesValue(Ty);
@@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) {
if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT))
V = NullValue;
+ } else if (auto *PHI = dyn_cast<PHINode>(U)) {
+ if (Value *MaybeV = pickCommonConstantFromPHI(*PHI))
+ V = MaybeV;
}
if (!BestValue)
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 123881e..21b2652 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J,
// Preserve !nosanitize if both K and J have it.
K->setMetadata(Kind, JMD);
break;
+ case LLVMContext::MD_captures:
+ K->setMetadata(
+ Kind, MDNode::fromCaptureComponents(
+ K->getContext(), MDNode::toCaptureComponents(JMD) |
+ MDNode::toCaptureComponents(KMD)));
+ break;
}
}
// Set !invariant.group from J if J has it. If both instructions have it
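For context, here is a minimal sketch of the textual IR this patch is wiring up. It is illustrative only, based on the operand strings the Verifier hunk above accepts, and is not taken from this commit's tests: the !captures attachment goes on a store whose value operand is a pointer, and names which components of that pointer escape through the store.

  define void @example(ptr %p, ptr %out) {
    ; Only the address of %p escapes through this store, not its provenance,
    ; so DetermineUseCaptureKind can report CaptureComponents::Address for
    ; this use instead of CaptureComponents::All.
    store ptr %p, ptr %out, !captures !0
    ret void
  }

  !0 = !{!"address"}

Per the Verifier and Local.cpp hunks, the attachment must be a non-empty list of strings drawn from "address_is_null", "address", "read_provenance" and "provenance", and combineMetadata merges two attachments by taking the union of their capture components.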