diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 28 |
1 files changed, 24 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced..b7fa899 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + // On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + auto applyPendingXcntGroup = [this](unsigned E) { + unsigned LowerBound = getScoreLB(X_CNT); + applyWaitcnt(X_CNT, 0); + PendingEvents |= (1 << E); + setScoreLB(X_CNT, LowerBound); + }; + // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { + if (hasPendingEvent(VMEM_GROUP)) + applyPendingXcntGroup(VMEM_GROUP); + else + applyWaitcnt(X_CNT, 0); + return; + } // If we have pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads retun in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + !hasPendingEvent(STORE_CNT)) { + if (hasPendingEvent(SMEM_GROUP)) + applyPendingXcntGroup(SMEM_GROUP); + else + applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + return; + } applyWaitcnt(X_CNT, Wait.XCnt); } |
