Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  96
1 file changed, 81 insertions, 15 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d67e4a..8f44c03 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- // Flat -> private/local is a simple truncate.
- // Flat -> global is a no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // Flat -> private requires subtracting src_flat_scratch_base_lo.
+ return false;
+ }
+
+ // Flat -> private/local is a simple truncate.
+ // Flat -> global is a no-op
return true;
+ }
const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());
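A minimal scalar sketch (plain C++, illustrative only, not part of the patch) of why the cast stops being free: with globally addressable scratch a flat pointer into scratch is an absolute address, so producing the 32-bit private offset takes a subtract of src_flat_scratch_base_lo on top of the truncate. The function name and parameters below are hypothetical; the actual DAG lowering appears in the lowerADDRSPACECAST hunk further down.

#include <cstdint>

// Hypothetical scalar model of the flat -> private conversion. Without
// globally addressable scratch the private pointer is just the low 32 bits;
// with it, the low half of the flat scratch base must also be subtracted.
static uint32_t flatToPrivate(uint64_t FlatAddr, uint32_t FlatScratchBaseLo,
                              bool GloballyAddressableScratch) {
  uint32_t Lo = static_cast<uint32_t>(FlatAddr); // the "simple truncate" case
  return GloballyAddressableScratch ? Lo - FlatScratchBaseLo : Lo;
}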
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !Subtarget->hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// Note: this feature (register) is broken. When used as a 32-bit operand,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ SDValue FlatScratchBaseLo(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, SL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+ 0);
+ Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ }
+
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
- SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- SDValue CvtPtr =
- DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
- CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ SDValue CvtPtr;
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+ SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+ AllOnes, ThreadID);
+ if (Subtarget->isWave64())
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+ AllOnes, ThreadID);
+ SDValue ShAmt = DAG.getShiftAmountConstant(
+ 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+ SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ SDValue FlatScratchBase = {
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B64, SL, MVT::i64,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+ 0};
+ CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+ } else {
+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ }
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
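A standalone sketch (plain C++, not part of the patch; names are hypothetical) checking the shift-amount arithmetic in the hunk above: SrcHi becomes the upper dword of the 64-bit flat address, so shifting the thread ID by 57 - 32 - log2(wavesize) inside SrcHi places it at bit 57 - log2(wavesize) of the full address, i.e. bit 52 for wave32 and bit 51 for wave64, matching the formula in the comment. FLAT_SCRATCH_BASE and the private offset are then added on top, as the 64-bit ADD above does.

#include <cstdint>

// Position of the thread ID within the 64-bit flat address, given the
// per-wave shift used above (57 - 32 - WaveSizeLog2 inside the hi dword).
constexpr uint64_t tidBitsInFlatAddr(uint32_t ThreadID, unsigned WaveSizeLog2) {
  unsigned ShAmt = 57u - 32u - WaveSizeLog2;
  return static_cast<uint64_t>(ThreadID) << ShAmt << 32;
}

static_assert(tidBitsInFlatAddr(1, 5) == (1ull << 52), "wave32: TID at bit 52");
static_assert(tidBitsInFlatAddr(1, 6) == (1ull << 51), "wave64: TID at bit 51");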
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
SDLoc SL(Op);
- unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
- ? AMDGPUAS::LOCAL_ADDRESS
- : AMDGPUAS::PRIVATE_ADDRESS;
- SDValue Aperture = getSegmentAperture(AS, SL, DAG);
SDValue SrcVec =
DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-
SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
DAG.getConstant(1, SL, MVT::i32));
+
+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
+ ? AMDGPUAS::LOCAL_ADDRESS
+ : AMDGPUAS::PRIVATE_ADDRESS;
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ SDValue FlatScratchBaseHi(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+ 0);
+ // Test bits 63..58 against the aperture address.
+ return DAG.getSetCC(
+ SL, MVT::i1,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+ DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+ }
+
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
case Intrinsic::amdgcn_perm:
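A compact restatement (plain C++, illustrative only, not part of the patch) of the comparison built in the hunk above: XORing the upper dword of the pointer with src_flat_scratch_base_hi and testing the result with an unsigned compare against 1 << 26 checks that bits 31..26 of the upper dword, i.e. bits 63..58 of the full pointer, match the flat scratch base, while the low 26 bits (address bits 57..32) are ignored.

#include <cstdint>

// True when bits 63..58 of the 64-bit pointer (bits 31..26 of its upper
// dword) equal the corresponding bits of src_flat_scratch_base_hi.
constexpr bool hiBitsMatchScratchBase(uint32_t PtrHi,
                                      uint32_t FlatScratchBaseHi) {
  return (PtrHi ^ FlatScratchBaseHi) < (1u << 26);
}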
@@ -17630,6 +17695,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
RMW->hasMetadata("amdgpu.no.remote.memory"))
return true;
+ if (Subtarget.hasEmulatedSystemScopeAtomics())
+ return true;
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
return true;
@@ -17877,8 +17944,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- // Always expand system scope min/max atomics.
- if (HasSystemScope)
+ if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics())
return AtomicExpansionKind::CmpXChg;
}
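A condensed, hypothetical restatement (not the in-tree interface) of the integer min/max decision after this change: the cmpxchg-loop expansion is only forced for system-scope atomicrmw min/max on flat, global, or buffer-fat-pointer memory when the subtarget cannot emulate system-scope atomics; otherwise the operation falls through to the remaining checks and, ultimately, native selection.

// Hypothetical helper mirroring the condition above; the real code returns
// AtomicExpansionKind::CmpXChg in this case and otherwise continues with
// further checks not shown in this hunk.
static bool needsCmpXChgExpansion(bool IsFlatGlobalOrBufferFat,
                                  bool HasSystemScope,
                                  bool HasEmulatedSystemScopeAtomics) {
  return IsFlatGlobalOrBufferFat && HasSystemScope &&
         !HasEmulatedSystemScopeAtomics;
}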