Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 87
1 file changed, 77 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1fdf272..a6e4a63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !ST.hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+ auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ const LLT S32 = LLT::scalar(32);
+ Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
+ Register FlatScratchBaseLo =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
+ Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
+ // Extract low 32-bits of the pointer.
+ return B.buildExtract(Dst, Src, 0).getReg(0);
+ };
+
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
- // Extract low 32-bits of the pointer.
- B.buildExtract(Dst, Src, 0);
+ castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();
return true;
}
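
Note: a minimal scalar sketch of the conversion this hunk emits, with plain integers standing in for the MIR values (flatToPrivate, FlatAddr and FlatScratchBaseLo are hypothetical names for illustration, not part of the patch):

#include <cstdint>

// flat -> private as a plain computation: with globally addressable
// scratch the low 32 bits are rebased by src_flat_scratch_base_lo;
// otherwise they are extracted unchanged.
uint32_t flatToPrivate(uint64_t FlatAddr, uint32_t FlatScratchBaseLo,
                       bool GloballyAddressableScratch) {
  uint32_t Lo = static_cast<uint32_t>(FlatAddr); // G_EXTRACT of bits [31:0]
  if (GloballyAddressableScratch)
    return Lo - FlatScratchBaseLo; // the new G_SUB path
  return Lo;
}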
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto FlatNull = B.buildConstant(SrcTy, 0);
// Extract low 32-bits of the pointer.
- auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
+ auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
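
Note: when nullness cannot be proven, the converted low half is only meaningful for non-null inputs, so the cast is guarded. A sketch under the assumption that -1 is the 32-bit null value for the private/local address spaces (the G_SELECT consuming CmpRes lies outside the shown hunk):

#include <cstdint>

// Guarded flat -> private/local cast: a null flat pointer must map to
// the segment's null value rather than to (0 - base).
uint32_t castWithNullCheck(uint64_t FlatAddr, uint32_t ConvertedLo32) {
  const uint32_t SegmentNull = 0xFFFFFFFFu; // TM.getNullPointerValue(DestAS)
  return FlatAddr != 0 ? ConvertedLo32 : SegmentNull; // G_ICMP + G_SELECT
}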
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
- Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
- if (!ApertureReg.isValid())
- return false;
-
// Coerce the type of the low half of the result so we can use
// merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ Register AllOnes = B.buildConstant(S32, -1).getReg(0);
+ Register ThreadID = B.buildConstant(S32, 0).getReg(0);
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ if (ST.isWave64()) {
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ }
+ Register ShAmt =
+ B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
+ Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
+ Register CvtPtr =
+ B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ Register FlatScratchBase =
+ B.buildInstr(AMDGPU::S_MOV_B64, {S64},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
+ return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
+ }
+
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
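
Note: the address formula in the comments above can be checked with a scalar sketch (privateToFlat, TID, FlatScratchBase and PrivateAddr are hypothetical stand-ins; in the patch the lane id comes from the amdgcn_mbcnt_lo/hi intrinsics):

#include <cstdint>

// private -> flat with globally addressable scratch:
//   wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
//   wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t TID,
                       uint64_t FlatScratchBase, unsigned WavefrontSizeLog2) {
  // The shift is applied to the high half before the merge, hence the
  // MIR constant 57 - 32 - log2(wavesize): 20 for wave32, 19 for wave64.
  uint32_t Hi = TID << (57 - 32 - WavefrontSizeLog2);
  uint64_t CvtPtr = (uint64_t(Hi) << 32) | PrivateAddr; // G_MERGE_VALUES
  return CvtPtr + FlatScratchBase;                      // G_PTR_ADD
}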
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
- B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ Register FlatScratchBaseHi =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
+ // Test bits 63..58 against the aperture address.
+ Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
+ B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
+ B.buildConstant(S32, 1u << 26));
+ } else {
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ }
MI.eraseFromParent();
return true;
}
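
Note: the new comparison is a branch-free way of testing that bits 63..58 of the pointer equal those of src_flat_scratch_base, ignoring the per-lane TID bits below. A sketch with Hi32 and FlatScratchBaseHi as stand-ins for the MIR registers (isPrivateAddr is a hypothetical name):

#include <cstdint>

// "is private?" under globally addressable scratch: XOR zeroes the bits
// that agree, and the result is < 2^26 exactly when bits 31..26 of the
// high word (bits 63..58 of the full pointer) match the scratch base.
bool isPrivateAddr(uint32_t Hi32, uint32_t FlatScratchBaseHi) {
  return (Hi32 ^ FlatScratchBaseHi) < (1u << 26); // unsigned compare
}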