author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2025-08-05 16:25:23 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-08-05 16:25:23 -0700 |
commit | b8eb61adc92bb384bc63f01b7ccddd931409b223 | |
tree | eb55d78905e7c7b86fd948036e5d2ae7af10711b | |
parent | 4c2d56318fec16d1d5241cd91d38b59cb65bce83 | |
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 87 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 91 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 3 |
3 files changed, 157 insertions, 24 deletions
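
The diffs below teach both the GlobalISel and SelectionDAG paths to cast between private and flat pointers without the `src_private_base` aperture when globally addressable scratch is enabled. As a reading aid, here is a minimal C++ model of the private-to-flat computation described by the in-diff comments; the helper name and parameters are illustrative assumptions, not code from the patch:

```cpp
#include <cstdint>

// Minimal model (not LLVM code) of the private -> flat conversion emitted
// below when globally addressable scratch is enabled. The helper name and
// parameters are hypothetical; the layout follows the in-diff comments:
//   wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
//   wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t ThreadID,
                       uint64_t FlatScratchBase, unsigned WavefrontSizeLog2) {
  // The codegen shifts the 32-bit high word by 57 - 32 - log2(wavesize),
  // placing the thread ID at bit 52 (wave32) or 51 (wave64) of the full
  // 64-bit address before the scratch base is added.
  uint32_t SrcHi = ThreadID << (57 - 32 - WavefrontSizeLog2);
  uint64_t CvtPtr = (uint64_t(SrcHi) << 32) | PrivateAddr;
  return CvtPtr + FlatScratchBase;
}
```

In the patch itself the thread ID comes from the `amdgcn_mbcnt_lo`/`amdgcn_mbcnt_hi` intrinsics and the base from the `SRC_FLAT_SCRATCH_BASE` register pair; the flat-to-private direction is the inverse, a subtract of `src_flat_scratch_base_lo` from the low 32 bits.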
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1fdf272..a6e4a63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
   const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                      ? AMDGPU::SRC_SHARED_BASE
                                      : AMDGPU::SRC_PRIVATE_BASE;
+  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+          !ST.hasGloballyAddressableScratch()) &&
+         "Cannot use src_private_base with globally addressable scratch!");
   // FIXME: It would be more natural to emit a COPY here, but then copy
   // coalescing would kick in and it would think it's okay to use the "HI"
   // subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+    auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
+      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+          ST.hasGloballyAddressableScratch()) {
+        // flat -> private with globally addressable scratch: subtract
+        // src_flat_scratch_base_lo.
+        const LLT S32 = LLT::scalar(32);
+        Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
+        Register FlatScratchBaseLo =
+            B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
+                .getReg(0);
+        MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
+        Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
+        return B.buildIntToPtr(Dst, Sub).getReg(0);
+      }
+
+      // Extract low 32-bits of the pointer.
+      return B.buildExtract(Dst, Src, 0).getReg(0);
+    };
+
     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
     // G_ADDRSPACE_CAST we need to guess.
     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
-      // Extract low 32-bits of the pointer.
-      B.buildExtract(Dst, Src, 0);
+      castFlatToLocalOrPrivate(Dst);
       MI.eraseFromParent();
       return true;
     }
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     auto FlatNull = B.buildConstant(SrcTy, 0);
 
     // Extract low 32-bits of the pointer.
-    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
+    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
 
     auto CmpRes =
         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
-      Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
-      if (!ApertureReg.isValid())
-        return false;
-
       // Coerce the type of the low half of the result so we can use
       // merge_values.
       Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
 
+      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+          ST.hasGloballyAddressableScratch()) {
+        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+        Register AllOnes = B.buildConstant(S32, -1).getReg(0);
+        Register ThreadID = B.buildConstant(S32, 0).getReg(0);
+        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
+                       .addUse(AllOnes)
+                       .addUse(ThreadID)
+                       .getReg(0);
+        if (ST.isWave64()) {
+          ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
+                         .addUse(AllOnes)
+                         .addUse(ThreadID)
+                         .getReg(0);
+        }
+        Register ShAmt =
+            B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
+        Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
+        Register CvtPtr =
+            B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
+        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+        // 64-bit hi:lo value.
+        Register FlatScratchBase =
+            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
+                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
+                .getReg(0);
+        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
+        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
+      }
+
+      Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+      if (!ApertureReg.isValid())
+        return false;
+
       // TODO: Should we allow mismatched types but matching sizes in merges to
       // avoid the ptrtoint?
       return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                               MachineRegisterInfo &MRI,
                                               MachineIRBuilder &B,
                                               unsigned AddrSpace) const {
-  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
-  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+  const LLT S32 = LLT::scalar(32);
+  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
   Register Hi32 = Unmerge.getReg(1);
 
-  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
+      ST.hasGloballyAddressableScratch()) {
+    Register FlatScratchBaseHi =
+        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
+            .getReg(0);
+    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
+    // Test bits 63..58 against the aperture address.
+    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
+    B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
+                B.buildConstant(S32, 1u << 26));
+  } else {
+    Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+    B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+  }
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d67e4a..63826b7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
 
 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
-  // Flat -> private/local is a simple truncate.
-  // Flat -> global is no-op
-  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+    if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->hasGloballyAddressableScratch()) {
+      // Flat -> private requires subtracting src_flat_scratch_base_lo.
+      return false;
+    }
+
+    // Flat -> private/local is a simple truncate.
+    // Flat -> global is no-op
     return true;
+  }
 
   const GCNTargetMachine &TM =
       static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                      ? AMDGPU::SRC_SHARED_BASE
                                      : AMDGPU::SRC_PRIVATE_BASE;
+  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+          !Subtarget->hasGloballyAddressableScratch()) &&
+         "Cannot use src_private_base with globally addressable scratch!");
   // Note: this feature (register) is broken. When used as a 32-bit operand,
   // it returns a wrong value (all zeroes?). The real value is in the upper 32
   // bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
       DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
     SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
+    if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->hasGloballyAddressableScratch()) {
+      // flat -> private with globally addressable scratch: subtract
+      // src_flat_scratch_base_lo.
+      SDValue FlatScratchBaseLo(
+          DAG.getMachineNode(
+              AMDGPU::S_MOV_B32, SL, MVT::i32,
+              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+          0);
+      Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+    }
+
     if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
       return Ptr;
 
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
-      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
-      SDValue CvtPtr =
-          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
-      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+      SDValue CvtPtr;
+      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+          Subtarget->hasGloballyAddressableScratch()) {
+        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+        ThreadID = DAG.getNode(
+            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+            AllOnes, ThreadID);
+        if (Subtarget->isWave64())
+          ThreadID = DAG.getNode(
+              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+              AllOnes, ThreadID);
+        SDValue ShAmt = DAG.getShiftAmountConstant(
+            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+        // 64-bit hi:lo value.
+        SDValue FlatScratchBase = {
+            DAG.getMachineNode(
+                AMDGPU::S_MOV_B64, SL, MVT::i64,
+                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+            0};
+        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+      } else {
+        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+      }
 
       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
         return CvtPtr;
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private: {
     SDLoc SL(Op);
-    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
-                      ? AMDGPUAS::LOCAL_ADDRESS
-                      : AMDGPUAS::PRIVATE_ADDRESS;
-    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
     SDValue SrcVec =
         DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-
     SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                 DAG.getConstant(1, SL, MVT::i32));
+
+    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
+                      ? AMDGPUAS::LOCAL_ADDRESS
+                      : AMDGPUAS::PRIVATE_ADDRESS;
+    if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->hasGloballyAddressableScratch()) {
+      SDValue FlatScratchBaseHi(
+          DAG.getMachineNode(
+              AMDGPU::S_MOV_B32, DL, MVT::i32,
+              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+          0);
+      // Test bits 63..58 against the aperture address.
+      return DAG.getSetCC(
+          SL, MVT::i1,
+          DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+          DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+    }
+
+    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
     return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
   }
   case Intrinsic::amdgcn_perm:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ed6b973..81655f5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -866,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],
 def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU",
     [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
-   SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
+   SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
+   SRC_FLAT_SCRATCH_BASE)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
   let HasSGPR = 1;
```
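
For `llvm.amdgcn.is.private`, both lowerings above replace the old equality compare against `src_private_base` with a range check on address bits 63..58. A minimal sketch of that predicate, with a hypothetical helper name:

```cpp
#include <cstdint>

// Model of the is_private lowering above: XORing the high word with
// src_flat_scratch_base_hi clears bits 31..26 (address bits 63..58)
// exactly when they match, so a single unsigned compare against 2^26
// decides whether the flat address falls in the scratch window.
bool isPrivateAddr(uint64_t FlatAddr, uint32_t FlatScratchBaseHi) {
  uint32_t Hi32 = uint32_t(FlatAddr >> 32);
  return (Hi32 ^ FlatScratchBaseHi) < (1u << 26);
}
```

This trades the single-register aperture compare for an XOR plus unsigned less-than, which is why the patch asserts that `src_private_base` is never consulted when globally addressable scratch is enabled.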