diff options
author | vangthao95 <vang.thao@amd.com> | 2024-06-24 14:18:23 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-24 14:18:23 -0700 |
commit | 3aef525aa4b9a5395b6ac4ae771e28e64b27a126 (patch) | |
tree | d4bf7678a21d4fa8c30967b1c54d411ce03f3859 | |
parent | 5413a2bb843a3c71e0891aa5984afd63cd580dea (diff) | |
download | llvm-3aef525aa4b9a5395b6ac4ae771e28e64b27a126.zip llvm-3aef525aa4b9a5395b6ac4ae771e28e64b27a126.tar.gz llvm-3aef525aa4b9a5395b6ac4ae771e28e64b27a126.tar.bz2 |
[AMDGPU] Fix negative immediate offset for unbuffered smem loads (#89165)
For unbuffered SMEM loads, it is illegal for the immediate offset (IOFFSET) to be
negative if the resulting sum IOFFSET + (SGPR offset, or M0, or zero) is
negative.
This PR supersedes the earlier https://github.com/llvm/llvm-project/pull/79553.
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 56 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 18 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 19 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir | 20 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll | 68 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 204 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 38 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/smrd.ll | 6 |
12 files changed, 357 insertions, 104 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5b616d8..64868c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2004,12 +2004,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// For unbuffered smem loads, it is illegal for the Immediate Offset to be +// negative if the resulting (Offset + (M0 or SOffset or zero) is negative. +// Handle the case where the Immediate Offset + SOffset is negative. +bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, + bool Imm32Only, + bool IsBuffer, + int64_t ImmOffset) const { + if (!IsBuffer && !Imm32Only && ImmOffset < 0 && + AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) { + KnownBits SKnown = CurDAG->computeKnownBits(*SOffset); + if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0) + return false; + } + + return true; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); @@ -2017,15 +2036,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!C) { if (!SOffset) return false; + if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode; - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode.getOperand(0); - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } } return false; @@ -2036,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, // GFX9 and GFX10 have signed byte immediate offsets. The immediate // offset for S_BUFFER instructions is unsigned. int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue(); - std::optional<int64_t> EncodedOffset = - AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer); + std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset( + *Subtarget, ByteOffset, IsBuffer, HasSOffset); if (EncodedOffset && Offset && !Imm32Only) { *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; @@ -2096,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // true, match only 32-bit immediate offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, - bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) && - SelectSMRDBaseOffset(B, SBase, SOffset, nullptr); + + if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + return false; + + int64_t ImmOff = 0; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) + ImmOff = C->getSExtValue(); + + return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, + ImmOff); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2121,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, } if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) { + + if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) { + if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N1; return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index ae6fdb5..bb3bcc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -132,6 +132,8 @@ private: bool isFlatScratchBaseLegal(SDValue Addr) const; bool isFlatScratchBaseLegalSV(SDValue Addr) const; bool isFlatScratchBaseLegalSVImm(SDValue Addr) const; + bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only, + bool IsBuffer, int64_t ImmOffset = 0) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, @@ -174,11 
+176,13 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -190,6 +194,8 @@ private: bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, SDValue &Offset) const; + bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 03e2d62..5959fcf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4216,10 +4216,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; const GEPInfo &GEPI = AddrInfo[0]; - std::optional<int64_t> EncodedImm = - AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false); + std::optional<int64_t> EncodedImm; if (SOffset && Offset) { + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/true); if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; @@ -4229,6 +4230,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Base 
= GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; + if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI)) + return true; + + // For unbuffered smem loads, it is illegal for the Immediate Offset + // to be negative if the resulting (Offset + (M0 or SOffset or zero) + // is negative. Handle the case where the Immediate Offset + SOffset + // is negative. + auto SKnown = KB->getKnownBits(*SOffset); + if (*Offset + SKnown.getMinValue().getSExtValue() < 0) + return false; + return true; } } @@ -4236,6 +4248,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; } + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/false); if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { Base = GEPI.SgprParts[0]; *Offset = *EncodedImm; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e6a439e..9886235 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -159,6 +159,12 @@ namespace llvm { namespace AMDGPU { +/// \returns true if the target supports signed immediate offset for SMRD +/// instructions. +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { + return isGFX9Plus(ST); +} + /// \returns True if \p STI is AMDHSA. 
bool isHsaAbi(const MCSubtargetInfo &STI) { return STI.getTargetTriple().getOS() == Triple::AMDHSA; @@ -2819,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { return isGCN3Encoding(ST) || isGFX10Plus(ST); } -static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { - return isGFX9Plus(ST); -} - bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset) { if (isGFX12Plus(ST)) @@ -2857,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, } std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer) { + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset) { + // For unbuffered smem loads, it is illegal for the Immediate Offset to be + // negative if the resulting (Offset + (M0 or SOffset or zero) is negative. + // Handle case where SOffset is not present. + if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST)) + return std::nullopt; + if (isGFX12Plus(ST)) // 24 bit signed offsets return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset) : std::nullopt; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 88b449c..af2f0bc 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1307,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI); bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); unsigned hasKernargPreload(const MCSubtargetInfo &STI); +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -1479,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset); /// S_LOAD instructions have a signed offset, on other subtargets it is /// unsigned. S_BUFFER has an unsigned offset for all subtargets. 
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer); + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset = false); /// \return The encoding that can be used for a 32-bit literal offset in an SMRD /// instruction. This is only useful on CI.s diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir index c444772..504f769 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir @@ -1234,7 +1234,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4) + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 + ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -1 @@ -1304,7 +1312,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: 
[[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4) + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 + ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -524288 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 139f82b..9ee0acf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -88,11 +88,13 @@ entry: ret void } -; GFX9_10 can use a signed immediate byte offset +; GFX9+ can use a signed immediate byte offset but not without sgpr[offset] ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 -; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4 +; GFX9_10: s_add_u32 s2, s2, -4 +; GFX9_10: s_addc_u32 s3, s3, -1 +; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll 
b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll index 54dc5b8..41d2360 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll @@ -297,9 +297,11 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: .LBB5_1: ; %loop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190 ; GFX9-NEXT: s_add_i32 s2, s2, -1 +; GFX9-NEXT: s_add_u32 s4, s0, 0xfffffe70 +; GFX9-NEXT: s_addc_u32 s5, s1, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %end @@ -307,10 +309,14 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; ; GFX12-LABEL: test_sink_smem_offset_neg400: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_movk_i32 s4, 0xfe70 +; GFX12-NEXT: s_mov_b32 s5, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: .LBB5_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190 +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll index c69207c..08da89e 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll @@ -19,15 +19,31 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace( } define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_s_load_i8_imm: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_i8 s0, 
s[0:1], -0x64 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: global_store_b32 v[0:1], v2, off -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; DAG-LABEL: test_s_load_i8_imm: +; DAG: ; %bb.0: +; DAG-NEXT: s_movk_i32 s2, 0xff9c +; DAG-NEXT: s_mov_b32 s3, -1 +; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; DAG-NEXT: s_load_i8 s0, s[0:1], 0x0 +; DAG-NEXT: s_wait_kmcnt 0x0 +; DAG-NEXT: v_mov_b32_e32 v2, s0 +; DAG-NEXT: global_store_b32 v[0:1], v2, off +; DAG-NEXT: s_nop 0 +; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; DAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_s_load_i8_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff9c +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GISEL-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100 %ld = load i8, ptr addrspace(4) %gep %sext = sext i8 %ld to i32 @@ -195,15 +211,31 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace } define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_s_load_i16_imm: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_i16 s0, s[0:1], -0xc8 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: global_store_b32 v[0:1], v2, off -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; DAG-LABEL: test_s_load_i16_imm: +; DAG: ; %bb.0: +; DAG-NEXT: s_movk_i32 s2, 0xff38 +; DAG-NEXT: s_mov_b32 s3, -1 +; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; DAG-NEXT: s_load_i16 s0, s[0:1], 0x0 +; DAG-NEXT: s_wait_kmcnt 0x0 +; DAG-NEXT: 
v_mov_b32_e32 v2, s0 +; DAG-NEXT: global_store_b32 v[0:1], v2, off +; DAG-NEXT: s_nop 0 +; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; DAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_s_load_i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff38 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GISEL-NEXT: s_load_i16 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GISEL-NEXT: s_endpgm %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100 %ld = load i16, ptr addrspace(4) %gep %sext = sext i16 %ld to i32 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index d9cbbc1..2f7e91f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -157,12 +157,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg4096: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1000 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf000 +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: 
s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -198,12 +211,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg4097: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1001 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xefff +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xffffefff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -239,12 +265,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg4098: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1002 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xeffe +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xffffeffe +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -376,12 +415,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg2048: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x800 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2048: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf800 +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2048: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: 
s_add_co_u32 s0, s2, 0xfffff800 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -413,12 +465,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg2049: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x801 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf7ff +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff7ff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -450,12 +515,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_neg2050: 
-; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x802 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf7fe +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff7fe +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -525,12 +603,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x800000 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_mov_b32 s0, 0xff800000 +; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; 
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff800000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -1721,12 +1812,29 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:-0x18 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_mov_b32 s5, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[4:5] +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xffe8 +; GFX12-SDAG-NEXT: s_mov_b32 s3, -1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, s4 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffffe8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to 
shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index 77fd0bc..2b51773 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -53,14 +53,25 @@ entry: } define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: prefetch_data_sgpr_min_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_mov_b32 s2, 0xff800000 +; GFX12-SDAG-NEXT: s_mov_b32 s3, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_data_sgpr_min_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -215,14 +226,25 @@ entry: } define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: prefetch_inst_sgpr_min_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_prefetch_inst s[0:1], -0x800000, null, 0 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_mov_b32 s2, 0xff800000 +; 
GFX12-SDAG-NEXT: s_mov_b32 s3, -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: prefetch_inst_sgpr_min_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 4ce9260..52db7fea 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -88,11 +88,13 @@ entry: ret void } -; GFX9_10 can use a signed immediate byte offset +; GFX9+ can use a signed immediate byte offset but not without sgpr[offset] ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, -0x4 +; GFX9_10: s_add_u32 s2, s2, -4 +; GFX9_10: s_addc_u32 s3, s3, -1 +; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1 |