 llvm/lib/Target/AMDGPU/AMDGPU.td                     |   6
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td                |   3
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h          |   1
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp |  11
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h   |   4
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp    |  13
 llvm/lib/Target/AMDGPU/FLATInstructions.td           |  52
 llvm/lib/Target/AMDGPU/GCNSubtarget.h                |   3
 llvm/lib/Target/AMDGPU/SIDefines.h                   |  12
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp            |  15
 llvm/lib/Target/AMDGPU/SMInstructions.td             |   5
 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll            | 568
 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll       | 370
 13 files changed, 1016 insertions(+), 47 deletions(-)
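The functional core of the change is a mapping from the llvm.prefetch locality operand (0-3) to a cache-policy scope on the new VMEM prefetch instructions, raised to at least SE scope when the target lacks safe-cu-prefetch. A minimal standalone sketch of that mapping, mirroring the renderPrefetchLoc()/PrefetchLoc logic added below (the helper name and free-standing constants are illustrative only, not part of the patch):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the CPol SCOPE field layout from SIDefines.h after this patch:
    // a 2-bit scope value stored at bit 3 of the cache-policy operand.
    constexpr uint32_t SCOPE_SHIFT = 3;
    constexpr uint32_t SCOPE_MASK = 0x3;
    constexpr uint32_t SCOPE_SE = 1u << SCOPE_SHIFT;

    // locality 0 (no temporal locality) -> SCOPE_SYS, 1 -> SCOPE_DEV,
    // 2 -> SCOPE_SE, 3 (high locality) -> SCOPE_CU. Without safe-cu-prefetch
    // the result is raised to at least SCOPE_SE, because a CU-scope prefetch
    // may fault on an illegal address.
    uint32_t prefetchLocToScope(uint32_t Locality, bool HasSafeCUPrefetch) {
      uint32_t V = (SCOPE_MASK - (Locality & SCOPE_MASK)) << SCOPE_SHIFT;
      if (!HasSafeCUPrefetch)
        V = std::max(V, SCOPE_SE);
      return V;
    }

This matches the llvm.prefetch.ll checks below: the plain gfx1250 runs print scope:SCOPE_SE for a locality-3 prefetch, while the +safe-cu-prefetch run keeps CU scope (no scope modifier is printed).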
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b72e6f0..8b8fc8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -280,6 +280,12 @@ def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
   "SMEM prefetches do not fail on illegal address"
 >;
 
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+  "HasSafeCUPrefetch",
+  "true",
+  "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
 def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
   "HasVcmpxExecWARHazard",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362..108842f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -446,5 +446,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
 def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
     GISDNodeXFormEquiv<as_hw_round_mode>;
 
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+    GISDNodeXFormEquiv<PrefetchLoc>;
+
 def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
     GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7ecba1e..a6ce745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
 #include "SIModeRegisterDefaults.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 877c3ac..8ca9a97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -7068,6 +7068,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
 }
 
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+                                                  const MachineInstr &MI,
+                                                  int OpIdx) const {
+  uint32_t V = MI.getOperand(2).getImm();
+  V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+      << AMDGPU::CPol::SCOPE_SHIFT;
+  if (!Subtarget->hasSafeCUPrefetch())
+    V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+  MIB.addImm(V);
+}
+
 /// Convert from 2-bit value to enum values used for op_sel* source modifiers.
 void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 5f7f05c..61d9de1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -414,6 +414,10 @@ private:
 
   void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
+
+  void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                         int OpIdx) const;
+
   void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
                                        const MachineInstr &MI,
                                        int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a10dca2..787db67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3501,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     applyMappingMAD_64_32(B, OpdMapper);
     return;
   case AMDGPU::G_PREFETCH: {
-    if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+    if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
       MI.eraseFromParent();
       return;
     }
     Register PtrReg = MI.getOperand(0).getReg();
     unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
-    if (PtrBank == AMDGPU::VGPRRegBankID) {
+    if (PtrBank == AMDGPU::VGPRRegBankID &&
+        (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+      // Cannot do I$ prefetch with divergent pointer.
       MI.eraseFromParent();
       return;
     }
     unsigned AS = MRI.getType(PtrReg).getAddressSpace();
-    if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
-        AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+    if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+        (!Subtarget.hasSafeSmemPrefetch() &&
+         (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+          !MI.getOperand(3).getImm() /* I$ prefetch */))) {
       MI.eraseFromParent();
       return;
     }
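To summarize the G_PREFETCH handling above (and the matching SelectionDAG path in SIISelLowering.cpp further down), here is a condensed, standalone paraphrase of when an llvm.prefetch survives selection after this change; the enum and boolean parameters merely stand in for AMDGPUAS and the GCNSubtarget feature queries and are not the in-tree code:

    // Sketch only: paraphrases the legality checks added in this patch.
    enum class AddrSpace { Flat, Global, Constant, Constant32Bit, Other };

    bool keepPrefetch(AddrSpace AS, bool DivergentPtr, bool IsDataPrefetch,
                      bool SafeSmemPrefetch, bool VmemPrefInsts) {
      if (!SafeSmemPrefetch && !VmemPrefInsts)
        return false;              // no usable prefetch instruction at all
      if (DivergentPtr && (!VmemPrefInsts || !IsDataPrefetch))
        return false;              // divergent pointers need the VMEM data prefetch
      if (AS == AddrSpace::Other)
        return false;              // e.g. LDS and scratch are never prefetched
      if (!SafeSmemPrefetch &&
          (AS == AddrSpace::Constant32Bit || !IsDataPrefetch))
        return false;              // these forms exist only as SMEM prefetch
      return true;
    }

Note that data prefetches of constant-address-space pointers are still kept without safe-smem-prefetch; the prefetch_global pattern in FLATInstructions.td below routes them to GLOBAL_PREFETCH_B8 instead of s_prefetch_data.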
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0b855a4..5ccf1e5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2184,6 +2184,50 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
 
 } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
 
+def PrefetchLoc: SDNodeXForm<timm, [{
+  uint32_t V = N->getZExtValue();
+  V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+  if (!Subtarget->hasSafeCUPrefetch())
+    V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+                             (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+                             [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+  let GISelPredicateCode = [{
+    return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+  }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+                               (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+                               [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+                                         (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+                                          !Subtarget->hasSafeSmemPrefetch()); }]> {
+  let GISelPredicateCode = [{
+    return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+           ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+            !Subtarget->hasSafeSmemPrefetch());
+  }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+  def : GCNPat <
+    (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+    (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+  > {
+    let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+  }
+
+  def : GCNPat <
+    (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+    (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+  > {
+    let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+  }
+}
+
 multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
   def : GCNPat <
     (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
@@ -2198,6 +2242,14 @@ multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
 }
 
 let SubtargetPredicate = HasVmemPrefInsts in {
+  defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+  defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+  // Patterns for forced vector prefetch with rw = 1.
+  defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+  defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+  // Patterns for target intrinsics
   defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
   defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4d76b79..88a269f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -247,6 +247,7 @@ protected:
   bool HasInstFwdPrefetchBug = false;
   bool HasVmemPrefInsts = false;
   bool HasSafeSmemPrefetch = false;
+  bool HasSafeCUPrefetch = false;
   bool HasVcmpxExecWARHazard = false;
   bool HasLdsBranchVmemWARHazard = false;
   bool HasNSAtoVMEMBug = false;
@@ -995,6 +996,8 @@ public:
 
   bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
 
+  bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
   // Has s_cmpk_* instructions.
   bool hasSCmpK() const { return getGeneration() < GFX12; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c..40b8bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -392,11 +392,13 @@ enum CPol {
   TH_ATOMIC_CASCADE = 4, // Cascading vs regular
 
   // Scope
-  SCOPE = 0x3 << 3, // All Scope bits
-  SCOPE_CU = 0 << 3,
-  SCOPE_SE = 1 << 3,
-  SCOPE_DEV = 2 << 3,
-  SCOPE_SYS = 3 << 3,
+  SCOPE_SHIFT = 3,
+  SCOPE_MASK = 0x3,
+  SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+  SCOPE_CU = 0 << SCOPE_SHIFT,
+  SCOPE_SE = 1 << SCOPE_SHIFT,
+  SCOPE_DEV = 2 << SCOPE_SHIFT,
+  SCOPE_SYS = 3 << SCOPE_SHIFT,
 
   NV = 1 << 5, // Non-volatile bit
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0f04a5a..74fe2b8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
-  if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
     setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -4444,19 +4444,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
 }
 
 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
-  if (Op->isDivergent())
+  if (Op->isDivergent() &&
+      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+    // Cannot do I$ prefetch with divergent pointer.
     return SDValue();
 
   switch (cast<MemSDNode>(Op)->getAddressSpace()) {
   case AMDGPUAS::FLAT_ADDRESS:
   case AMDGPUAS::GLOBAL_ADDRESS:
   case AMDGPUAS::CONSTANT_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
     break;
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+    if (Subtarget->hasSafeSmemPrefetch())
+      break;
+    [[fallthrough]];
   default:
     return SDValue();
   }
 
+  // I$ prefetch
+  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+    return SDValue();
+
   return Op;
 }
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 38cc51b..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
 
 def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
                              (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
-                             [{ return !N->getOperand(1)->isDivergent();}]> {
+                             [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
   let GISelPredicateCode = [{
-    return isInstrUniform(MI);
+    return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
   }];
 }
 
@@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
 }
 
 defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index 429b3b8..6e24a6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -1,36 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s ; Scalar data prefetch define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:512 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr_offset: ; 
SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -40,14 +58,20 @@ entry: ; Check large offsets define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr_max_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:8388607 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -55,6 +79,20 @@ entry: } define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { +; GFX1250-LABEL: prefetch_data_sgpr_min_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:-8388608 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset: +; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000) +; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm +; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_min_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm @@ -68,6 +106,13 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-SDAG-NEXT: s_endpgm ; +; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset: +; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm +; ; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset: ; SPREFETCH-GISEL: ; %bb.0: ; %entry ; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 @@ -81,6 +126,18 @@ entry: } define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { +; GFX1250-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000 +; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm +; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_too_large_offset: ; 
NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm @@ -91,6 +148,13 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre ; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-SDAG-NEXT: s_endpgm ; +; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm +; ; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset: ; SPREFETCH-GISEL: ; %bb.0: ; %entry ; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 @@ -105,15 +169,113 @@ entry: ; Check divergent address -define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) { -; GCN-LABEL: prefetch_data_vgpr: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +define amdgpu_ps void @prefetch_data_vgpr_global(ptr addrspace(1) %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_global: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_global: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) ret void } +define amdgpu_ps void @prefetch_data_vgpr_flat(ptr %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_flat: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_global(ptr addrspace(1) inreg %ptr, i32 %offset) { +; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_endpgm +; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_global: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset + 
tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_flat(ptr inreg %ptr, i32 %offset) { +; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_endpgm +; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_flat: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +entry: + %gep1 = getelementptr i8, ptr %ptr, i32 %offset + %gep2 = getelementptr i8, ptr %gep1, i32 128 + tail call void @llvm.prefetch.pf(ptr %gep2, i32 0, i32 0, i32 1) + ret void +} + ; Check LDS and Scratch, we cannot prefetch it define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) { @@ -137,43 +299,59 @@ entry: ; Check supported address spaces define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr_flat: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr_flat: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr_global: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr_global: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr_global: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_global: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_mov_b32 s1, 0 ; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: +; NOSPREFETCH: ; %bb.0: ; %entry +; 
NOSPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1) ret void @@ -182,28 +360,36 @@ entry: ; I$ prefetch define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_inst_sgpr: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_inst_sgpr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_inst_sgpr: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_inst_sgpr: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0) ret void } define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_inst_sgpr_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_inst_sgpr_offset: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -213,14 +399,18 @@ entry: ; Check large offsets define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { -; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: -; NOSPREFETCH: ; %bb.0: ; %entry -; NOSPREFETCH-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_inst_sgpr_max_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm ; ; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: ; SPREFETCH: ; %bb.0: ; %entry ; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 ; SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -228,6 +418,18 @@ entry: } define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { +; GFX1250-LABEL: prefetch_inst_sgpr_min_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset: +; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000) +; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm +; ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_min_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm @@ -241,6 +443,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 ; SPREFETCH-SDAG-NEXT: s_endpgm ; +; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset: +; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 
0x0, null, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm +; ; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset: ; SPREFETCH-GISEL: ; %bb.0: ; %entry ; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 @@ -254,6 +463,16 @@ entry: } define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { +; GFX1250-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000 +; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm +; ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_too_large_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm @@ -264,6 +483,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre ; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 ; SPREFETCH-SDAG-NEXT: s_endpgm ; +; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm +; ; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset: ; SPREFETCH-GISEL: ; %bb.0: ; %entry ; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 @@ -276,6 +502,282 @@ entry: ret void } +; Check cache locality + +define amdgpu_ps void @prefetch_data_vgpr_flat_dev(ptr %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_flat_dev: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_dev: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 1, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_flat_se(ptr %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_flat_se: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_se: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 2, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_flat_cu(ptr %ptr) { +; GL2-ONLY-LABEL: prefetch_data_vgpr_flat_cu: +; GL2-ONLY: ; %bb.0: ; %entry +; GL2-ONLY-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE +; GL2-ONLY-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE +; 
GFX1250-SPREFETCH-NEXT: s_endpgm +; +; SAFE-CU-LABEL: prefetch_data_vgpr_flat_cu: +; SAFE-CU: ; %bb.0: ; %entry +; SAFE-CU-NEXT: flat_prefetch_b8 v[0:1] +; SAFE-CU-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_cu: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 3, i32 1) + ret void +} + +; flat offset + +define amdgpu_ps void @prefetch_data_vgpr_flat_offset(ptr %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_flat_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + %gep = getelementptr float, ptr %ptr, i32 128 + tail call void @llvm.prefetch.pf(ptr %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_global_offset(ptr addrspace(1) %ptr) { +; GFX1250-LABEL: prefetch_data_vgpr_global_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + %gep = getelementptr float, ptr addrspace(1) %ptr, i32 128 + tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_global_saddr(ptr addrspace(1) inreg %ptr, i32 %voffset) { +; GFX1250-LABEL: prefetch_data_vgpr_global_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset + tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_global_saddr_offset(ptr addrspace(1) inreg %ptr, i32 %voffset) { +; GFX1250-LABEL: prefetch_data_vgpr_global_saddr_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: 
prefetch_data_vgpr_global_saddr_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset + %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128 + tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep2, i32 0, i32 0, i32 1) + ret void +} + +; Cannot prefetch I$ with flat or global instructions. + +define amdgpu_ps void @prefetch_inst_vgpr_global(ptr addrspace(1) %ptr) { +; GCN-LABEL: prefetch_inst_vgpr_global: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_vgpr_flat(ptr %ptr) { +; GCN-LABEL: prefetch_inst_vgpr_flat: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 0) + ret void +} + +; Force vector prefetch for uniform address with rw = 1 argument. + +define amdgpu_ps void @prefetch_data_sgpr_flat_force_vector(ptr inreg %ptr) { +; GFX1250-LABEL: prefetch_data_sgpr_flat_force_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 1, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_global_force_vector(ptr addrspace(1) inreg %ptr) { +; GFX1250-LABEL: prefetch_data_sgpr_global_force_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 1, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_global_saddr_force_vector(ptr addrspace(1) inreg %ptr) { +; GFX1250-LABEL: prefetch_data_sgpr_global_saddr_force_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x400, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 1024 + tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 1, i32 0, i32 1) + ret void +} + declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32) declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32) declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 874dece..1e6b77e 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GFX12-LABEL: copy_flat: @@ -55,6 +56,33 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end ; GFX12-SPREFETCH-NEXT: s_endpgm +; +; GFX1250-LABEL: copy_flat: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB0_3 +; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX1250-NEXT: .LBB0_2: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176 +; GFX1250-NEXT: flat_prefetch_b8 v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_add_co_i32 s6, s6, -1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b128 v0, v[2:5], s[0:1] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX1250-NEXT: s_cbranch_scc1 .LBB0_2 +; GFX1250-NEXT: .LBB0_3: ; %for.end +; GFX1250-NEXT: s_endpgm entry: %cmp6.not = icmp eq i32 %n, 0 br i1 %cmp6.not, label %for.end, label %for.body @@ -123,6 +151,33 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end ; GFX12-SPREFETCH-NEXT: 
s_endpgm +; +; GFX1250-LABEL: copy_global: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB1_3 +; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX1250-NEXT: .LBB1_2: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176 +; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_add_co_i32 s6, s6, -1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX1250-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1250-NEXT: .LBB1_3: ; %for.end +; GFX1250-NEXT: s_endpgm entry: %cmp6.not = icmp eq i32 %n, 0 br i1 %cmp6.not, label %for.end, label %for.body @@ -193,6 +248,34 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end ; GFX12-SPREFETCH-NEXT: s_endpgm +; +; GFX1250-LABEL: copy_constant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB2_3 +; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: .LBB2_2: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] offset:176 scope:SCOPE_SE +; GFX1250-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 +; GFX1250-NEXT: s_add_co_i32 s6, s6, -1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GFX1250-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX1250-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1250-NEXT: .LBB2_3: ; %for.end +; GFX1250-NEXT: s_endpgm entry: %cmp6.not = icmp eq i32 %n, 0 br i1 %cmp6.not, label %for.end, label %for.body @@ -262,6 +345,29 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end ; GFX12-SPREFETCH-NEXT: s_endpgm +; +; GFX1250-LABEL: copy_local: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX1250-NEXT: .LBB3_1: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v4, s0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, -1 +; GFX1250-NEXT: s_add_co_i32 s0, s0, 16 +; GFX1250-NEXT: s_add_co_i32 s1, s1, 16 +; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 +; GFX1250-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-NEXT: s_wait_dscnt 0x1 +; 
GFX1250-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 +; GFX1250-NEXT: s_wait_dscnt 0x1 +; GFX1250-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 +; GFX1250-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1250-NEXT: .LBB3_2: ; %for.end +; GFX1250-NEXT: s_endpgm entry: %cmp6.not = icmp eq i32 %n, 0 br i1 %cmp6.not, label %for.end, label %for.body @@ -280,3 +386,267 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body, %entry ret void } + +define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { +; GFX12-LABEL: copy_flat_divergent: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_3 +; GFX12-NEXT: ; %bb.1: ; %for.body.preheader +; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: .LBB4_2: ; %for.body +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b128 v[0:1], v[4:7] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: s_cbranch_scc1 .LBB4_2 +; GFX12-NEXT: .LBB4_3: ; %for.end +; GFX12-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: copy_flat_divergent: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 +; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3 +; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader +; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body +; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-SPREFETCH-NEXT: 
v_add_co_u32 v2, vcc_lo, v2, 16 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe +; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7] +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2 +; GFX12-SPREFETCH-NEXT: .LBB4_3: ; %for.end +; GFX12-SPREFETCH-NEXT: s_endpgm +; +; GFX1250-LABEL: copy_flat_divergent: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 +; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader +; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] +; GFX1250-NEXT: .LBB4_2: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] +; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1] +; GFX1250-NEXT: s_cbranch_scc1 .LBB4_2 +; GFX1250-NEXT: .LBB4_3: ; %for.end +; GFX1250-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %s.tid = getelementptr inbounds <4 x i32>, ptr %s, i32 %tid + %d.tid = getelementptr inbounds <4 x i32>, ptr %d, i32 %tid + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr %s.tid, i64 %idxprom + %ld = load <4 x i32>, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d.tid, i64 %idxprom + store <4 x i32> %ld, ptr %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { +; GFX12-LABEL: copy_global_divergent: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX12-NEXT: ; %bb.1: ; %for.body.preheader +; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: .LBB5_2: ; %for.body +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX12-NEXT: .LBB5_3: ; %for.end +; GFX12-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: copy_global_divergent: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 +; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader +; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: .LBB5_2: ; %for.body +; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe +; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0 +; GFX12-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX12-SPREFETCH-NEXT: .LBB5_3: ; %for.end +; GFX12-SPREFETCH-NEXT: s_endpgm +; +; GFX1250-LABEL: copy_global_divergent: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader +; GFX1250-NEXT: s_load_b128 
s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] +; GFX1250-NEXT: .LBB5_2: ; %for.body +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 +; GFX1250-NEXT: global_prefetch_b8 v[2:3], off scope:SCOPE_SE +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] +; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1] +; GFX1250-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX1250-NEXT: .LBB5_3: ; %for.end +; GFX1250-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %s.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i32 %tid + %d.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i32 %tid + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s.tid, i64 %idxprom + %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d.tid, i64 %idxprom + store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() |