diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 83 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td | 73 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 76 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 22 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9 |
8 files changed, 164 insertions, 109 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 49d8b44..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f25ce87..6118933 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4846,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. - if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..38f9ee5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -836,8 +836,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +918,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7207c25..0f172e0d 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS addess). +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d51ec6..9017f4f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15896,6 +15896,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15944,6 +16016,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 607825e..f1262e11 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,8 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -603,8 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); if (!CPol) return false; @@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return false; @@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); - Changed |= CC->finalizeStore(MI, /*Atomic=*/true); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->finalizeStore(MI, /*Atomic=*/false); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..83e63ac 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. + return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { |