author     Matt Arsenault <Matthew.Arsenault@amd.com>  2024-06-23 10:30:20 +0200
committer  GitHub <noreply@github.com>  2024-06-23 10:30:20 +0200
commit     70c8b9c24a7cf2b7c6e65675cbdb42a65ff668ba
tree       e75bc9aeb76f2a0cf3230fd64f76517a441177e8
parent     414c74149c0085e3c11496af171217d5317481e1
AMDGPU: Remove ds atomic fadd intrinsics (#95396)
These have been replaced with atomicrmw fadd.
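
For reference, calls to the removed intrinsics are rewritten by the bitcode autoupgrader into the equivalent atomicrmw form. A minimal f32 sketch of the before/after IR, matching the bitcode tests added below:

    %old = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)

upgrades to:

    %new = atomicrmw fadd ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4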
-rw-r--r--  clang/lib/CodeGen/CGBuiltin.cpp                            |   2
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsAMDGPU.td                   |   5
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                                |  92
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td               |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp             |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp          |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td           |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp       |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td                   |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                  |  15
-rw-r--r--  llvm/test/Bitcode/amdgcn-atomic.ll                         | 136
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll   |  55
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 125
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll | 279
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll             | 102
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll              |  13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll            |  38
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll                |  25
18 files changed, 255 insertions, 654 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6316e2b..77fd711 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19105,7 +19105,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
} else {
- // The ds_fadd_* builtins do not have syncscope/order arguments.
+ // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
SSID = llvm::SyncScope::System;
AO = AtomicOrdering::SequentiallyConsistent;
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8a5566a..7a5e919 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -571,7 +571,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
-def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
@@ -2930,10 +2929,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
- [llvm_v2i16_ty],
- [LLVMQualPointerType<3>, llvm_v2i16_ty],
- [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
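
Since the removed int_amdgcn_ds_fadd_v2bf16 took <2 x i16> rather than <2 x bfloat>, the autoupgrader inserts a bitcast on each side of the replacement atomicrmw so upgraded IR keeps the old integer type at the boundary. A sketch of the pattern, as exercised by the llvm/test/Bitcode/amdgcn-atomic.ll additions below:

    %bc = bitcast <2 x i16> %val to <2 x bfloat>
    %rmw = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %bc syncscope("agent") seq_cst, align 4
    %res = bitcast <2 x bfloat> %rmw to <2 x i16>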
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index fc6bea0..d7825d9 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,6 +1033,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break; // No other 'amdgcn.atomic.*'
}
+ if (Name.starts_with("ds.fadd")) {
+ // Replaced with atomicrmw fadd, so there's no new declaration.
+ NewFn = nullptr;
+ return true;
+ }
+
if (Name.starts_with("ldexp.")) {
// Target specific intrinsic became redundant
NewFn = Intrinsic::getDeclaration(
@@ -2331,40 +2337,74 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
llvm_unreachable("Unknown function for ARM CallBase upgrade.");
}
+// These are expected to have the arguments:
+// atomic.intrin (ptr, rmw_value, ordering, scope, isVolatile)
+//
+// Except for int_amdgcn_ds_fadd_v2bf16 which only has (ptr, rmw_value).
+//
static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
Function *F, IRBuilder<> &Builder) {
- const bool IsInc = Name.starts_with("atomic.inc.");
- if (IsInc || Name.starts_with("atomic.dec.")) {
- if (CI->getNumOperands() != 6) // Malformed bitcode.
- return nullptr;
+ AtomicRMWInst::BinOp RMWOp =
+ StringSwitch<AtomicRMWInst::BinOp>(Name)
+ .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+ .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
+ .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+
+ unsigned NumOperands = CI->getNumOperands();
+ if (NumOperands < 3) // Malformed bitcode.
+ return nullptr;
- AtomicRMWInst::BinOp RMWOp =
- IsInc ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
+ Value *Ptr = CI->getArgOperand(0);
+ if (!isa<PointerType>(Ptr->getType())) // Malformed.
+ return nullptr;
- Value *Ptr = CI->getArgOperand(0);
- Value *Val = CI->getArgOperand(1);
- ConstantInt *OrderArg = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ Value *Val = CI->getArgOperand(1);
+ if (Val->getType() != CI->getType()) // Malformed.
+ return nullptr;
+
+ ConstantInt *OrderArg = nullptr;
+ bool IsVolatile = false;
+
+ // These should have 5 arguments (plus the callee). A separate version of the
+ // ds_fadd intrinsic was defined for bf16 which was missing arguments.
+ if (NumOperands > 3)
+ OrderArg = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+
+ // Ignore scope argument at 3
+
+ if (NumOperands > 5) {
ConstantInt *VolatileArg = dyn_cast<ConstantInt>(CI->getArgOperand(4));
+ IsVolatile = !VolatileArg || !VolatileArg->isZero();
+ }
- AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent;
- if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue()))
- Order = static_cast<AtomicOrdering>(OrderArg->getZExtValue());
- if (Order == AtomicOrdering::NotAtomic ||
- Order == AtomicOrdering::Unordered)
- Order = AtomicOrdering::SequentiallyConsistent;
-
- // The scope argument never really worked correctly. Use agent as the most
- // conservative option which should still always produce the instruction.
- SyncScope::ID SSID = F->getContext().getOrInsertSyncScopeID("agent");
- AtomicRMWInst *RMW =
- Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
-
- if (!VolatileArg || !VolatileArg->isZero())
- RMW->setVolatile(true);
- return RMW;
+ AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent;
+ if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue()))
+ Order = static_cast<AtomicOrdering>(OrderArg->getZExtValue());
+ if (Order == AtomicOrdering::NotAtomic || Order == AtomicOrdering::Unordered)
+ Order = AtomicOrdering::SequentiallyConsistent;
+
+ LLVMContext &Ctx = F->getContext();
+
+ // Handle the v2bf16 intrinsic which used <2 x i16> instead of <2 x bfloat>
+ Type *RetTy = CI->getType();
+ if (VectorType *VT = dyn_cast<VectorType>(RetTy)) {
+ if (VT->getElementType()->isIntegerTy(16)) {
+ VectorType *AsBF16 =
+ VectorType::get(Type::getBFloatTy(Ctx), VT->getElementCount());
+ Val = Builder.CreateBitCast(Val, AsBF16);
+ }
}
- llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade.");
+ // The scope argument never really worked correctly. Use agent as the most
+ // conservative option which should still always produce the instruction.
+ SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");
+ AtomicRMWInst *RMW =
+ Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
+
+ if (IsVolatile)
+ RMW->setVolatile(true);
+
+ return Builder.CreateBitCast(RMW, RetTy);
}
/// Helper to unwrap intrinsic call MetadataAsValue operands.
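
The upgrade path above also sanitizes the old immediate operands rather than trusting them; the scope operand is always ignored in favor of syncscope("agent"). A short sketch with hypothetical value names, mirroring the llvm/test/Bitcode/amdgcn-atomic.ll cases below:

    ; 43 is not a valid atomic ordering, so it falls back to seq_cst
    %a = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %p, float %v, i32 43, i32 3, i1 false)
    ; upgrades to: atomicrmw fadd ptr addrspace(3) %p, float %v syncscope("agent") seq_cst, align 4

    ; ordering 4 (acquire) is valid and preserved
    %b = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %p, float %v, i32 4, i32 2, i1 false)
    ; upgrades to: atomicrmw fadd ptr addrspace(3) %p, float %v syncscope("agent") acquire, align 4

A non-constant or nonzero isVolatile operand marks the resulting atomicrmw volatile, as in the immarg_violations tests.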
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 21f541d..c6dbc58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -630,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
-defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 4ff945e..e7251a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5403,8 +5403,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
- return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
@@ -7333,7 +7331,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0510a1d..9e7694f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4905,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
- case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd_v2bf16: {
+ case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 7b29d57..ed5bae3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -252,10 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c..1192b49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
@@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
@@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e22f8b7..219246b 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1142,16 +1142,6 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
-
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
-let AddedComplexity = 1 in
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
let OtherPredicates = [HasGDS] in
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 592ef83..4f8882e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1280,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1451,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
@@ -8700,19 +8698,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
- case Intrinsic::amdgcn_ds_fadd: {
- MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
- Opc = ISD::ATOMIC_LOAD_FADD;
- break;
- }
-
- return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
- M->getOperand(0), M->getOperand(2), M->getOperand(3),
- M->getMemOperand());
- }
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 2e6286a..311bd88 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -112,4 +112,140 @@ declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32
declare i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+; (ptr, rmw_value, ordering, scope, isVolatile)
+declare float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
+declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32 immarg, i32 immarg, i1 immarg)
+declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) nocapture, <2 x half>, i32 immarg, i32 immarg, i1 immarg)
+declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) nocapture, <2 x i16>)
+
+define float @upgrade_amdgcn_ds_fadd_f32(ptr addrspace(3) %ptr, float %val) {
+ ; CHECK: atomicrmw fadd ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+ %result0 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
+
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+ %result1 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+ %result2 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %val, i32 43, i32 3, i1 false)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, float %val syncscope("agent") acquire, align 4
+ %result3 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %val, i32 4, i32 2, i1 false)
+
+ ret float %result3
+}
+
+; Handle missing type suffix
+declare float @llvm.amdgcn.ds.fadd(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
+
+define float @upgrade_amdgcn_ds_fadd_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
+ ; CHECK: atomicrmw fadd ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+ %result0 = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
+ ret float %result0
+}
+
+define void @immarg_violations_ds_fadd_f32(ptr addrspace(3) %ptr, float %fval, i32 %val32, i1 %val1) {
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, float %fval syncscope("agent") seq_cst, align 4
+ %result0 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr, float %fval, i32 %val32, i32 %val32, i1 %val1)
+ ret void
+}
+
+declare float @llvm.amdgcn.ds.fadd.f32broken0(i32, float, i32 immarg, i32 immarg, i1 immarg)
+
+; This will just delete the invalid call, which isn't ideal, but these
+; cases were never emitted.
+; CHECK-LABEL: define void @ds_fadd_f32_invalid_not_ptr(
+; CHECK-NEXT: ret void
+define void @ds_fadd_f32_invalid_not_ptr(i32 %ptr, float %fval) {
+ %result0 = call float @llvm.amdgcn.ds.fadd.f32broken0(i32 %ptr, float %fval, i32 0, i32 0, i1 false)
+ ret void
+}
+
+declare float @llvm.amdgcn.ds.fadd.f32broken1(ptr addrspace(3), double, i32 immarg, i32 immarg, i1 immarg)
+
+; CHECK-LABEL: define void @ds_fadd_f32_invalid_mismatch(
+; CHECK-NEXT: ret void
+define void @ds_fadd_f32_invalid_mismatch(ptr addrspace(3) %ptr, double %fval) {
+ %result0 = call float @llvm.amdgcn.ds.fadd.f32broken1(ptr addrspace(3) %ptr, double %fval, i32 0, i32 0, i1 false)
+ ret void
+}
+
+define double @upgrade_amdgcn_ds_fadd_f64(ptr addrspace(3) %ptr, double %val) {
+ ; CHECK: atomicrmw fadd ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+ %result0 = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 false)
+
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+ %result1 = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 true)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+ %result2 = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %val, i32 43, i32 3, i1 false)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, double %val syncscope("agent") acquire, align 8
+ %result3 = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %val, i32 4, i32 2, i1 false)
+
+ ret double %result3
+}
+
+; CHECK-LABEL: @immarg_violations_ds_fadd_f64(
+define void @immarg_violations_ds_fadd_f64(ptr addrspace(3) %ptr, double %fval, i32 %val32, i1 %val1) {
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, double %fval syncscope("agent") seq_cst, align 8
+ %result0 = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %fval, i32 %val32, i32 %val32, i1 %val1)
+ ret void
+}
+
+define <2 x half> @upgrade_amdgcn_ds_fadd_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) {
+ ; CHECK: atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val syncscope("agent") seq_cst, align 4
+ %result0 = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %val, i32 0, i32 0, i1 false)
+
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, <2 x half> %val syncscope("agent") seq_cst, align 4
+ %result1 = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %val, i32 0, i32 0, i1 true)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val syncscope("agent") seq_cst, align 4
+ %result2 = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %val, i32 43, i32 3, i1 false)
+
+ ; CHECK: = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val syncscope("agent") acquire, align 4
+ %result3 = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %val, i32 4, i32 2, i1 false)
+
+ ret <2 x half> %result3
+}
+
+define void @immarg_violations_ds_fadd_v2f16(ptr addrspace(3) %ptr, <2 x half> %fval, i32 %val32, i1 %val1) {
+ ; CHECK: = atomicrmw volatile fadd ptr addrspace(3) %ptr, <2 x half> %fval syncscope("agent") seq_cst, align 4
+ %result0 = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %fval, i32 %val32, i32 %val32, i1 %val1)
+ ret void
+}
+
+define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__as_i16(ptr addrspace(3) %ptr, <2 x i16> %val) {
+ ; CHECK: [[BC0:%[0-9]+]] = bitcast <2 x i16> %val to <2 x bfloat>
+ ; CHECK-NEXT: [[RMW0:%[0-9]+]] = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+ ; CHECK-NEXT: = bitcast <2 x bfloat> [[RMW0]] to <2 x i16>
+ %result0 = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %val, i32 0, i32 0, i1 false)
+
+ ; CHECK: [[BC1:%[0-9]+]] = bitcast <2 x i16> %val to <2 x bfloat>
+ ; CHECK-NEXT: [[RMW1:%[0-9]+]] = atomicrmw volatile fadd ptr addrspace(3) %ptr, <2 x bfloat> [[BC1]] syncscope("agent") seq_cst, align 4
+ ; CHECK-NEXT: = bitcast <2 x bfloat> [[RMW1]] to <2 x i16>
+ %result1 = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %val, i32 0, i32 0, i1 true)
+
+ ; CHECK: [[BC2:%[0-9]+]] = bitcast <2 x i16> %val to <2 x bfloat>
+ ; CHECK-NEXT: [[RMW2:%[0-9]+]] = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> [[BC2]] syncscope("agent") seq_cst, align 4
+ ; CHECK-NEXT: = bitcast <2 x bfloat> [[RMW2]] to <2 x i16>
+ %result2 = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %val, i32 43, i32 3, i1 false)
+
+ ; CHECK: [[BC3:%[0-9]+]] = bitcast <2 x i16> %val to <2 x bfloat>
+ ; CHECK-NEXT: [[RMW3:%[0-9]+]] = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> [[BC3]] syncscope("agent") acquire, align 4
+ ; CHECK-NEXT: = bitcast <2 x bfloat> [[RMW3]] to <2 x i16>
+ %result3 = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %val, i32 4, i32 2, i1 false)
+
+ ret <2 x i16> %result3
+}
+
+; Somehow the bf16 version was defined as a separate intrinsic with missing arguments.
+define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__missing_args_as_i16(ptr addrspace(3) %ptr, <2 x i16> %val) {
+ ; CHECK: [[BC0:%[0-9]+]] = bitcast <2 x i16> %val to <2 x bfloat>
+ ; CHECK-NEXT: [[RMW0:%[0-9]+]] = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+ ; CHECK-NEXT: [[BC1:%[0-9]+]] = bitcast <2 x bfloat> [[RMW0]] to <2 x i16>
+ ; CHECK-NEXT: ret <2 x i16> [[BC1]]
+ %result0 = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %val)
+ ret <2 x i16> %result0
+}
+
attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index fff341b..632dbd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -7,8 +7,6 @@ declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x h
; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
-declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
-declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
@@ -156,59 +154,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
ret <2 x i16> %ret
}
-define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: ds_pk_add_f16 v0, v1
-; GFX940-NEXT: s_endpgm
- %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
- ret void
-}
-
-define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
- ret <2 x half> %ret
-}
-
-define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s1
-; GFX940-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: ds_pk_add_bf16 v1, v0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset:
; GFX940: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 4e94a64..66b22be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -20,7 +20,6 @@ declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr,
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
-declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -1923,54 +1922,6 @@ main_body:
ret double %ret
}
-define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
-; GFX90A-LABEL: local_atomic_fadd_f64_noret:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: local_atomic_fadd_f64_noret:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: ds_add_f64 v2, v[0:1]
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret void
-}
-
-define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
-; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: local_atomic_fadd_f64_rtn:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v2
-; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret double %ret
-}
-
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
@@ -1980,7 +1931,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB65_2
+; GFX90A-NEXT: s_cbranch_execz .LBB63_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -1990,7 +1941,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB65_2:
+; GFX90A-NEXT: .LBB63_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
@@ -2001,7 +1952,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB65_2
+; GFX940-NEXT: s_cbranch_execz .LBB63_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2011,7 +1962,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB65_2:
+; GFX940-NEXT: .LBB63_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2027,7 +1978,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB66_2
+; GFX90A-NEXT: s_cbranch_execz .LBB64_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2037,7 +1988,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB66_2:
+; GFX90A-NEXT: .LBB64_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
@@ -2048,7 +1999,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB66_2
+; GFX940-NEXT: s_cbranch_execz .LBB64_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2058,14 +2009,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB66_2:
+; GFX940-NEXT: .LBB64_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
ret void
}
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #2 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b64 s[2:3], exec
@@ -2074,7 +2025,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB67_2
+; GFX90A-NEXT: s_cbranch_execz .LBB65_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2084,7 +2035,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB67_2:
+; GFX90A-NEXT: .LBB65_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2095,7 +2046,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB67_2
+; GFX940-NEXT: s_cbranch_execz .LBB65_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2105,7 +2056,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB67_2:
+; GFX940-NEXT: .LBB65_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2134,54 +2085,6 @@ main_body:
ret double %ret
}
-define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
-; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v2
-; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret double %ret
-}
-
-define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
-; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v2
-; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret double %ret
-}
-
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" }
-attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" }
-attributes #3 = { "denormal-fp-math"="ieee,ieee" }
-attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
deleted file mode 100644
index 03ed683..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
+++ /dev/null
@@ -1,279 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
-
-define amdgpu_ps float @ds_fadd_f32_ss(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fadd_f32_ss:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fadd_f32_ss:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: ds_fadd_f32_ss:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: ds_fadd_f32_ss:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ; return to shader part epilog
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
- ret float %ret
-}
-
-define amdgpu_ps float @ds_fadd_f32_ss_offset(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fadd_f32_ss_offset:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fadd_f32_ss_offset:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: ds_fadd_f32_ss_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: ds_fadd_f32_ss_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ; return to shader part epilog
- %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
- ret float %ret
-}
-
-define amdgpu_ps void @ds_fadd_f32_ss_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fadd_f32_ss_nortn:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_f32 v0, v1
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: ds_fadd_f32_ss_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_f32 v0, v1
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: ds_fadd_f32_ss_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: ds_add_f32 v0, v1
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: ds_fadd_f32_ss_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: ds_add_f32 v0, v1
-; GFX11-NEXT: s_endpgm
- %unused = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_ps void @ds_fadd_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fadd_f32_ss_offset_nortn:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_f32 v1, v0 offset:512
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: ds_fadd_f32_ss_offset_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: ds_add_f32 v1, v0 offset:512
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: ds_fadd_f32_ss_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: ds_add_f32 v1, v0 offset:512
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: ds_fadd_f32_ss_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: ds_add_f32 v1, v0 offset:512
-; GFX11-NEXT: s_endpgm
- %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
- %unused = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
- ret void
-}
-
-define float @ds_fadd_f32_vv(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fadd_f32_vv:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fadd_f32_vv:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fadd_f32_vv:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
- ret float %ret
-}
-
-define float @ds_fadd_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fadd_f32_vv_offset:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fadd_f32_vv_offset:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
- ret float %ret
-}
-
-define void @ds_fadd_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fadd_f32_vv_nortn:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_f32 v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fadd_f32_vv_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_add_f32 v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fadd_f32_vv_nortn:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: ds_add_f32 v0, v1
-; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
- ret void
-}
-
-define void @ds_fadd_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fadd_f32_vv_offset_nortn:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_f32 v0, v1 offset:512
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fadd_f32_vv_offset_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_add_f32 v0, v1 offset:512
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset_nortn:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: ds_add_f32 v0, v1 offset:512
-; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
- ret void
-}
-
-define float @ds_fadd_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fadd_f32_vv_volatile:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fadd_f32_vv_volatile:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fadd_f32_vv_volatile:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
- ret float %ret
-}
-
-declare float @llvm.amdgcn.ds.fadd(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) #0
-
-attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 9f339af..37a201e 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -8,111 +8,9 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i3
declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
-declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
-declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
-define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
-; GFX12-GISEL-NEXT: s_endpgm
- %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
- ret void
-}
-
-define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
-; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
-; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
-; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
-; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
- ret <2 x half> %ret
-}
-
-define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
-; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
-; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5761c19..0746b93 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -321,6 +321,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: ds_pk_add_f16 v0, v1
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
@@ -329,6 +330,8 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: ds_pk_add_f16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret void
@@ -349,8 +352,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
@@ -363,10 +368,8 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
@@ -376,7 +379,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
@@ -386,10 +389,8 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn:
@@ -402,7 +403,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index bf3dbec..5690b99 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1938,22 +1938,44 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
; GFX90A-LABEL: local_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB63_2
+; GFX90A-NEXT: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB63_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB63_2
+; GFX940-NEXT: ; %bb.1:
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GFX940-NEXT: s_load_dword s6, s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: .LBB63_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
deleted file mode 100644
index 74aa755..0000000
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-
-declare float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
-
-; GCN-LABEL: {{^}}lds_ds_fadd:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; GCN: s_waitcnt lgkmcnt(1)
-; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
-define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
- %idx.add = add nuw i32 %idx, 4
- %shl0 = shl i32 %idx.add, 3
- %shl1 = shl i32 %idx.add, 4
- %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
- %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
- %a1 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
- %a2 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
- %a3 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
- store float %a3, ptr addrspace(1) %out
- ret void
-}