From 76894c5e6e20bfe8a30f7d8bdd39c41a7af54d65 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 18 Jun 2024 20:51:14 +0200
Subject: clang/AMDGPU: Emit atomicrmw from ds_fadd builtins (#95395)

We should have done this for the f32/f64 case a long time ago. Now that
codegen handles atomicrmw selection for the v2f16/v2bf16 case, start
emitting it instead.

This also upgrades the behavior to respect a volatile qualified pointer,
which was previously ignored (for the cases that don't have an explicit
volatile argument).
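For example (a minimal sketch; value names are illustrative, the shapes
are taken from the updated tests):

  void test_fadd_local(__local float *ptr, float val) {
    __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
  }

previously emitted a target intrinsic call,

  %ret = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr,
             float %val, i32 0, i32 0, i1 false)

and now emits the generic atomic instruction, which the backend is
expected to select back to ds_add_rtn_f32:

  %ret = atomicrmw fadd ptr addrspace(3) %ptr, float %val seq_cst, align 4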
---
 clang/lib/CodeGen/CGBuiltin.cpp                    | 113 +++++++++++++--------
 clang/test/CodeGenCUDA/builtins-amdgcn.cu          |   2 +-
 clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu    |   2 +-
 .../CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu  |   5 +-
 .../builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu |   2 +-
 clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl     |  37 ++++++-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx12.cl     |  14 ++-
 .../test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl |   9 +-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl    |   4 +-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl    |  10 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td           |   3 +-
 11 files changed, 139 insertions(+), 62 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index eb56bba..08a89bd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
     break;
   }
 
+  // Some of the atomic builtins take the scope as a string name.
   StringRef scp;
-  llvm::getConstantStringInfo(Scope, scp);
-  SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+  if (llvm::getConstantStringInfo(Scope, scp)) {
+    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+    return;
+  }
+
+  // Older builtins had an enum argument for the memory scope.
+  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
+  switch (scope) {
+  case 0: // __MEMORY_SCOPE_SYSTEM
+    SSID = llvm::SyncScope::System;
+    break;
+  case 1: // __MEMORY_SCOPE_DEVICE
+    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+    break;
+  case 2: // __MEMORY_SCOPE_WRKGRP
+    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+    break;
+  case 3: // __MEMORY_SCOPE_WVFRNT
+    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+    break;
+  case 4: // __MEMORY_SCOPE_SINGLE
+    SSID = llvm::SyncScope::SingleThread;
+    break;
+  default:
+    SSID = llvm::SyncScope::System;
+    break;
+  }
 }
 
 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
@@ -18558,14 +18584,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
     Intrinsic::ID Intrin;
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
-      Intrin = Intrinsic::amdgcn_ds_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
       Intrin = Intrinsic::amdgcn_ds_fmin;
       break;
@@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
     return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
-    Intrinsic::ID IID;
-    llvm::Type *ArgTy;
-    switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_ds_fadd;
-      break;
-    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-      ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_ds_fadd;
-      break;
-    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-      ArgTy = llvm::FixedVectorType::get(
-          llvm::Type::getHalfTy(getLLVMContext()), 2);
-      IID = Intrinsic::amdgcn_ds_fadd;
-      break;
-    }
-    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-    llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
-        llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
-    llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
-        llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
-    llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
-    return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19044,7 +19037,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
-  case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
+  case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19055,23 +19053,54 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
       BinOp = llvm::AtomicRMWInst::UDecWrap;
       break;
+    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+      BinOp = llvm::AtomicRMWInst::FAdd;
+      break;
     }
 
     Address Ptr = CheckAtomicAlignment(*this, E);
     Value *Val = EmitScalarExpr(E->getArg(1));
+    llvm::Type *OrigTy = Val->getType();
+    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
 
-    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
-                            EmitScalarExpr(E->getArg(3)), AO, SSID);
+    bool Volatile;
 
-    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
-    bool Volatile =
-        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
+      // __builtin_amdgcn_ds_faddf has an explicit volatile argument.
+      Volatile =
+          cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
+    } else {
+      // Infer volatile from the passed type.
+      Volatile =
+          PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
+    }
+
+    if (E->getNumArgs() >= 4) {
+      // Some of the builtins have explicit ordering and scope arguments.
+      ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
+                              EmitScalarExpr(E->getArg(3)), AO, SSID);
+    } else {
+      // The ds_fadd_* builtins do not have syncscope/order arguments.
+      SSID = llvm::SyncScope::System;
+      AO = AtomicOrdering::SequentiallyConsistent;
+
+      // The v2bf16 builtin uses i16 instead of a natural bfloat type.
+      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+        llvm::Type *V2BF16Ty = FixedVectorType::get(
+            llvm::Type::getBFloatTy(Builder.getContext()), 2);
+        Val = Builder.CreateBitCast(Val, V2BF16Ty);
+      }
+    }
 
     llvm::AtomicRMWInst *RMW =
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
-    return RMW;
+    return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 1fc2fb9..132cbd2 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -115,7 +115,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 8dbb8c5..7bb756a 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -112,7 +112,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
index 66ec200..03b39cd 100644
--- a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
+++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
@@ -11,9 +11,10 @@ typedef __attribute__((address_space(3))) float *LP;
 // CHECK: store ptr %addr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
 // CHECK: %[[ADDR_ADDR_ASCAST:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
 // CHECK: %[[AS_CAST:.*]] = addrspacecast ptr %[[ADDR_ADDR_ASCAST]] to ptr addrspace(3)
-// CHECK: %3 = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[AS_CAST]]
+// CHECK: [[TMP2:%.+]] = load float, ptr %val.addr.ascast, align 4
+// CHECK: [[TMP3:%.+]] = atomicrmw fadd ptr addrspace(3) %[[AS_CAST]], float [[TMP2]] monotonic, align 4
 // CHECK: %4 = load ptr, ptr %rtn.ascast, align 8
-// CHECK: store float %3, ptr %4, align 4
+// CHECK: store float [[TMP3]], ptr %4, align 4
 __device__ void test_ds_atomic_add_f32(float *addr, float val) {
   float *rtn;
   *rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);
diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
index 1ea1d5f..e01d9d7 100644
--- a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
+++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
@@ -20,7 +20,7 @@ typedef __attribute__((address_space(3))) float *LP;
 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
 // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
 // CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
 // CHECK-NEXT: ret void
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index ea2aedf..46af87f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,13 +117,44 @@ void test_update_dpp(global int* out, int arg1, int arg2)
 }
 
 // CHECK-LABEL: @test_ds_fadd
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
 #if !defined(__SPIRV__)
 void test_ds_faddf(local float *out, float src) {
 #else
 void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
 #endif
-  *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
+
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, true);
+
+  // Test all orders.
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST + 1, __MEMORY_SCOPE_SYSTEM, false); // invalid
+
+  // Test all syncscopes.
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
+  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
 }
 
 // CHECK-LABEL: @test_ds_fmin
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 0b4038a..63381942 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -10,7 +10,10 @@ typedef half __attribute__((ext_vector_type(2))) half2;
 typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX12-LABEL: test_local_add_2bf16
 // GFX12: ds_pk_add_rtn_bf16
 short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
@@ -18,7 +21,10 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2bf16_noret
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX12-LABEL: test_local_add_2bf16_noret
 // GFX12: ds_pk_add_bf16
 void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
@@ -26,7 +32,7 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
 // GFX12-LABEL: test_local_add_2f16
 // GFX12: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -34,7 +40,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
 // GFX12-LABEL: test_local_add_2f16_noret
 // GFX12: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index 823316d..ad4d0b7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
 // REQUIRES: amdgpu-registered-target
 
 // CHECK-LABEL: test_fadd_local
-// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
 // GFX8-LABEL: test_fadd_local$local:
 // GFX8: ds_add_rtn_f32 v2, v0, v1
 // GFX8: s_endpgm
@@ -14,3 +14,10 @@ kernel void test_fadd_local(__local float *ptr, float val){
   float *res;
   *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
 }
+
+// CHECK-LABEL: test_fadd_local_volatile
+// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
+  volatile float *res;
+  *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 8e81650..e2117f1 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_ds_add_local_f64
-// CHECK: call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %{{.*}}, double %{{.*}},
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
 // GFX90A: test_ds_add_local_f64$local
 // GFX90A: ds_add_rtn_f64
 void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_ds_addf_local_f32
-// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}},
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
 // GFX90A-LABEL: test_ds_addf_local_f32$local
 // GFX90A: ds_add_rtn_f32
 void test_ds_addf_local_f32(__local float *addr, float x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index e415a95..92a33ce 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -42,7 +42,11 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX940-LABEL: test_local_add_2bf16
 // GFX940: ds_pk_add_rtn_bf16
 short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
@@ -50,7 +54,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
 // GFX940-LABEL: test_local_add_2f16
 // GFX940: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -58,7 +62,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
 // GFX940-LABEL: test_local_add_2f16_noret
 // GFX940: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 45f1092..8a5566a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2933,8 +2933,7 @@ def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
 def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
     [llvm_v2i16_ty],
     [LLVMQualPointerType<3>, llvm_v2i16_ty],
-    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
-    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
 def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
-- 
cgit v1.1