// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx942 -emit-llvm -fcuda-is-device %s -o - | FileCheck %s

#define __device__ __attribute__((device))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

__constant__ float const_float;
__constant__ double const_double;
__device__ float global_float;
__device__ double global_double;

// CHECK-LABEL: define dso_local void @_Z30test_flat_atomic_fadd_f32_flatPff(
// CHECK-SAME: ptr noundef [[PTR:%.*]], float noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    store float [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
// CHECK-NEXT:    store float [[TMP2]], ptr [[RESULT_ASCAST]], align 4
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f32_flat(float *ptr, float val) {
  float result;
  result = __builtin_amdgcn_flat_atomic_fadd_f32(ptr, val);
}

// CHECK-LABEL: define dso_local void @_Z30test_flat_atomic_fadd_f64_flatPdd(
// CHECK-SAME: ptr noundef [[PTR:%.*]], double noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    store double [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], double [[TMP1]] syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT:    store double [[TMP2]], ptr [[RESULT_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f64_flat(double *ptr, double val) {
  double result;
  result = __builtin_amdgcn_flat_atomic_fadd_f64(ptr, val);
}
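
// The variants below feed the same flat builtins with pointers into
// __shared__, __constant__, and __device__ (global) variables. The builtins
// still take a generic (flat) pointer, so the checks match an atomicrmw on a
// generic ptr; in the __shared__ cases the parameter's pointer type is
// unchanged and the IR is identical to the flat case above.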

// CHECK-LABEL: define dso_local void @_Z32test_flat_atomic_fadd_f32_sharedPff(
// CHECK-SAME: ptr noundef [[PTR:%.*]], float noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    store float [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
// CHECK-NEXT:    store float [[TMP2]], ptr [[RESULT_ASCAST]], align 4
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f32_shared(__shared__ float *ptr, float val) {
  float result;
  result = __builtin_amdgcn_flat_atomic_fadd_f32(ptr, val);
}

// CHECK-LABEL: define dso_local void @_Z32test_flat_atomic_fadd_f64_sharedPdd(
// CHECK-SAME: ptr noundef [[PTR:%.*]], double noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    store double [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], double [[TMP1]] syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT:    store double [[TMP2]], ptr [[RESULT_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f64_shared(__shared__ double *ptr, double val) {
  double result;
  result = __builtin_amdgcn_flat_atomic_fadd_f64(ptr, val);
}
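
// In the constant and global tests the pointer operand is the address of a
// module-scope variable, so it folds to a constant addrspacecast expression
// (addrspace(4) for __constant__, addrspace(1) for __device__) directly in
// the atomicrmw.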

// CHECK-LABEL: define dso_local void @_Z34test_flat_atomic_fadd_f32_constantf(
// CHECK-SAME: float noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store float [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(4) @const_float to ptr), float [[TMP0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
// CHECK-NEXT:    store float [[TMP1]], ptr [[RESULT_ASCAST]], align 4
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f32_constant(float val) {
  float result;
  result = __builtin_amdgcn_flat_atomic_fadd_f32(&const_float, val);
}

// CHECK-LABEL: define dso_local void @_Z34test_flat_atomic_fadd_f64_constantd(
// CHECK-SAME: double noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store double [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(4) @const_double to ptr), double [[TMP0]] syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT:    store double [[TMP1]], ptr [[RESULT_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f64_constant(double val) {
  double result;
  result = __builtin_amdgcn_flat_atomic_fadd_f64(&const_double, val);
}

// CHECK-LABEL: define dso_local void @_Z32test_flat_atomic_fadd_f32_globalf(
// CHECK-SAME: float noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store float [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @global_float to ptr), float [[TMP0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
// CHECK-NEXT:    store float [[TMP1]], ptr [[RESULT_ASCAST]], align 4
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f32_global(float val) {
  float result;
  result = __builtin_amdgcn_flat_atomic_fadd_f32(&global_float, val);
}

// CHECK-LABEL: define dso_local void @_Z32test_flat_atomic_fadd_f64_globald(
// CHECK-SAME: double noundef [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[RESULT:%.*]] = alloca double, align 8, addrspace(5)
// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
// CHECK-NEXT:    [[RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT]] to ptr
// CHECK-NEXT:    store double [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[VAL_ADDR_ASCAST]], align 8
// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @global_double to ptr), double [[TMP0]] syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT:    store double [[TMP1]], ptr [[RESULT_ASCAST]], align 8
// CHECK-NEXT:    ret void
//
__device__ void test_flat_atomic_fadd_f64_global(double val) {
  double result;
  result = __builtin_amdgcn_flat_atomic_fadd_f64(&global_double, val);
}
//.
// CHECK: [[META4]] = !{}
//.