// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=HOST %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -emit-llvm -o - -fcuda-is-device %s | FileCheck --check-prefix=DEV %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -fatomic-fine-grained-memory -fatomic-ignore-denormal-mode \ // RUN: -emit-llvm -o - -fcuda-is-device %s | FileCheck --check-prefix=OPT %s #include "Inputs/cuda.h" // HOST-LABEL: define dso_local void @_Z12test_defaultPf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z12test_defaultPf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z12test_defaultPf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_default(float *a) { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } // HOST-LABEL: define dso_local void @_Z8test_onePf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z8test_onePf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z8test_onePf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_one(float *a) { [[clang::atomic(no_remote_memory)]] { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } } // HOST-LABEL: define dso_local void @_Z8test_twoPf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z8test_twoPf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z8test_twoPf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_two(float *a) { [[clang::atomic(remote_memory, ignore_denormal_mode)]] { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } } // HOST-LABEL: define dso_local void @_Z10test_threePf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z10test_threePf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z10test_threePf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_three(float *a) { [[clang::atomic(no_remote_memory, fine_grained_memory, no_ignore_denormal_mode)]] { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } } // HOST-LABEL: define dso_local void @_Z19test_multiple_attrsPf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z19test_multiple_attrsPf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z19test_multiple_attrsPf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_multiple_attrs(float *a) { [[clang::atomic(no_remote_memory)]] [[clang::atomic(remote_memory)]] { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } } // HOST-LABEL: define dso_local void @_Z11test_nestedPf( // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // HOST-NEXT: [[ENTRY:.*:]] // HOST-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // HOST-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // HOST-NEXT: [[DOTATOMICTMP1:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca float, align 4 // HOST-NEXT: [[DOTATOMICTMP3:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca float, align 4 // HOST-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4 // HOST-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4 // HOST-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 // HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 // HOST-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 // HOST-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 // HOST-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4 // HOST-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4 // HOST-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] seq_cst, align 4 // HOST-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4 // HOST-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4 // HOST-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4 // HOST-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4 // HOST-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] acquire, align 4 // HOST-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4 // HOST-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4 // HOST-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // HOST-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4 // HOST-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4 // HOST-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] release, align 4 // HOST-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4 // HOST-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4 // HOST-NEXT: ret void // // DEV-LABEL: define dso_local void @_Z11test_nestedPf( // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // DEV-NEXT: [[ENTRY:.*:]] // DEV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr // DEV-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr // DEV-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr // DEV-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr // DEV-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr // DEV-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] // DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // DEV-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // DEV-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 // DEV-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // DEV-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // DEV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // DEV-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // DEV-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] // DEV-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // DEV-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // DEV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // DEV-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // DEV-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] // DEV-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // DEV-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z11test_nestedPf( // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP1:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP3:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr // OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr // OPT-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr // OPT-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr // OPT-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr // OPT-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr // OPT-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] // OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // OPT-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // OPT-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 // OPT-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // OPT-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // OPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // OPT-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // OPT-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] // OPT-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // OPT-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // OPT-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // OPT-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // OPT-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] // OPT-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // OPT-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // OPT-NEXT: ret void // __device__ __host__ void test_nested(float *a) { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); { [[clang::atomic(remote_memory, fine_grained_memory, no_ignore_denormal_mode)]] { __scoped_atomic_fetch_max(a, 2, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); { [[clang::atomic(no_remote_memory)]] { __scoped_atomic_fetch_min(a, 3, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WRKGRP); } } { [[clang::atomic(no_fine_grained_memory)]] { __scoped_atomic_fetch_sub(a, 4, __ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); } } } } } // // // // template __device__ __host__ void test_template(T *a) { [[clang::atomic(no_remote_memory, fine_grained_memory)]] { __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } } template __device__ __host__ void test_template(float *a); //. // DEV: [[META4]] = !{} //. // OPT: [[META4]] = !{} //.