diff options
author | U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de> | 2024-11-04 14:08:55 +0100 |
---|---|---|
committer | U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de> | 2024-11-04 14:08:55 +0100 |
commit | 54bc5758988e505e17618c04a9a36d1cfa72913d (patch) | |
tree | 37e93309d367876bc1b6ffac21988665f85dc43d /llvm/test | |
parent | dacf10168ae0dfd222ed301b93369834d411d139 (diff) | |
parent | 3cffa3474fd20518e19afa0c0ad3ff602864f688 (diff) | |
download | llvm-users/meinersbur/irbuilder-extract.zip llvm-users/meinersbur/irbuilder-extract.tar.gz llvm-users/meinersbur/irbuilder-extract.tar.bz2 |
Merge commit '3cffa3474fd20518e19afa0c0ad3ff602864f688' into users/meinersbur/irbuilder-extract (users/meinersbur/irbuilder-extract)
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/NVPTX/load-store.ll | 6221 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/SPIRV/decoration-order.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/tls-function-argument.ll | 30 | ||||
-rw-r--r-- | llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll | 127 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll | 24 |
6 files changed, 4992 insertions, 1437 deletions
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index f922fd9..a4be81a 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 @@ -20,1595 +21,4947 @@ ; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . -; generic statespace - -; CHECK-LABEL: generic_weak -define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +;; generic statespace + +; generic + +define void @generic_i8(ptr %a) { +; CHECK-LABEL: generic_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0]; +; CHECK-NEXT: ld.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 - ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr %a + ret void +} + +define void @generic_i16(ptr %a) { +; CHECK-LABEL: generic_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0]; +; CHECK-NEXT: ld.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr %a + ret void +} + +define void @generic_i32(ptr %a) { +; CHECK-LABEL: generic_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: 
.reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr %a + ret void +} + +define void @generic_i64(ptr %a) { +; CHECK-LABEL: generic_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0]; +; CHECK-NEXT: ld.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr %a + ret void +} + +define void @generic_float(ptr %a) { +; CHECK-LABEL: generic_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0]; +; CHECK-NEXT: ld.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr %a + ret void +} + +define void @generic_double(ptr %a) { +; CHECK-LABEL: generic_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0]; +; CHECK-NEXT: ld.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr %a + %a.add = fadd double %a.load, 1. + store double %a.add, ptr %a + ret void +} + +; TODO: make the lowering of this weak vector ops consistent with +; the ones of the next tests. This test lowers to a weak PTX +; vector op, but next test lowers to a vector PTX op. 
+define void @generic_2xi8(ptr %a) { +; CHECK-LABEL: generic_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0]; +; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr %a + ret void +} + +; TODO: make the lowering of this weak vector ops consistent with +; the ones of the previous test. This test lowers to a weak +; PTX scalar op, but prior test lowers to a vector PTX op. +define void @generic_4xi8(ptr %a) { +; CHECK-LABEL: generic_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr %a + ret 
void +} - ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr %b - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr %c - - ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr %d - - ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr %c - - ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %d - %f.add = fadd double %f.load, 1. - ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %d - - ; TODO: make the lowering of this weak vector ops consistent with - ; the ones of the next tests. This test lowers to a weak PTX - ; vector op, but next test lowers to a vector PTX op. - ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr %b - - ; TODO: make the lowering of this weak vector ops consistent with - ; the ones of the previous test. This test lowers to a weak - ; PTX scalar op, but prior test lowers to a vector PTX op. 
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr %c - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr %c - - ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr %d - - ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr %d - - ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr %d - - ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr %d - - ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr %d - - ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr %d 
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr %d - - ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr %d - - ret void -} - -; CHECK-LABEL: generic_volatile -define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_2xi16(ptr %a) { +; CHECK-LABEL: generic_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr %a + ret void +} + +define void @generic_4xi16(ptr %a) { +; CHECK-LABEL: generic_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; +; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, 
ptr %a + ret void +} + +define void @generic_2xi32(ptr %a) { +; CHECK-LABEL: generic_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; +; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr %a + ret void +} + +define void @generic_4xi32(ptr %a) { +; CHECK-LABEL: generic_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; +; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr %a + ret void +} + +define void @generic_2xi64(ptr %a) { +; CHECK-LABEL: generic_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0]; +; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr %a + ret void +} + +define void @generic_2xfloat(ptr %a) { +; CHECK-LABEL: generic_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.u64 %rd1, [generic_2xfloat_param_0]; +; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr %a + ret void +} + +define void @generic_4xfloat(ptr %a) { +; CHECK-LABEL: generic_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0]; +; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr %a + ret void +} + +define void @generic_2xdouble(ptr %a) { +; CHECK-LABEL: generic_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0]; +; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr %a + ret void +} + +; generic_volatile + +define void @generic_volatile_i8(ptr %a) { +; CHECK-LABEL: generic_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i16(ptr %a) { +; CHECK-LABEL: generic_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i32(ptr %a) { +; CHECK-LABEL: generic_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i64(ptr %a) { +; CHECK-LABEL: generic_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr %a + ret void +} + +define void @generic_volatile_float(ptr %a) { +; 
CHECK-LABEL: generic_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr %a + ret void +} + +define void @generic_volatile_double(ptr %a) { +; CHECK-LABEL: generic_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr %a + %a.add = fadd double %a.load, 1. + store volatile double %a.add, ptr %a + ret void +} + +; TODO: volatile, atomic, and volatile atomic memory operations on vector types. +; Currently, LLVM: +; - does not allow atomic operations on vectors. +; - it allows volatile operations but not clear what that means. +; Following both semantics make sense in general and PTX supports both: +; - volatile/atomic/volatile atomic applies to the whole vector +; - volatile/atomic/volatile atomic applies elementwise +; Actions required: +; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those +; Below tests show that the current implementation picks the semantics in an inconsistent way +; * volatile <2 x i8> lowers to "elementwise volatile" +; * <4 x i8> lowers to "full vector volatile" +; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics +; - update tests in load-store-sm70.ll as well. 
+ +; TODO: make this operation consistent with the one for <4 x i8> +; This operation lowers to a "element wise volatile PTX operation". +define void @generic_volatile_2xi8(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr %a + ret void +} + +; TODO: make this operation consistent with the one for <2 x i8> +; This operation lowers to a "full vector volatile PTX operation". +define void @generic_volatile_4xi8(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; 
CHECK-NEXT: st.volatile.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi16(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xi16(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi32(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.v2.u32 
{%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xi32(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi64(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xfloat(ptr %a) { +; CHECK-LABEL: generic_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: 
add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xfloat(ptr %a) { +; CHECK-LABEL: generic_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xdouble(ptr %a) { +; CHECK-LABEL: generic_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr %a + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], 
%rs{{[0-9]+}} - store volatile i16 %b.add, ptr %b - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr %c - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr %d - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr %c - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr %c - - ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. - ; Currently, LLVM: - ; - does not allow atomic operations on vectors. - ; - it allows volatile operations but not clear what that means. - ; Following both semantics make sense in general and PTX supports both: - ; - volatile/atomic/volatile atomic applies to the whole vector - ; - volatile/atomic/volatile atomic applies elementwise - ; Actions required: - ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those - ; Below tests show that the current implementation picks the semantics in an inconsistent way - ; * volatile <2 x i8> lowers to "elementwise volatile" - ; * <4 x i8> lowers to "full vector volatile" - ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics - ; - update tests in load-store-sm70.ll as well. - - ; TODO: make this operation consistent with the one for <4 x i8> - ; This operation lowers to a "element wise volatile PTX operation". 
- ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr %b - - ; TODO: make this operation consistent with the one for <2 x i8> - ; This operation lowers to a "full vector volatile PTX operation". - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr %c - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr %c - - ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr %d - - ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr %d - - ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr %d - - ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, 
%rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr %d - - ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr %d - - ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr %d - - ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr %d - - ret void -} - -; CHECK-LABEL: generic_unordered_sys -define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; generic_unordered_sys + +define void @generic_unordered_sys_i8(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i8( +; SM70: 
{ +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a unordered, align 1 + ret void +} + +define void @generic_unordered_sys_i16(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr %a unordered, align 2 + ret void +} - ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 - - ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 - - ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 - - ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 - - ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_unordered_volatile_sys -define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_unordered_sys_i32(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr %a unordered, align 4 + ret void +} + +define void @generic_unordered_sys_i64(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 
%rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr %a unordered, align 8 + ret void +} + +define void @generic_unordered_sys_float(ptr %a) { +; SM60-LABEL: generic_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr %a unordered, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr %a unordered, align 4 + ret void +} + +define void @generic_unordered_sys_double(ptr %a) { +; SM60-LABEL: generic_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr %a unordered, align 8 + ret void +} + +; generic_unordered_volatile_sys + +define void @generic_unordered_volatile_sys_i8(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a unordered, align 1 + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 +define void @generic_unordered_volatile_sys_i16(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr %a unordered, align 2 + ret void +} - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 +define void @generic_unordered_volatile_sys_i32(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr %a unordered, align 4 + ret void +} - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 +define void @generic_unordered_volatile_sys_i64(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr %a unordered, align 8 + ret void +} - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 +define void @generic_unordered_volatile_sys_float(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr %a unordered, align 4 + ret void +} +define void @generic_unordered_volatile_sys_double(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr %a unordered, align 8 ret void } -; CHECK-LABEL: generic_monotonic_sys -define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; generic_monotonic_sys + +define void @generic_monotonic_sys_i8(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a monotonic, align 1 + ret void +} + +define void @generic_monotonic_sys_i16(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr %a monotonic, align 2 + ret void +} - ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 - - ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 - - ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 - - ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1. 
- ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 - - ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic_volatile_sys -define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_monotonic_sys_i32(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr %a monotonic, align 4 + ret void +} + +define void @generic_monotonic_sys_i64(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.u64 
%rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr %a monotonic, align 8 + ret void +} + +define void @generic_monotonic_sys_float(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr %a monotonic, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr %a monotonic, align 4 + ret void +} + +define void @generic_monotonic_sys_double(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr %a monotonic, align 8 + ret void +} + +; generic_monotonic_volatile_sys + +define void @generic_monotonic_volatile_sys_i8(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 +define void @generic_monotonic_volatile_sys_i16(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr %a monotonic, align 2 + ret void +} - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 +define void @generic_monotonic_volatile_sys_i32(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr %a monotonic, align 4 + ret void +} - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 +define void @generic_monotonic_volatile_sys_i64(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr %a monotonic, align 8 + ret void +} - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 +define void @generic_monotonic_volatile_sys_float(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr %a monotonic, align 4 + ret void +} +define void @generic_monotonic_volatile_sys_double(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr %a monotonic, align 8 ret void } ;; global statespace -; CHECK-LABEL: global_weak -define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; global + +define void @global_i8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0]; +; CHECK-NEXT: ld.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.global.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0]; +; CHECK-NEXT: ld.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.global.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr addrspace(1) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(1) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i64(ptr addrspace(1) %a) { +; CHECK-LABEL: 
global_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0]; +; CHECK-NEXT: ld.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.global.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(1) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_float(ptr addrspace(1) %a) { +; CHECK-LABEL: global_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0]; +; CHECK-NEXT: ld.global.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.global.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(1) %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_double(ptr addrspace(1) %a) { +; CHECK-LABEL: global_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0]; +; CHECK-NEXT: ld.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.global.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(1) %a + %a.add = fadd double %a.load, 1. 
+ store double %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0]; +; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(1) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.global.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(1) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(1) %a + ret void +} - ; CHECK: 
ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(1) %c - - ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(1) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(1) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(1) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(1) %c - - ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(1) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(1) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(1) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, 
[%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr addrspace(1) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(1) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr addrspace(1) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(1) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(1) %d - - ret void -} - -; CHECK-LABEL: global_volatile -define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_2xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: 
st.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(1) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0]; +; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(1) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0]; +; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(1) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0]; +; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 
%r8, %r1, 1; +; CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(1) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0]; +; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(1) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xfloat_param_0]; +; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(1) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0]; +; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: 
add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(1) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xdouble(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0]; +; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(1) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr addrspace(1) %a + ret void +} + +; global_volatile + +define void @global_volatile_i8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: 
add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(1) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(1) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(1) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_float(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(1) %a + %a.add = fadd float %a.load, 1. 
+ store volatile float %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_double(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(1) %a + %a.add = fadd double %a.load, 1. + store volatile double %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(1) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(1) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(1) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; 
CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(1) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(1) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(1) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, 
[global_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(1) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(1) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(1) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr addrspace(1) %a + ret void +} 
+ +define void @global_volatile_2xdouble(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(1) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(1) %a + ret void +} - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(1) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile<2 x i8> %h.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(1) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile<4 x i8> %i.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(1) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile<2 x i16> %j.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(1) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile<4 x i16> %k.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(1) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile<2 x i32> %l.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(1) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; 
CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile<4 x i32> %m.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(1) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile<2 x i64> %n.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr addrspace(1) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile<2 x float> %o.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(1) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile<4 x float> %p.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(1) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile<2 x double> %q.add, ptr addrspace(1) %d - - ret void -} - -; CHECK-LABEL: global_unordered_sys -define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; global_unordered_sys + +define void 
@global_unordered_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ret void +} + +define void @global_unordered_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(1) %a 
unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(1) %a unordered, align 2 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic 
double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: global_unordered_volatile_sys -define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_unordered_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; 
SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +define void @global_unordered_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +; global_unordered_volatile_sys + +define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ret void +} + +define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[global_unordered_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(1) %a unordered, align 2 + ret void +} + +define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i64( +; SM70: { 
+; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f32 
%f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: global_monotonic_sys -define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 
0f3F800000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +; global_monotonic_sys + +define void @global_monotonic_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u32 
[%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: global_monotonic_volatile_sys -define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_monotonic_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(1) %a monotonic, align 2 + ret void +} + +define void @global_monotonic_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], 
%r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +define void @global_monotonic_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; 
+; +; SM70-LABEL: global_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +; global_monotonic_volatile_sys + +define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = 
add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 +define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(1) %a monotonic, align 2 + ret void +} +define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[global_monotonic_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_float( +; SM70: { +; 
SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(1) %a monotonic, align 8 ret void } ;; shared statespace -; CHECK-LABEL: shared_weak -define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; shared + +define void @shared_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0]; +; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0]; +; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr addrspace(3) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(3) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i64(ptr addrspace(3) %a) { +; 
CHECK-LABEL: shared_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0]; +; CHECK-NEXT: ld.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(3) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0]; +; CHECK-NEXT: ld.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(3) %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0]; +; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(3) %a + %a.add = fadd double %a.load, 1. 
+ store double %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; +; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(3) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.shared.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(3) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void 
@shared_2xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(3) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; +; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(3) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(3) %a + ret void +} - ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(3) %d - %d.add = add i64 
%d.load, 1 - ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(3) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(3) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(3) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(3) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(3) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, 
%r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(3) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr addrspace(3) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(3) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr addrspace(3) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(3) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(3) %d - - ret void -} - -; CHECK-LABEL: shared_volatile -define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void 
@shared_2xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; +; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(3) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; +; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(3) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xi64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; +; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(3) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xfloat( +; CHECK: { 
+; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; +; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(3) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(3) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xdouble(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xdouble_param_0]; +; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(3) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, 
ptr addrspace(3) %a + ret void +} + +; shared_volatile + +define void @shared_volatile_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(3) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(3) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; 
+; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(3) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(3) %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(3) %a + %a.add = fadd double %a.load, 1. 
+ store volatile double %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(3) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(3) %a + %a.add = 
add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(3) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(3) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; +; CHECK-NEXT: 
ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(3) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(3) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(3) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(3) %a + ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(3) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(3) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(3) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(3) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(3) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(3) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 
1> - ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(3) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr addrspace(3) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(3) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(3) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr addrspace(3) %d - - ret void -} - -; CHECK-LABEL: shared_unordered_sys -define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_volatile_2xfloat(ptr 
addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(3) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(3) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 
+; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(3) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(3) %a + ret void +} + +; shared_unordered_sys + +define void @shared_unordered_sys_i8(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ret void +} - ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = 
load atomic i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - - ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - - ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - - ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: shared_unordered_volatile_sys -define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_unordered_sys_i16(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(3) %a unordered, align 2 + ret void +} + +define void @shared_unordered_sys_i32(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i32( +; 
SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} + +define void @shared_unordered_sys_i64(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} + +define void @shared_unordered_sys_float(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_float( +; SM70: { +; SM70-NEXT: 
.reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} + +define void @shared_unordered_sys_double(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} + +; shared_unordered_volatile_sys + +define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 +define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr 
addrspace(3) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(3) %a unordered, align 2 + ret void +} - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 +define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 +define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8 + %a.add = add 
i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 +define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} +define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(3) %a unordered, align 8 ret void } -; CHECK-LABEL: shared_monotonic_sys -define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; shared_monotonic_sys + +define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ret void +} + +define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; 
SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(3) %a monotonic, align 2 + ret void +} + +define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} + +define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: 
ret; +; +; SM70-LABEL: shared_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} - ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - - ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - - ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) 
%e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 - - ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: shared_monotonic_volatile_sys -define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_monotonic_sys_float(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} + +define void @shared_monotonic_sys_double(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} + +; shared_monotonic_volatile_sys + +define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 +define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr 
addrspace(3) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(3) %a monotonic, align 2 + ret void +} - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 +define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 +define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 +define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic volatile float %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} +define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. + store atomic volatile double %a.add, ptr addrspace(3) %a monotonic, align 8 ret void } ;; local statespace -; CHECK-LABEL: local_weak -define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local + +define void @local_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + 
%a.load = load i16, ptr addrspace(5) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(5) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(5) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(5) %a + %a.add = fadd float %a.load, 1. 
+ store float %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(5) %a + %a.add = fadd double %a.load, 1. + store double %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(5) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 
%rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(5) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(5) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(5) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(5) %a + ret void +} - ; CHECK: ld.local.u16 
%rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(5) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(5) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(5) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(5) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(5) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(5) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - 
%n.load = load <2 x i64>, ptr addrspace(5) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(5) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr addrspace(5) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(5) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(5) %d - - ret void -} - -; CHECK-LABEL: local_volatile -define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @local_2xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(5) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + 
store <2 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(5) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(5) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(5) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(5) %a + ret void 
+} + +define void @local_4xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(5) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xdouble(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(5) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr addrspace(5) %a + ret void +} + +; local_volatile + +define void @local_volatile_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 - ; CHECK: 
st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(5) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(5) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(5) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(5) %a + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(5) %b - - ; 
CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(5) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(5) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(5) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(5) %d - 
%k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(5) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(5) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(5) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr addrspace(5) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(5) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, 
%fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(5) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr addrspace(5) %d - - ret void -} - -; CHECK-LABEL: local_unordered_sys -define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @local_volatile_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(5) %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(5) %a + %a.add = fadd double %a.load, 1. 
+ store volatile double %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(5) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(5) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 
+ store volatile <4 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(5) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(5) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; 
+; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(5) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(5) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(5) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, 
%f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(5) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(5) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xdouble(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(5) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(5) %a + ret void +} + +; local_unordered_sys + +define void @local_unordered_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: 
local_unordered_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 +define void @local_unordered_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(5) %a unordered, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, 
align 8 +define void @local_unordered_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 +define void @local_unordered_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(5) %a unordered, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 +define void @local_unordered_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} +define void @local_unordered_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(5) %a unordered, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile_sys -define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_unordered_volatile_sys + +define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 +define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: 
ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(5) %a unordered, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 +define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 +define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load 
= load atomic volatile i64, ptr addrspace(5) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(5) %a unordered, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 +define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} +define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(5) %a unordered, align 8 ret void } -; CHECK-LABEL: local_monotonic_sys -define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_monotonic_sys + +define void @local_monotonic_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 +define void @local_monotonic_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + 
%a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(5) %a monotonic, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 +define void @local_monotonic_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 +define void @local_monotonic_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(5) %a monotonic, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 +define void @local_monotonic_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} +define void @local_monotonic_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. + store atomic double %a.add, ptr addrspace(5) %a monotonic, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_monotonic_volatile_sys + +define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + ret void +} + +define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, 
[local_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(5) %a monotonic, align 2 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 +define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} + +define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(5) %a monotonic, align 8 + ret void +} + +define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic volatile float %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} +define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. + store atomic volatile double %a.add, ptr addrspace(5) %a monotonic, align 8 ret void } diff --git a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll index e139d3c..5bd3580 100644 --- a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll +++ b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll @@ -1,12 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s ; The load is to the high byte of the 2-byte store @g = global i8 -75 define void @f(i16 %v) { -; CHECK-LABEL: f -; CHECK: sth 3, -2(1) -; CHECK: lbz 3, -2(1) +; CHECK-LABEL: f: +; CHECK: # %bb.0: +; CHECK-NEXT: addis 4, 2, .LC0@toc@ha +; CHECK-NEXT: sth 3, -2(1) +; CHECK-NEXT: ld 4, .LC0@toc@l(4) +; CHECK-NEXT: lbz 3, -2(1) +; CHECK-NEXT: stb 3, 0(4) +; CHECK-NEXT: blr %p32 = alloca i16 store i16 %v, ptr %p32 %tmp = load i8, ptr %p32 diff --git a/llvm/test/CodeGen/SPIRV/decoration-order.ll b/llvm/test/CodeGen/SPIRV/decoration-order.ll new file mode 100644 index 0000000..e8299e9 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/decoration-order.ll @@ -0,0 +1,15 @@ +; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown 
%s -o - -filetype=obj | spirv-val %} +; This test checks the OpDecorate MIR is generated after the associated +; vreg definition in the case of an array size declared through this lowering. + +define spir_func i32 @foo() { +entry: + %var = alloca i64 + br label %block + +block: + call void @llvm.memset.p0.i64(ptr align 8 %var, i8 0, i64 24, i1 false) + ret i32 0 +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/X86/tls-function-argument.ll b/llvm/test/CodeGen/X86/tls-function-argument.ll new file mode 100644 index 0000000..9b6ab52 --- /dev/null +++ b/llvm/test/CodeGen/X86/tls-function-argument.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s + +; Passing a pointer to thread-local storage to a function can be problematic +; since computing such addresses requires a function call that is introduced +; very late in instruction selection. We need to ensure that we don't introduce +; nested call sequence markers if this function call happens in a call sequence. 
+ +@TLS = internal thread_local global i64 zeroinitializer, align 8 +declare void @bar(ptr) +define internal void @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: leaq TLS@TLSLD(%rip), %rdi +; CHECK-NEXT: callq __tls_get_addr@PLT +; CHECK-NEXT: leaq TLS@DTPOFF(%rax), %rbx +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + call void @bar(ptr @TLS) + call void @bar(ptr @TLS) + ret void +} diff --git a/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll new file mode 100644 index 0000000..d692294 --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=1 \ +; RUN: -funcspec-for-literal-constant=true \ +; RUN: -funcspec-min-codesize-savings=50 \ +; RUN: -funcspec-min-latency-savings=0 \ +; RUN: -S < %s | FileCheck %s + +; Verify that we are able to estimate the codesize savings arising from a branch +; based on a comparison with a value found to have a constant range by IPSCCP. 
+define i32 @main() { + %notspec = call i32 @test_use_on_lhs(i32 8) + %spec1 = call i32 @test_use_on_lhs(i32 0) + %spec2 = call i32 @test_use_on_rhs(i32 1) + %sum1 = add i32 %notspec, %spec1 + %sum2 = add i32 %sum1, %spec2 + ret i32 %sum2 +} + +define i32 @test_use_on_lhs(i32 %x) { +entry: + %range = call i32 @foo(), !range !{ i32 1, i32 0 } + %bound = shl nsw nuw i32 %range, 3 + %cmp = icmp uge i32 %x, %bound + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @do_something() + call void @do_something() + call void @do_something() + call void @do_something() + br label %if.end + +if.end: + %res = phi i32 [ 0, %entry ], [ 1, %if.then] + ret i32 %res +} + +define i32 @test_use_on_rhs(i32 %x) { +entry: + %range = call i32 @foo(), !range !{ i32 1, i32 0 } + %bound = shl nsw nuw i32 %range, 3 + %x.sub = sub nsw nuw i32 %x, 1 + %cmp = icmp ult i32 %bound, %x.sub + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @do_something() + call void @do_something() + call void @do_something() + call void @do_something() + br label %if.end + +if.end: + %res = phi i32 [ 0, %entry ], [ 1, %if.then] + ret i32 %res +} + +declare i32 @foo() +declare void @do_something() +; CHECK-LABEL: define range(i32 0, 2) i32 @main() { +; CHECK-NEXT: [[NOTSPEC:%.*]] = call i32 @test_use_on_lhs(i32 8) +; CHECK-NEXT: [[SPEC1:%.*]] = call i32 @test_use_on_lhs.specialized.1(i32 0) +; CHECK-NEXT: [[SPEC2:%.*]] = call i32 @test_use_on_rhs.specialized.2(i32 1) +; CHECK-NEXT: [[SUM:%.*]] = add nuw nsw i32 [[NOTSPEC]], 0 +; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i32 [[SUM]], 0 +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_lhs( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[X]], [[BOUND]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], 
label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_rhs( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: [[X_SUB:%.*]] = sub nuw nsw i32 [[X]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[BOUND]], [[X_SUB]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define internal i32 @test_use_on_lhs.specialized.1( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @test_use_on_rhs.specialized.2( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 poison +; +;. +; CHECK: [[RNG0]] = !{i32 1, i32 0} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll new file mode 100644 index 0000000..ac47c60 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define i1 @test(i32 %x, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: i32 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[D]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: ret i1 [[TMP6]] +; + %cmp = icmp sgt i32 %x, 1 + %cmp2 = icmp sgt i32 %b, 1 + %cmp3 = icmp sgt i32 %c, 1 + %cmp4 = icmp sgt i32 %d, 1 + %sel2 = select i1 %cmp4, i1 true, i1 %cmp2 + %sel3 = select i1 %sel2, i1 true, i1 %cmp3 + %sel4 = select i1 %cmp, i1 true, i1 %cmp4 + %ret = or i1 %sel3, %sel4 + ret i1 %ret +} |